Message ID | f9f7d28624f8084ef07842ee569c22b324ee4055.1703059341.git.hengqi@linux.alibaba.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] virtio-net: switch napi_tx without downing nic | expand |
Heng Qi wrote: > virtio-net has two ways to switch napi_tx: one is through the > module parameter, and the other is through coalescing parameter > settings (provided that the nic status is down). > > Sometimes we face performance regression caused by napi_tx, > then we need to switch napi_tx when debugging. However, the > existing methods are a bit troublesome, such as needing to > reload the driver or turn off the network card. So try to make > this update. > > Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> The commit does not explain why it is safe to do so. The tx-napi weights are not really weights: it is a boolean whether napi is used for transmit cleaning, or whether packets are cleaned in ndo_start_xmit. There certainly are some subtle issues with regard to pausing/waking queues when switching between modes. Calling napi_enable/napi_disable without bringing down the device is allowed. The actual napi.weight field is only updated when neither napi nor ndo_start_xmit is running. So I don't see an immediate issue. 
> --- > drivers/net/virtio_net.c | 81 ++++++++++++++++++---------------------- > 1 file changed, 37 insertions(+), 44 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 10614e9f7cad..12f8e1f9971c 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) > return 0; > } > > -static int virtnet_should_update_vq_weight(int dev_flags, int weight, > - int vq_weight, bool *should_update) > +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart, > + u32 qend, u32 tx_frames) > { > - if (weight ^ vq_weight) { > - if (dev_flags & IFF_UP) > - return -EBUSY; > - *should_update = true; > - } > + struct net_device *dev = vi->dev; > + int new_weight, cur_weight; > + struct netdev_queue *txq; > + struct send_queue *sq; > > - return 0; > + new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0; > + for (; qstart < qend; qstart++) { > + sq = &vi->sq[qstart]; > + cur_weight = sq->napi.weight; > + if (!(new_weight ^ cur_weight)) > + continue; > + > + if (!(dev->flags & IFF_UP)) { > + sq->napi.weight = new_weight; > + continue; > + } > + > + if (cur_weight) > + virtnet_napi_tx_disable(&sq->napi); > + > + txq = netdev_get_tx_queue(dev, qstart); > + __netif_tx_lock_bh(txq); > + sq->napi.weight = new_weight; > + __netif_tx_unlock_bh(txq); > + > + if (!cur_weight) > + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); > + } > } > > static int virtnet_set_coalesce(struct net_device *dev, > @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev, > struct netlink_ext_ack *extack) > { > struct virtnet_info *vi = netdev_priv(dev); > - int ret, queue_number, napi_weight; > - bool update_napi = false; > - > - /* Can't change NAPI weight if the link is up */ > - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; > - for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { > - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, > - vi->sq[queue_number].napi.weight, > - &update_napi); > - if (ret) > - return ret; > - > - if (update_napi) { > - /* All queues that belong to [queue_number, vi->max_queue_pairs] will be > - * updated for the sake of simplicity, which might not be necessary > - */ > - break; > - } > - } > + int ret; > + > + /* Param tx_frames can be used to switch napi_tx */ > + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, > + ec->tx_max_coalesced_frames); > > if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) > ret = virtnet_send_notf_coal_cmds(vi, ec); > @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev, > if (ret) > return ret; > > - if (update_napi) { > - for (; queue_number < vi->max_queue_pairs; queue_number++) > - vi->sq[queue_number].napi.weight = napi_weight; > - } > - > return ret; > } > > @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, > struct ethtool_coalesce *ec) > { > struct virtnet_info *vi = netdev_priv(dev); > - int ret, napi_weight; > - bool update_napi = false; > + int ret; > > if (queue >= vi->max_queue_pairs) > return -EINVAL; > > - /* Can't change NAPI weight if the link is up */ > - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; > - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, > - vi->sq[queue].napi.weight, > - &update_napi); > - if (ret) > - return ret; > + /* Param tx_frames can be used to switch napi_tx */ > + virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames); > > if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) > ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); > @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, > if (ret) > return ret; > > - if (update_napi) > - vi->sq[queue].napi.weight = napi_weight; > - > return 0; > } > > -- > 2.19.1.6.gb485710b >
在 2023/12/20 16:07, Heng Qi 写道: > virtio-net has two ways to switch napi_tx: one is through the > module parameter, and the other is through coalescing parameter > settings (provided that the nic status is down). > > Sometimes we face performance regression caused by napi_tx, > then we need to switch napi_tx when debugging. However, the > existing methods are a bit troublesome, such as needing to > reload the driver or turn off the network card. So try to make > this update. What scenario can trigger this? We want to make tests on our device. Zhu Yanjun > > Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > --- > drivers/net/virtio_net.c | 81 ++++++++++++++++++---------------------- > 1 file changed, 37 insertions(+), 44 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 10614e9f7cad..12f8e1f9971c 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) > return 0; > } > > -static int virtnet_should_update_vq_weight(int dev_flags, int weight, > - int vq_weight, bool *should_update) > +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart, > + u32 qend, u32 tx_frames) > { > - if (weight ^ vq_weight) { > - if (dev_flags & IFF_UP) > - return -EBUSY; > - *should_update = true; > - } > + struct net_device *dev = vi->dev; > + int new_weight, cur_weight; > + struct netdev_queue *txq; > + struct send_queue *sq; > > - return 0; > + new_weight = tx_frames ? 
NAPI_POLL_WEIGHT : 0; > + for (; qstart < qend; qstart++) { > + sq = &vi->sq[qstart]; > + cur_weight = sq->napi.weight; > + if (!(new_weight ^ cur_weight)) > + continue; > + > + if (!(dev->flags & IFF_UP)) { > + sq->napi.weight = new_weight; > + continue; > + } > + > + if (cur_weight) > + virtnet_napi_tx_disable(&sq->napi); > + > + txq = netdev_get_tx_queue(dev, qstart); > + __netif_tx_lock_bh(txq); > + sq->napi.weight = new_weight; > + __netif_tx_unlock_bh(txq); > + > + if (!cur_weight) > + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); > + } > } > > static int virtnet_set_coalesce(struct net_device *dev, > @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev, > struct netlink_ext_ack *extack) > { > struct virtnet_info *vi = netdev_priv(dev); > - int ret, queue_number, napi_weight; > - bool update_napi = false; > - > - /* Can't change NAPI weight if the link is up */ > - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; > - for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { > - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, > - vi->sq[queue_number].napi.weight, > - &update_napi); > - if (ret) > - return ret; > - > - if (update_napi) { > - /* All queues that belong to [queue_number, vi->max_queue_pairs] will be > - * updated for the sake of simplicity, which might not be necessary > - */ > - break; > - } > - } > + int ret; > + > + /* Param tx_frames can be used to switch napi_tx */ > + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, > + ec->tx_max_coalesced_frames); > > if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) > ret = virtnet_send_notf_coal_cmds(vi, ec); > @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev, > if (ret) > return ret; > > - if (update_napi) { > - for (; queue_number < vi->max_queue_pairs; queue_number++) > - vi->sq[queue_number].napi.weight = napi_weight; > - } > - > return ret; > } > > @@ -3641,19 +3643,13 @@ static int 
virtnet_set_per_queue_coalesce(struct net_device *dev, > struct ethtool_coalesce *ec) > { > struct virtnet_info *vi = netdev_priv(dev); > - int ret, napi_weight; > - bool update_napi = false; > + int ret; > > if (queue >= vi->max_queue_pairs) > return -EINVAL; > > - /* Can't change NAPI weight if the link is up */ > - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; > - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, > - vi->sq[queue].napi.weight, > - &update_napi); > - if (ret) > - return ret; > + /* Param tx_frames can be used to switch napi_tx */ > + virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames); > > if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) > ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); > @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, > if (ret) > return ret; > > - if (update_napi) > - vi->sq[queue].napi.weight = napi_weight; > - > return 0; > } >
在 2023/12/20 下午10:45, Willem de Bruijn 写道: > Heng Qi wrote: >> virtio-net has two ways to switch napi_tx: one is through the >> module parameter, and the other is through coalescing parameter >> settings (provided that the nic status is down). >> >> Sometimes we face performance regression caused by napi_tx, >> then we need to switch napi_tx when debugging. However, the >> existing methods are a bit troublesome, such as needing to >> reload the driver or turn off the network card. So try to make >> this update. >> >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > The commit does not explain why it is safe to do so. virtnet_napi_tx_disable ensures that already scheduled tx napi ends and no new tx napi will be scheduled. Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot send the packet. Then we can safely toggle the weight to indicate where to clear the buffers. > > The tx-napi weights are not really weights: it is a boolean whether > napi is used for transmit cleaning, or whether packets are cleaned > in ndo_start_xmit. Right. > > There certainly are some subtle issues with regard to pausing/waking > queues when switching between modes. What are "subtle issues" and if there are any, we find them. So far my test results show it's working fine. > > Calling napi_enable/napi_disable without bringing down the device is > allowed. The actually napi.weight field is only updated when neither > napi nor ndo_start_xmit is running. YES. > So I don't see an immediate issue. Switching napi_tx requires reloading the driver or downing the nic, which is troublesome. I think it would be better if we could find a better way. Thanks! 
> > >> --- >> drivers/net/virtio_net.c | 81 ++++++++++++++++++---------------------- >> 1 file changed, 37 insertions(+), 44 deletions(-) >> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >> index 10614e9f7cad..12f8e1f9971c 100644 >> --- a/drivers/net/virtio_net.c >> +++ b/drivers/net/virtio_net.c >> @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) >> return 0; >> } >> >> -static int virtnet_should_update_vq_weight(int dev_flags, int weight, >> - int vq_weight, bool *should_update) >> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart, >> + u32 qend, u32 tx_frames) >> { >> - if (weight ^ vq_weight) { >> - if (dev_flags & IFF_UP) >> - return -EBUSY; >> - *should_update = true; >> - } >> + struct net_device *dev = vi->dev; >> + int new_weight, cur_weight; >> + struct netdev_queue *txq; >> + struct send_queue *sq; >> >> - return 0; >> + new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0; >> + for (; qstart < qend; qstart++) { >> + sq = &vi->sq[qstart]; >> + cur_weight = sq->napi.weight; >> + if (!(new_weight ^ cur_weight)) >> + continue; >> + >> + if (!(dev->flags & IFF_UP)) { >> + sq->napi.weight = new_weight; >> + continue; >> + } >> + >> + if (cur_weight) >> + virtnet_napi_tx_disable(&sq->napi); >> + >> + txq = netdev_get_tx_queue(dev, qstart); >> + __netif_tx_lock_bh(txq); >> + sq->napi.weight = new_weight; >> + __netif_tx_unlock_bh(txq); >> + >> + if (!cur_weight) >> + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); >> + } >> } >> >> static int virtnet_set_coalesce(struct net_device *dev, >> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev, >> struct netlink_ext_ack *extack) >> { >> struct virtnet_info *vi = netdev_priv(dev); >> - int ret, queue_number, napi_weight; >> - bool update_napi = false; >> - >> - /* Can't change NAPI weight if the link is up */ >> - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; >> - for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { >> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, >> - vi->sq[queue_number].napi.weight, >> - &update_napi); >> - if (ret) >> - return ret; >> - >> - if (update_napi) { >> - /* All queues that belong to [queue_number, vi->max_queue_pairs] will be >> - * updated for the sake of simplicity, which might not be necessary >> - */ >> - break; >> - } >> - } >> + int ret; >> + >> + /* Param tx_frames can be used to switch napi_tx */ >> + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, >> + ec->tx_max_coalesced_frames); >> >> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) >> ret = virtnet_send_notf_coal_cmds(vi, ec); >> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev, >> if (ret) >> return ret; >> >> - if (update_napi) { >> - for (; queue_number < vi->max_queue_pairs; queue_number++) >> - vi->sq[queue_number].napi.weight = napi_weight; >> - } >> - >> return ret; >> } >> >> @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, >> struct ethtool_coalesce *ec) >> { >> struct virtnet_info *vi = netdev_priv(dev); >> - int ret, napi_weight; >> - bool update_napi = false; >> + int ret; >> >> if (queue >= vi->max_queue_pairs) >> return -EINVAL; >> >> - /* Can't change NAPI weight if the link is up */ >> - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; >> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, >> - vi->sq[queue].napi.weight, >> - &update_napi); >> - if (ret) >> - return ret; >> + /* Param tx_frames can be used to switch napi_tx */ >> + virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames); >> >> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) >> ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); >> @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, >> if (ret) >> return ret; >> >> - if (update_napi) >> - vi->sq[queue].napi.weight = napi_weight; >> - >> return 0; >> } >> >> -- >> 2.19.1.6.gb485710b >>
在 2023/12/21 上午11:02, Zhu Yanjun 写道: > 在 2023/12/20 16:07, Heng Qi 写道: >> virtio-net has two ways to switch napi_tx: one is through the >> module parameter, and the other is through coalescing parameter >> settings (provided that the nic status is down). >> >> Sometimes we face performance regression caused by napi_tx, >> then we need to switch napi_tx when debugging. However, the >> existing methods are a bit troublesome, such as needing to >> reload the driver or turn off the network card. So try to make >> this update. > > What scenario can trigger this? We want to make tests on our device. Hi Zhu Yanjun, you can use the following cmds: ethtool -C tx-frames 0, to disable napi_tx ethtool -C tx-frames 1, to enable napi_tx Thanks. > > Zhu Yanjun > >> >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> >> --- >> drivers/net/virtio_net.c | 81 ++++++++++++++++++---------------------- >> 1 file changed, 37 insertions(+), 44 deletions(-) >> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >> index 10614e9f7cad..12f8e1f9971c 100644 >> --- a/drivers/net/virtio_net.c >> +++ b/drivers/net/virtio_net.c >> @@ -3559,16 +3559,37 @@ static int >> virtnet_coal_params_supported(struct ethtool_coalesce *ec) >> return 0; >> } >> -static int virtnet_should_update_vq_weight(int dev_flags, int weight, >> - int vq_weight, bool *should_update) >> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart, >> + u32 qend, u32 tx_frames) >> { >> - if (weight ^ vq_weight) { >> - if (dev_flags & IFF_UP) >> - return -EBUSY; >> - *should_update = true; >> - } >> + struct net_device *dev = vi->dev; >> + int new_weight, cur_weight; >> + struct netdev_queue *txq; >> + struct send_queue *sq; >> - return 0; >> + new_weight = tx_frames ? 
NAPI_POLL_WEIGHT : 0; >> + for (; qstart < qend; qstart++) { >> + sq = &vi->sq[qstart]; >> + cur_weight = sq->napi.weight; >> + if (!(new_weight ^ cur_weight)) >> + continue; >> + >> + if (!(dev->flags & IFF_UP)) { >> + sq->napi.weight = new_weight; >> + continue; >> + } >> + >> + if (cur_weight) >> + virtnet_napi_tx_disable(&sq->napi); >> + >> + txq = netdev_get_tx_queue(dev, qstart); >> + __netif_tx_lock_bh(txq); >> + sq->napi.weight = new_weight; >> + __netif_tx_unlock_bh(txq); >> + >> + if (!cur_weight) >> + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); >> + } >> } >> static int virtnet_set_coalesce(struct net_device *dev, >> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct >> net_device *dev, >> struct netlink_ext_ack *extack) >> { >> struct virtnet_info *vi = netdev_priv(dev); >> - int ret, queue_number, napi_weight; >> - bool update_napi = false; >> - >> - /* Can't change NAPI weight if the link is up */ >> - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; >> - for (queue_number = 0; queue_number < vi->max_queue_pairs; >> queue_number++) { >> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, >> - vi->sq[queue_number].napi.weight, >> - &update_napi); >> - if (ret) >> - return ret; >> - >> - if (update_napi) { >> - /* All queues that belong to [queue_number, >> vi->max_queue_pairs] will be >> - * updated for the sake of simplicity, which might not >> be necessary >> - */ >> - break; >> - } >> - } >> + int ret; >> + >> + /* Param tx_frames can be used to switch napi_tx */ >> + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, >> + ec->tx_max_coalesced_frames); >> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) >> ret = virtnet_send_notf_coal_cmds(vi, ec); >> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct >> net_device *dev, >> if (ret) >> return ret; >> - if (update_napi) { >> - for (; queue_number < vi->max_queue_pairs; queue_number++) >> - vi->sq[queue_number].napi.weight = napi_weight; 
>> - } >> - >> return ret; >> } >> @@ -3641,19 +3643,13 @@ static int >> virtnet_set_per_queue_coalesce(struct net_device *dev, >> struct ethtool_coalesce *ec) >> { >> struct virtnet_info *vi = netdev_priv(dev); >> - int ret, napi_weight; >> - bool update_napi = false; >> + int ret; >> if (queue >= vi->max_queue_pairs) >> return -EINVAL; >> - /* Can't change NAPI weight if the link is up */ >> - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; >> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, >> - vi->sq[queue].napi.weight, >> - &update_napi); >> - if (ret) >> - return ret; >> + /* Param tx_frames can be used to switch napi_tx */ >> + virtnet_switch_napi_tx(vi, queue, queue, >> ec->tx_max_coalesced_frames); >> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) >> ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); >> @@ -3663,9 +3659,6 @@ static int >> virtnet_set_per_queue_coalesce(struct net_device *dev, >> if (ret) >> return ret; >> - if (update_napi) >> - vi->sq[queue].napi.weight = napi_weight; >> - >> return 0; >> }
在 2023/12/21 13:20, Heng Qi 写道: > > > 在 2023/12/21 上午11:02, Zhu Yanjun 写道: >> 在 2023/12/20 16:07, Heng Qi 写道: >>> virtio-net has two ways to switch napi_tx: one is through the >>> module parameter, and the other is through coalescing parameter >>> settings (provided that the nic status is down). >>> >>> Sometimes we face performance regression caused by napi_tx, >>> then we need to switch napi_tx when debugging. However, the >>> existing methods are a bit troublesome, such as needing to >>> reload the driver or turn off the network card. So try to make >>> this update. >> >> What scenario can trigger this? We want to make tests on our device. > > Hi Zhu Yanjun, you can use the following cmds: > > ethtool -C tx-frames 0, to disable napi_tx > ethtool -C tx-frames 1, to enable napi_tx Thanks a lot. Just now I made tests on our device. I confirmed that virtio_net driver can work well after running "ethtool -C NIC tx-frames 0 && sleep 3 && ethtool -C NIC tx-frames 1". You can add "Reviewed-and-tested-by: Zhu Yanjun <yanjun.zhu@linux.dev>" Thanks, Zhu Yanjun > > Thanks. 
> >> >> Zhu Yanjun >> >>> >>> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> >>> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> >>> --- >>> drivers/net/virtio_net.c | 81 >>> ++++++++++++++++++---------------------- >>> 1 file changed, 37 insertions(+), 44 deletions(-) >>> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >>> index 10614e9f7cad..12f8e1f9971c 100644 >>> --- a/drivers/net/virtio_net.c >>> +++ b/drivers/net/virtio_net.c >>> @@ -3559,16 +3559,37 @@ static int >>> virtnet_coal_params_supported(struct ethtool_coalesce *ec) >>> return 0; >>> } >>> -static int virtnet_should_update_vq_weight(int dev_flags, int >>> weight, >>> - int vq_weight, bool *should_update) >>> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 >>> qstart, >>> + u32 qend, u32 tx_frames) >>> { >>> - if (weight ^ vq_weight) { >>> - if (dev_flags & IFF_UP) >>> - return -EBUSY; >>> - *should_update = true; >>> - } >>> + struct net_device *dev = vi->dev; >>> + int new_weight, cur_weight; >>> + struct netdev_queue *txq; >>> + struct send_queue *sq; >>> - return 0; >>> + new_weight = tx_frames ? 
NAPI_POLL_WEIGHT : 0; >>> + for (; qstart < qend; qstart++) { >>> + sq = &vi->sq[qstart]; >>> + cur_weight = sq->napi.weight; >>> + if (!(new_weight ^ cur_weight)) >>> + continue; >>> + >>> + if (!(dev->flags & IFF_UP)) { >>> + sq->napi.weight = new_weight; >>> + continue; >>> + } >>> + >>> + if (cur_weight) >>> + virtnet_napi_tx_disable(&sq->napi); >>> + >>> + txq = netdev_get_tx_queue(dev, qstart); >>> + __netif_tx_lock_bh(txq); >>> + sq->napi.weight = new_weight; >>> + __netif_tx_unlock_bh(txq); >>> + >>> + if (!cur_weight) >>> + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); >>> + } >>> } >>> static int virtnet_set_coalesce(struct net_device *dev, >>> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct >>> net_device *dev, >>> struct netlink_ext_ack *extack) >>> { >>> struct virtnet_info *vi = netdev_priv(dev); >>> - int ret, queue_number, napi_weight; >>> - bool update_napi = false; >>> - >>> - /* Can't change NAPI weight if the link is up */ >>> - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; >>> - for (queue_number = 0; queue_number < vi->max_queue_pairs; >>> queue_number++) { >>> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, >>> - vi->sq[queue_number].napi.weight, >>> - &update_napi); >>> - if (ret) >>> - return ret; >>> - >>> - if (update_napi) { >>> - /* All queues that belong to [queue_number, >>> vi->max_queue_pairs] will be >>> - * updated for the sake of simplicity, which might not >>> be necessary >>> - */ >>> - break; >>> - } >>> - } >>> + int ret; >>> + >>> + /* Param tx_frames can be used to switch napi_tx */ >>> + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, >>> + ec->tx_max_coalesced_frames); >>> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) >>> ret = virtnet_send_notf_coal_cmds(vi, ec); >>> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct >>> net_device *dev, >>> if (ret) >>> return ret; >>> - if (update_napi) { >>> - for (; queue_number < vi->max_queue_pairs; queue_number++) >>> - vi->sq[queue_number].napi.weight = napi_weight; >>> - } >>> - >>> return ret; >>> } >>> @@ -3641,19 +3643,13 @@ static int >>> virtnet_set_per_queue_coalesce(struct net_device *dev, >>> struct ethtool_coalesce *ec) >>> { >>> struct virtnet_info *vi = netdev_priv(dev); >>> - int ret, napi_weight; >>> - bool update_napi = false; >>> + int ret; >>> if (queue >= vi->max_queue_pairs) >>> return -EINVAL; >>> - /* Can't change NAPI weight if the link is up */ >>> - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; >>> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, >>> - vi->sq[queue].napi.weight, >>> - &update_napi); >>> - if (ret) >>> - return ret; >>> + /* Param tx_frames can be used to switch napi_tx */ >>> + virtnet_switch_napi_tx(vi, queue, queue, >>> ec->tx_max_coalesced_frames); >>> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) >>> ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); >>> @@ -3663,9 +3659,6 @@ static int >>> virtnet_set_per_queue_coalesce(struct net_device *dev, >>> if (ret) >>> return ret; >>> - if (update_napi) >>> - vi->sq[queue].napi.weight = napi_weight; >>> - >>> return 0; >>> } >
Heng Qi wrote: > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > Heng Qi wrote: > >> virtio-net has two ways to switch napi_tx: one is through the > >> module parameter, and the other is through coalescing parameter > >> settings (provided that the nic status is down). > >> > >> Sometimes we face performance regression caused by napi_tx, > >> then we need to switch napi_tx when debugging. However, the > >> existing methods are a bit troublesome, such as needing to > >> reload the driver or turn off the network card. So try to make > >> this update. > >> > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > The commit does not explain why it is safe to do so. > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > no new tx napi will be scheduled. > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > send the packet. > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > The tx-napi weights are not really weights: it is a boolean whether > > napi is used for transmit cleaning, or whether packets are cleaned > > in ndo_start_xmit. > > Right. > > > > > There certainly are some subtle issues with regard to pausing/waking > > queues when switching between modes. > > What are "subtle issues" and if there are any, we find them. A single runtime test is not sufficient to exercise all edge cases. Please don't leave it to reviewers to establish the correctness of a patch. The napi_tx and non-napi code paths differ in how they handle at least the following structures: 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is needed as delay until the next ndo_start_xmit and thus completion is unbounded. When switching to napi mode, orphaned skbs may now be cleaned by the napi handler. This is indeed safe. When switching from napi to non-napi, the unbound latency resurfaces. 
It is a small edge case, and I think a potentially acceptable risk, if the user of this knob is aware of the risk. 2. virtqueue callback ("interrupt" masking). The non-napi path enables the interrupt (disables the mask) when available descriptors fall beneath a low watermark, and reenables when it recovers above a high watermark. Napi disables when napi is scheduled, and reenables on napi complete. 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below a low watermark, the driver stops the stack for queuing more packets. In napi mode, it schedules napi to clean packets. See the calls to netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and netif_tx_wake_queue. Some of this can be assumed safe by looking at existing analogous code, such as the queue stop/start in virtnet_tx_resize. But that all virtqueue callback and dev_queue->state transitions are correct when switching between modes at runtime is not trivial to establish, and deserves some thought and explanation in the commit message. > So far my test results show it's working fine. > > > > > Calling napi_enable/napi_disable without bringing down the device is > > allowed. The actually napi.weight field is only updated when neither > > napi nor ndo_start_xmit is running. > > YES. > > > So I don't see an immediate issue. > > Switching napi_tx requires reloading the driver or downing the nic, > which is troublesome. > I think it would be better if we could find a better way. > > Thanks! 
> > > > > > >> --- > >> drivers/net/virtio_net.c | 81 ++++++++++++++++++---------------------- > >> 1 file changed, 37 insertions(+), 44 deletions(-) > >> > >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > >> index 10614e9f7cad..12f8e1f9971c 100644 > >> --- a/drivers/net/virtio_net.c > >> +++ b/drivers/net/virtio_net.c > >> @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) > >> return 0; > >> } > >> > >> -static int virtnet_should_update_vq_weight(int dev_flags, int weight, > >> - int vq_weight, bool *should_update) > >> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart, > >> + u32 qend, u32 tx_frames) > >> { > >> - if (weight ^ vq_weight) { > >> - if (dev_flags & IFF_UP) > >> - return -EBUSY; > >> - *should_update = true; > >> - } > >> + struct net_device *dev = vi->dev; > >> + int new_weight, cur_weight; > >> + struct netdev_queue *txq; > >> + struct send_queue *sq; > >> > >> - return 0; > >> + new_weight = tx_frames ? 
NAPI_POLL_WEIGHT : 0; > >> + for (; qstart < qend; qstart++) { > >> + sq = &vi->sq[qstart]; > >> + cur_weight = sq->napi.weight; > >> + if (!(new_weight ^ cur_weight)) > >> + continue; > >> + > >> + if (!(dev->flags & IFF_UP)) { > >> + sq->napi.weight = new_weight; > >> + continue; > >> + } > >> + > >> + if (cur_weight) > >> + virtnet_napi_tx_disable(&sq->napi); > >> + > >> + txq = netdev_get_tx_queue(dev, qstart); > >> + __netif_tx_lock_bh(txq); > >> + sq->napi.weight = new_weight; > >> + __netif_tx_unlock_bh(txq); > >> + > >> + if (!cur_weight) > >> + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); > >> + } > >> } > >> > >> static int virtnet_set_coalesce(struct net_device *dev, > >> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev, > >> struct netlink_ext_ack *extack) > >> { > >> struct virtnet_info *vi = netdev_priv(dev); > >> - int ret, queue_number, napi_weight; > >> - bool update_napi = false; > >> - > >> - /* Can't change NAPI weight if the link is up */ > >> - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; > >> - for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { > >> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, > >> - vi->sq[queue_number].napi.weight, > >> - &update_napi); > >> - if (ret) > >> - return ret; > >> - > >> - if (update_napi) { > >> - /* All queues that belong to [queue_number, vi->max_queue_pairs] will be > >> - * updated for the sake of simplicity, which might not be necessary > >> - */ > >> - break; > >> - } > >> - } > >> + int ret; > >> + > >> + /* Param tx_frames can be used to switch napi_tx */ > >> + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, > >> + ec->tx_max_coalesced_frames); > >> > >> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) > >> ret = virtnet_send_notf_coal_cmds(vi, ec); > >> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev, > >> if (ret) > >> return ret; > >> > >> - if (update_napi) { > >> - for (; queue_number < vi->max_queue_pairs; queue_number++) > >> - vi->sq[queue_number].napi.weight = napi_weight; > >> - } > >> - > >> return ret; > >> } > >> > >> @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, > >> struct ethtool_coalesce *ec) > >> { > >> struct virtnet_info *vi = netdev_priv(dev); > >> - int ret, napi_weight; > >> - bool update_napi = false; > >> + int ret; > >> > >> if (queue >= vi->max_queue_pairs) > >> return -EINVAL; > >> > >> - /* Can't change NAPI weight if the link is up */ > >> - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; > >> - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, > >> - vi->sq[queue].napi.weight, > >> - &update_napi); > >> - if (ret) > >> - return ret; > >> + /* Param tx_frames can be used to switch napi_tx */ > >> + virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames); > >> > >> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) > >> ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); > >> @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, > >> if (ret) > >> return ret; > >> > >> - if (update_napi) > >> - vi->sq[queue].napi.weight = napi_weight; > >> - > >> return 0; > >> } > >> > >> -- > >> 2.19.1.6.gb485710b > >> >
On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > Heng Qi wrote: > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > Heng Qi wrote: > > >> virtio-net has two ways to switch napi_tx: one is through the > > >> module parameter, and the other is through coalescing parameter > > >> settings (provided that the nic status is down). > > >> > > >> Sometimes we face performance regression caused by napi_tx, > > >> then we need to switch napi_tx when debugging. However, the > > >> existing methods are a bit troublesome, such as needing to > > >> reload the driver or turn off the network card. Why is this troublesome? We don't need to turn off the card, it's just a toggling of the interface. This ends up with pretty simple code. > So try to make > > >> this update. > > >> > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > The commit does not explain why it is safe to do so. > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > no new tx napi will be scheduled. > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > send the packet. > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > napi is used for transmit cleaning, or whether packets are cleaned > > > in ndo_start_xmit. > > > > Right. > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > queues when switching between modes. > > > > What are "subtle issues" and if there are any, we find them. > > A single runtime test is not sufficient to exercise all edge cases. > > Please don't leave it to reviewers to establish the correctness of a > patch. +1 And instead of trying to do this, it would be much better to optimize the NAPI performance. Then we can drop the orphan mode. 
> > The napi_tx and non-napi code paths differ in how they handle at least > the following structures: > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > needed as delay until the next ndo_start_xmit and thus completion is > unbounded. > > When switching to napi mode, orphaned skbs may now be cleaned by the > napi handler. This is indeed safe. > > When switching from napi to non-napi, the unbound latency resurfaces. > It is a small edge case, and I think a potentially acceptable risk, if > the user of this knob is aware of the risk. > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > the interrupt (disables the mask) when available descriptors fall > beneath a low watermark, and reenables when it recovers above a high > watermark. Napi disables when napi is scheduled, and reenables on > napi complete. > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > a low watermark, the driver stops the stack for queuing more packets. > In napi mode, it schedules napi to clean packets. See the calls to > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > netif_tx_wake_queue. > > Some of this can be assumed safe by looking at existing analogous > code, such as the queue stop/start in virtnet_tx_resize. > > But that all virtqueue callback and dev_queue->state transitions are > correct when switching between modes at runtime is not trivial to > establish, deserves some thought and explanation in the commit > message. Thanks
On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote: > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > <willemdebruijn.kernel@gmail.com> wrote: > > > > Heng Qi wrote: > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > Heng Qi wrote: > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > >> module parameter, and the other is through coalescing parameter > > > >> settings (provided that the nic status is down). > > > >> > > > >> Sometimes we face performance regression caused by napi_tx, > > > >> then we need to switch napi_tx when debugging. However, the > > > >> existing methods are a bit troublesome, such as needing to > > > >> reload the driver or turn off the network card. > > Why is this troublesome? We don't need to turn off the card, it's just > a toggling of the interface. > > This ends up with pretty simple code. > > > So try to make > > > >> this update. > > > >> > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > The commit does not explain why it is safe to do so. > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > no new tx napi will be scheduled. > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > send the packet. > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > in ndo_start_xmit. > > > > > > Right. > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > queues when switching between modes. > > > > > > What are "subtle issues" and if there are any, we find them. > > > > A single runtime test is not sufficient to exercise all edge cases. 
> > > > Please don't leave it to reviewers to establish the correctness of a > > patch. > > +1 > > And instead of trying to do this, it would be much better to optimize > the NAPI performance. Then we can drop the orphan mode. "To address your problem, optimize our code to the level which we couldn't achieve in more than 10 years". That's not a reasonable requirement. Not getting an interrupt will always be better for some workloads. > > > > The napi_tx and non-napi code paths differ in how they handle at least > > the following structures: > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > needed as delay until the next ndo_start_xmit and thus completion is > > unbounded. > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > napi handler. This is indeed safe. > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > It is a small edge case, and I think a potentially acceptable risk, if > > the user of this knob is aware of the risk. > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > the interrupt (disables the mask) when available descriptors falls > > beneath a low watermark, and reenables when it recovers above a high > > watermark. Napi disables when napi is scheduled, and reenables on > > napi complete. > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > a low watermark, the driver stops the stack for queuing more packets. > > In napi mode, it schedules napi to clean packets. See the calls to > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > netif_tx_wake_queue. > > > > Some if this can be assumed safe by looking at existing analogous > > code, such as the queue stop/start in virtnet_tx_resize. 
> > > > But that all virtqueue callback and dev_queue->state transitions are > > correct when switching between modes at runtime is not trivial to > > establish, deserves some thought and explanation in the commit > > message. > > Thanks
Hello Jason, On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote: > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > <willemdebruijn.kernel@gmail.com> wrote: > > > > Heng Qi wrote: > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > Heng Qi wrote: > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > >> module parameter, and the other is through coalescing parameter > > > >> settings (provided that the nic status is down). > > > >> > > > >> Sometimes we face performance regression caused by napi_tx, > > > >> then we need to switch napi_tx when debugging. However, the > > > >> existing methods are a bit troublesome, such as needing to > > > >> reload the driver or turn off the network card. > > Why is this troublesome? We don't need to turn off the card, it's just > a toggling of the interface. > > This ends up with pretty simple code. > > > So try to make > > > >> this update. > > > >> > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > The commit does not explain why it is safe to do so. > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > no new tx napi will be scheduled. > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > send the packet. > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > in ndo_start_xmit. > > > > > > Right. > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > queues when switching between modes. > > > > > > What are "subtle issues" and if there are any, we find them. > > > > A single runtime test is not sufficient to exercise all edge cases. 
> > > > Please don't leave it to reviewers to establish the correctness of a > > patch. > > +1 > [...] > And instead of trying to do this, it would be much better to optimize > the NAPI performance. Then we can drop the orphan mode. Do you mean when to call skb_orphan()? If yes, I just want to provide more information that we also have some performance issues where the flow control takes a bad effect, especially under some small throughput in our production environment. What strikes me as odd is if I restart the network, then the issue will go with the wind. I cannot reproduce it in my testing machine. One more thing: if I force skb_orphan() the current skb in every start_xmit(), it could also solve the issue but not in a proper way. After all, it drops the flow control... :S Thanks, Jason > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > the following structures: > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > needed as delay until the next ndo_start_xmit and thus completion is > > unbounded. > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > napi handler. This is indeed safe. > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > It is a small edge case, and I think a potentially acceptable risk, if > > the user of this knob is aware of the risk. > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > the interrupt (disables the mask) when available descriptors falls > > beneath a low watermark, and reenables when it recovers above a high > > watermark. Napi disables when napi is scheduled, and reenables on > > napi complete. > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > a low watermark, the driver stops the stack for queuing more packets. > > In napi mode, it schedules napi to clean packets. 
See the calls to > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > netif_tx_wake_queue. > > > > Some of this can be assumed safe by looking at existing analogous > > code, such as the queue stop/start in virtnet_tx_resize. > > > > But that all virtqueue callback and dev_queue->state transitions are > > correct when switching between modes at runtime is not trivial to > > establish, deserves some thought and explanation in the commit > > message. > > Thanks > >
On Fri, Dec 22, 2023 at 4:14 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote: > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > Heng Qi wrote: > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > Heng Qi wrote: > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > >> module parameter, and the other is through coalescing parameter > > > > >> settings (provided that the nic status is down). > > > > >> > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > >> then we need to switch napi_tx when debugging. However, the > > > > >> existing methods are a bit troublesome, such as needing to > > > > >> reload the driver or turn off the network card. > > > > Why is this troublesome? We don't need to turn off the card, it's just > > a toggling of the interface. > > > > This ends up with pretty simple code. > > > > > So try to make > > > > >> this update. > > > > >> > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > The commit does not explain why it is safe to do so. > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > no new tx napi will be scheduled. > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > send the packet. > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > in ndo_start_xmit. > > > > > > > > Right. > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > queues when switching between modes. 
> > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > patch. > > > > +1 > > > > And instead of trying to do this, it would be much better to optimize > > the NAPI performance. Then we can drop the orphan mode. > > "To address your problem, optimize our code to the level which we > couldn't achieve in more than 10 years". Last time QE didn't report any issue for TCP. For others, the code might just need some optimization if it really matters, it's just because nobody has worked on this part in the past years. The ethtool trick is just for debugging purposes, I can hardly believe it is used by any management layer software. > That's not a reasonable > requirement. Not getting an interrupt will always be better for some > workloads. So NAPI has been enabled by default for many years. And most of the NIC doesn't do orphans. Orphan has known issues like pktgen and others. Keeping two modes may result in tricky code and complicate the features like BQL [1]. We would have interrupt coalescing and dim soon. Admin can enable the heuristic or tweak it according to the workload which is much more user friendly than orphaning. I don't see a strong point to keep the orphan mode any more considering the advantages of NAPI. Thanks [1] https://lore.kernel.org/lkml/20181205225323.12555-2-mst@redhat.com/ > > > > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > > the following structures: > > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > > needed as delay until the next ndo_start_xmit and thus completion is > > > unbounded. > > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > > napi handler. This is indeed safe. > > > > > > When switching from napi to non-napi, the unbound latency resurfaces. 
> > > It is a small edge case, and I think a potentially acceptable risk, if > > > the user of this knob is aware of the risk. > > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > > the interrupt (disables the mask) when available descriptors falls > > > beneath a low watermark, and reenables when it recovers above a high > > > watermark. Napi disables when napi is scheduled, and reenables on > > > napi complete. > > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > > a low watermark, the driver stops the stack for queuing more packets. > > > In napi mode, it schedules napi to clean packets. See the calls to > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > > netif_tx_wake_queue. > > > > > > Some if this can be assumed safe by looking at existing analogous > > > code, such as the queue stop/start in virtnet_tx_resize. > > > > > > But that all virtqueue callback and dev_queue->state transitions are > > > correct when switching between modes at runtime is not trivial to > > > establish, deserves some thought and explanation in the commit > > > message. > > > > Thanks >
On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > Hello Jason, > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote: > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > Heng Qi wrote: > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > Heng Qi wrote: > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > >> module parameter, and the other is through coalescing parameter > > > > >> settings (provided that the nic status is down). > > > > >> > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > >> then we need to switch napi_tx when debugging. However, the > > > > >> existing methods are a bit troublesome, such as needing to > > > > >> reload the driver or turn off the network card. > > > > Why is this troublesome? We don't need to turn off the card, it's just > > a toggling of the interface. > > > > This ends up with pretty simple code. > > > > > So try to make > > > > >> this update. > > > > >> > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > The commit does not explain why it is safe to do so. > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > no new tx napi will be scheduled. > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > send the packet. > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > in ndo_start_xmit. > > > > > > > > Right. > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > queues when switching between modes. 
> > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > patch. > > > > +1 > > > [...] > > And instead of trying to do this, it would be much better to optimize > > the NAPI performance. Then we can drop the orphan mode. > > Do you mean when to call skb_orphan()? If yes, I just want to provide > more information that we also have some performance issues where the > flow control takes a bad effect, especially under some small > throughput in our production environment. I think you need to describe it in detail. > What strikes me as odd is if I restart the network, then the issue > will go with the wind. I cannot reproduce it in my testing machine. > One more thing: if I force skb_orphan() the current skb in every > start_xmit(), it could also solve the issue but not in a proper way. > After all, it drops the flow control... :S Yes, that's the known issue. Thanks > > Thanks, > Jason > > > > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > > the following structures: > > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > > needed as delay until the next ndo_start_xmit and thus completion is > > > unbounded. > > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > > napi handler. This is indeed safe. > > > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > > It is a small edge case, and I think a potentially acceptable risk, if > > > the user of this knob is aware of the risk. > > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > > the interrupt (disables the mask) when available descriptors falls > > > beneath a low watermark, and reenables when it recovers above a high > > > watermark. 
Napi disables when napi is scheduled, and reenables on > > > napi complete. > > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > > a low watermark, the driver stops the stack for queuing more packets. > > > In napi mode, it schedules napi to clean packets. See the calls to > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > > netif_tx_wake_queue. > > > > > > Some if this can be assumed safe by looking at existing analogous > > > code, such as the queue stop/start in virtnet_tx_resize. > > > > > > But that all virtqueue callback and dev_queue->state transitions are > > > correct when switching between modes at runtime is not trivial to > > > establish, deserves some thought and explanation in the commit > > > message. > > > > Thanks > > > > >
On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote: > > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > Hello Jason, > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > > Heng Qi wrote: > > > > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > > Heng Qi wrote: > > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > > >> module parameter, and the other is through coalescing parameter > > > > > >> settings (provided that the nic status is down). > > > > > >> > > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > > >> then we need to switch napi_tx when debugging. However, the > > > > > >> existing methods are a bit troublesome, such as needing to > > > > > >> reload the driver or turn off the network card. > > > > > > Why is this troublesome? We don't need to turn off the card, it's just > > > a toggling of the interface. > > > > > > This ends up with pretty simple code. > > > > > > > So try to make > > > > > >> this update. > > > > > >> > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > > The commit does not explain why it is safe to do so. > > > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > > no new tx napi will be scheduled. > > > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > > send the packet. > > > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > > in ndo_start_xmit. 
> > > > > > > > > > Right. > > > > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > > queues when switching between modes. > > > > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > > patch. > > > > > > +1 > > > > > [...] > > > And instead of trying to do this, it would be much better to optimize > > > the NAPI performance. Then we can drop the orphan mode. > > [...] > > Do you mean when to call skb_orphan()? If yes, I just want to provide > > more information that we also have some performance issues where the > > flow control takes a bad effect, especially under some small > > throughput in our production environment. > > I think you need to describe it in detail. Some of the details were described below in the last email. The decreased performance happened because of flow control: the delay of skb free means the delay of decreasing of sk_wmem_alloc, then it will hit the limit of TSQ mechanism, finally causing transmitting slowly in the TCP layer. > > > What strikes me as odd is if I restart the network, then the issue > > will go with the wind. I cannot reproduce it in my testing machine. > > One more thing: if I force skb_orphan() the current skb in every > > start_xmit(), it could also solve the issue but not in a proper way. > > After all, it drops the flow control... :S > > Yes, that's the known issue. Really? Did you have some numbers or have some discussion links to share? I failed to reproduce on my testing machine, probably the short rtt is the key/obstacle. @Eric, it seems it still exists. Thanks, Jason > > Thanks > > > > > Thanks, > > Jason > > > > > > > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > > > the following structures: > > > > > > > > 1. 
skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > > > needed as delay until the next ndo_start_xmit and thus completion is > > > > unbounded. > > > > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > > > napi handler. This is indeed safe. > > > > > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > > > It is a small edge case, and I think a potentially acceptable risk, if > > > > the user of this knob is aware of the risk. > > > > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > > > the interrupt (disables the mask) when available descriptors falls > > > > beneath a low watermark, and reenables when it recovers above a high > > > > watermark. Napi disables when napi is scheduled, and reenables on > > > > napi complete. > > > > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > > > a low watermark, the driver stops the stack for queuing more packets. > > > > In napi mode, it schedules napi to clean packets. See the calls to > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > > > netif_tx_wake_queue. > > > > > > > > Some if this can be assumed safe by looking at existing analogous > > > > code, such as the queue stop/start in virtnet_tx_resize. > > > > > > > > But that all virtqueue callback and dev_queue->state transitions are > > > > correct when switching between modes at runtime is not trivial to > > > > establish, deserves some thought and explanation in the commit > > > > message. > > > > > > Thanks > > > > > > > > >
On Mon, Dec 25, 2023 at 2:34 PM Jason Xing <kerneljasonxing@gmail.com> wrote: > > On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote: > > > > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > > > Hello Jason, > > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > > > > Heng Qi wrote: > > > > > > > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > > > Heng Qi wrote: > > > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > > > >> module parameter, and the other is through coalescing parameter > > > > > > >> settings (provided that the nic status is down). > > > > > > >> > > > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > > > >> then we need to switch napi_tx when debugging. However, the > > > > > > >> existing methods are a bit troublesome, such as needing to > > > > > > >> reload the driver or turn off the network card. > > > > > > > > Why is this troublesome? We don't need to turn off the card, it's just > > > > a toggling of the interface. > > > > > > > > This ends up with pretty simple code. > > > > > > > > > So try to make > > > > > > >> this update. > > > > > > >> > > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > > > The commit does not explain why it is safe to do so. > > > > > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > > > no new tx napi will be scheduled. > > > > > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > > > send the packet. > > > > > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. 
> > > > > > > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > > > in ndo_start_xmit. > > > > > > > > > > > > Right. > > > > > > > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > > > queues when switching between modes. > > > > > > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > > > patch. > > > > > > > > +1 > > > > > > > [...] > > > > And instead of trying to do this, it would be much better to optimize > > > > the NAPI performance. Then we can drop the orphan mode. > > > > [...] > > > Do you mean when to call skb_orphan()? If yes, I just want to provide > > > more information that we also have some performance issues where the > > > flow control takes a bad effect, especially under some small > > > throughput in our production environment. > > > > I think you need to describe it in detail. > > Some of the details were described below in the last email. The > decreased performance happened because of flow control: the delay of > skb free means the delay What do you mean by delay here? Is it an interrupt delay? If yes, Does it work better if you simply remove virtqueue_enable_cb_delayed() with virtqueue_enable_cb()? As the former may delay the interrupt more or less depend on the traffic. > of decreasing of sk_wmem_alloc, then it will > hit the limit of TSQ mechanism, finally causing transmitting slowly in > the TCP layer. TSQ might work better with BQL which virtio-net doesn't have right now. > > > > > > What strikes me as odd is if I restart the network, then the issue > > > will go with the wind. I cannot reproduce it in my testing machine. 
> > > One more thing: if I force skb_orphan() the current skb in every > > > start_xmit(), it could also solve the issue but not in a proper way. > > > After all, it drops the flow control... :S > > > > Yes, that's the known issue. > > Really? Did you have some numbers or have some discussion links to > share? I failed to reproduce on my testing machine, probably the short > rtt is the key/obstacle. I basically mean it is a known side effect of skb_orphan() as it might decrease sk_wmem_alloc too early. Thanks > > @Eric, it seems it still exists. > > Thanks, > Jason > > > > > Thanks > > > > > > > > Thanks, > > > Jason > > > > > > > > > > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > > > > the following structures: > > > > > > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > > > > needed as delay until the next ndo_start_xmit and thus completion is > > > > > unbounded. > > > > > > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > > > > napi handler. This is indeed safe. > > > > > > > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > > > > It is a small edge case, and I think a potentially acceptable risk, if > > > > > the user of this knob is aware of the risk. > > > > > > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > > > > the interrupt (disables the mask) when available descriptors falls > > > > > beneath a low watermark, and reenables when it recovers above a high > > > > > watermark. Napi disables when napi is scheduled, and reenables on > > > > > napi complete. > > > > > > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > > > > a low watermark, the driver stops the stack for queuing more packets. > > > > > In napi mode, it schedules napi to clean packets. 
See the calls to > > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > > > > netif_tx_wake_queue. > > > > > > > > > > Some of this can be assumed safe by looking at existing analogous > > > > > code, such as the queue stop/start in virtnet_tx_resize. > > > > > > > > > > But that all virtqueue callback and dev_queue->state transitions are > > > > > correct when switching between modes at runtime is not trivial to > > > > > establish, deserves some thought and explanation in the commit > > > > > message. > > > > > > > > Thanks > > > > > > > > > > > > > >
On Mon, Dec 25, 2023 at 2:44 PM Jason Wang <jasowang@redhat.com> wrote: > > On Mon, Dec 25, 2023 at 2:34 PM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > > > > > Hello Jason, > > > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > > > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > > > > > > Heng Qi wrote: > > > > > > > > > > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > > > > Heng Qi wrote: > > > > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > > > > >> module parameter, and the other is through coalescing parameter > > > > > > > >> settings (provided that the nic status is down). > > > > > > > >> > > > > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > > > > >> then we need to switch napi_tx when debugging. However, the > > > > > > > >> existing methods are a bit troublesome, such as needing to > > > > > > > >> reload the driver or turn off the network card. > > > > > > > > > > Why is this troublesome? We don't need to turn off the card, it's just > > > > > a toggling of the interface. > > > > > > > > > > This ends up with pretty simple code. > > > > > > > > > > > So try to make > > > > > > > >> this update. > > > > > > > >> > > > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > > > > The commit does not explain why it is safe to do so. > > > > > > > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > > > > no new tx napi will be scheduled. 
> > > > > > > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > > > > send the packet. > > > > > > > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > > > > in ndo_start_xmit. > > > > > > > > > > > > > > Right. > > > > > > > > > > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > > > > queues when switching between modes. > > > > > > > > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > > > > patch. > > > > > > > > > > +1 > > > > > > > > > [...] > > > > > And instead of trying to do this, it would be much better to optimize > > > > > the NAPI performance. Then we can drop the orphan mode. > > > > > > [...] > > > > Do you mean when to call skb_orphan()? If yes, I just want to provide > > > > more information that we also have some performance issues where the > > > > flow control takes a bad effect, especially under some small > > > > throughput in our production environment. > > > > > > I think you need to describe it in detail. > > > > Some of the details were described below in the last email. The > > decreased performance happened because of flow control: the delay of > > skb free means the delay > > What do you mean by delay here? Is it an interrupt delay? If yes, Does > it work better if you simply remove > > virtqueue_enable_cb_delayed() with virtqueue_enable_cb()? As the > former may delay the interrupt more or less depend on the traffic. 
> > > of decreasing of sk_wmem_alloc, then it will > > hit the limit of TSQ mechanism, finally causing transmitting slowly in > > the TCP layer. > > TSQ might work better with BQL which virtio-net doesn't have right now. > > > > > > > > > > What strikes me as odd is if I restart the network, then the issue > > > > will go with the wind. I cannot reproduce it in my testing machine. > > > > One more thing: if I force skb_orphan() the current skb in every > > > > start_xmit(), it could also solve the issue but not in a proper way. > > > > After all, it drops the flow control... :S > > > > > > Yes, that's the known issue. Just to clarify, I mean the skb_orphan() drop flow control is a known issue. Not the restart... Thanks > > > > Really? Did you have some numbers or have some discussion links to > > share? I failed to reproduce on my testing machine, probably the short > > rtt is the key/obstacle. > > I basically mean it is a known side effect of skb_orphan() as it might > decrease sk_wmem_alloc too early. > > Thanks > > > > > > @Eric, it seems it still exists. > > > > Thanks, > > Jason > > > > > > > > Thanks > > > > > > > > > > > Thanks, > > > > Jason > > > > > > > > > > > > > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > > > > > the following structures: > > > > > > > > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > > > > > needed as delay until the next ndo_start_xmit and thus completion is > > > > > > unbounded. > > > > > > > > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > > > > > napi handler. This is indeed safe. > > > > > > > > > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > > > > > It is a small edge case, and I think a potentially acceptable risk, if > > > > > > the user of this knob is aware of the risk. > > > > > > > > > > > > 2. virtqueue callback ("interrupt" masking). 
The non-napi path enables > > > > > > the interrupt (disables the mask) when available descriptors fall > > > > > > beneath a low watermark, and reenables when it recovers above a high > > > > > > watermark. Napi disables when napi is scheduled, and reenables on > > > > > > napi complete. > > > > > > > > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > > > > > a low watermark, the driver stops the stack for queuing more packets. > > > > > > In napi mode, it schedules napi to clean packets. See the calls to > > > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > > > > > netif_tx_wake_queue. > > > > > > > > > > > > Some of this can be assumed safe by looking at existing analogous > > > > > > code, such as the queue stop/start in virtnet_tx_resize. > > > > > > > > > > > > But that all virtqueue callback and dev_queue->state transitions are > > > > > > correct when switching between modes at runtime is not trivial to > > > > > > establish, deserves some thought and explanation in the commit > > > > > > message. > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > >
On Mon, Dec 25, 2023 at 2:45 PM Jason Wang <jasowang@redhat.com> wrote: > > On Mon, Dec 25, 2023 at 2:34 PM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > > > > > Hello Jason, > > > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > > > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > > > > > > Heng Qi wrote: > > > > > > > > > > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > > > > Heng Qi wrote: > > > > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > > > > >> module parameter, and the other is through coalescing parameter > > > > > > > >> settings (provided that the nic status is down). > > > > > > > >> > > > > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > > > > >> then we need to switch napi_tx when debugging. However, the > > > > > > > >> existing methods are a bit troublesome, such as needing to > > > > > > > >> reload the driver or turn off the network card. > > > > > > > > > > Why is this troublesome? We don't need to turn off the card, it's just > > > > > a toggling of the interface. > > > > > > > > > > This ends up with pretty simple code. > > > > > > > > > > > So try to make > > > > > > > >> this update. > > > > > > > >> > > > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > > > > The commit does not explain why it is safe to do so. > > > > > > > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > > > > no new tx napi will be scheduled. 
> > > > > > > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > > > > send the packet. > > > > > > > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > > > > in ndo_start_xmit. > > > > > > > > > > > > > > Right. > > > > > > > > > > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > > > > queues when switching between modes. > > > > > > > > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > > > > patch. > > > > > > > > > > +1 > > > > > > > > > [...] > > > > > And instead of trying to do this, it would be much better to optimize > > > > > the NAPI performance. Then we can drop the orphan mode. > > > > > > [...] > > > > Do you mean when to call skb_orphan()? If yes, I just want to provide > > > > more information that we also have some performance issues where the > > > > flow control takes a bad effect, especially under some small > > > > throughput in our production environment. > > > > > > I think you need to describe it in detail. > > > > Some of the details were described below in the last email. The > > decreased performance happened because of flow control: the delay of > > skb free means the delay > > What do you mean by delay here? Is it an interrupt delay? If yes, Does > it work better if you simply remove Delay means the interval between start_xmit() to skb free. I counted some numbers and then found out that some of them have very long intervals which might be normal? 
$ sudo bpftrace txcompletion.bt Attaching 4 probes... trace from virtio start_xmit() to tcp_wfree() longer than 0 ns ^C @average: 384386 @count: 348302 @hist: [1K, 2K) 9551 |@@@@ | [2K, 4K) 116513 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [4K, 8K) 88295 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [8K, 16K) 39965 |@@@@@@@@@@@@@@@@@ | [16K, 32K) 39584 |@@@@@@@@@@@@@@@@@ | [32K, 64K) 53970 |@@@@@@@@@@@@@@@@@@@@@@@@ | [64K, 128K) 415 | | [128K, 256K) 0 | | [256K, 512K) 0 | | [512K, 1M) 0 | | [1M, 2M) 0 | | [2M, 4M) 0 | | [4M, 8M) 0 | | [8M, 16M) 0 | | [16M, 32M) 0 | | [32M, 64M) 0 | | [64M, 128M) 0 | | [128M, 256M) 0 | | [256M, 512M) 0 | | [512M, 1G) 0 | | [1G, 2G) 0 | | [2G, 4G) 9 | | > > virtqueue_enable_cb_delayed() with virtqueue_enable_cb()? As the > former may delay the interrupt more or less depend on the traffic. Due to the complexity of the production environment, I suspect the interrupt could be delayed on the host. Thanks for the suggestion. > > > of decreasing of sk_wmem_alloc, then it will > > hit the limit of TSQ mechanism, finally causing transmitting slowly in > > the TCP layer. > > TSQ might work better with BQL which virtio-net doesn't have right now. > > > > > > > > > > What strikes me as odd is if I restart the network, then the issue > > > > will go with the wind. I cannot reproduce it in my testing machine. > > > > One more thing: if I force skb_orphan() the current skb in every > > > > start_xmit(), it could also solve the issue but not in a proper way. > > > > After all, it drops the flow control... :S > > > > > > Yes, that's the known issue. > > > > Really? Did you have some numbers or have some discussion links to > > share? I failed to reproduce on my testing machine, probably the short > > rtt is the key/obstacle. > > I basically mean it is a known side effect of skb_orphan() as it might > decrease sk_wmem_alloc too early. Oh, I got it wrong :( > > Thanks > > > > > > @Eric, it seems it still exists. 
> > > > Thanks, > > Jason > > > > > > > > Thanks > > > > > > > > > > > Thanks, > > > > Jason > > > > > > > > > > > > > > > > > The napi_tx and non-napi code paths differ in how they handle at least > > > > > > the following structures: > > > > > > > > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is > > > > > > needed as delay until the next ndo_start_xmit and thus completion is > > > > > > unbounded. > > > > > > > > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the > > > > > > napi handler. This is indeed safe. > > > > > > > > > > > > When switching from napi to non-napi, the unbound latency resurfaces. > > > > > > It is a small edge case, and I think a potentially acceptable risk, if > > > > > > the user of this knob is aware of the risk. > > > > > > > > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables > > > > > > the interrupt (disables the mask) when available descriptors fall > > > > > > beneath a low watermark, and reenables when it recovers above a high > > > > > > watermark. Napi disables when napi is scheduled, and reenables on > > > > > > napi complete. > > > > > > > > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below > > > > > > a low watermark, the driver stops the stack for queuing more packets. > > > > > > In napi mode, it schedules napi to clean packets. See the calls to > > > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and > > > > > > netif_tx_wake_queue. > > > > > > > > > > > > Some of this can be assumed safe by looking at existing analogous > > > > > > code, such as the queue stop/start in virtnet_tx_resize. > > > > > > > > > > > > But that all virtqueue callback and dev_queue->state transitions are > > > > > > correct when switching between modes at runtime is not trivial to > > > > > > establish, deserves some thought and explanation in the commit > > > > > > message.
> > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > >
On Mon, Dec 25, 2023 at 12:12:48PM +0800, Jason Wang wrote: > On Fri, Dec 22, 2023 at 4:14 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote: > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > > Heng Qi wrote: > > > > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > > Heng Qi wrote: > > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > > >> module parameter, and the other is through coalescing parameter > > > > > >> settings (provided that the nic status is down). > > > > > >> > > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > > >> then we need to switch napi_tx when debugging. However, the > > > > > >> existing methods are a bit troublesome, such as needing to > > > > > >> reload the driver or turn off the network card. > > > > > > Why is this troublesome? We don't need to turn off the card, it's just > > > a toggling of the interface. > > > > > > This ends up with pretty simple code. > > > > > > > So try to make > > > > > >> this update. > > > > > >> > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > > The commit does not explain why it is safe to do so. > > > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > > no new tx napi will be scheduled. > > > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > > send the packet. > > > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. > > > > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > > in ndo_start_xmit. > > > > > > > > > > Right. 
> > > > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > > queues when switching between modes. > > > > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > > patch. > > > > > > +1 > > > > > > And instead of trying to do this, it would be much better to optimize > > > the NAPI performance. Then we can drop the orphan mode. > > > > "To address your problem, optimize our code to the level which we > > couldn't achieve in more than 10 years". > > Last time QE didn't report any issue for TCP. For others, the code > might just need some optimization if it really matters, it's just > because nobody has worked on this part in the past years. You think nobody worked on performance of virtio net because nobody could bother? I think it's just micro optimized to a level where progress is difficult. > The ethtool trick is just for debugging purposes, I can hardly believe > it is used by any management layer software. It's UAPI - someone somewhere uses it, management layer or not.
On Mon, Dec 25, 2023 at 4:03 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Dec 25, 2023 at 12:12:48PM +0800, Jason Wang wrote: > > On Fri, Dec 22, 2023 at 4:14 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote: > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn > > > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > > > > Heng Qi wrote: > > > > > > > > > > > > > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道: > > > > > > > Heng Qi wrote: > > > > > > >> virtio-net has two ways to switch napi_tx: one is through the > > > > > > >> module parameter, and the other is through coalescing parameter > > > > > > >> settings (provided that the nic status is down). > > > > > > >> > > > > > > >> Sometimes we face performance regression caused by napi_tx, > > > > > > >> then we need to switch napi_tx when debugging. However, the > > > > > > >> existing methods are a bit troublesome, such as needing to > > > > > > >> reload the driver or turn off the network card. > > > > > > > > Why is this troublesome? We don't need to turn off the card, it's just > > > > a toggling of the interface. > > > > > > > > This ends up with pretty simple code. > > > > > > > > > So try to make > > > > > > >> this update. > > > > > > >> > > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com> > > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > > > > The commit does not explain why it is safe to do so. > > > > > > > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and > > > > > > no new tx napi will be scheduled. > > > > > > > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot > > > > > > send the packet. > > > > > > > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers. 
> > > > > > > > > > > > > > > > > > > > The tx-napi weights are not really weights: it is a boolean whether > > > > > > > napi is used for transmit cleaning, or whether packets are cleaned > > > > > > > in ndo_start_xmit. > > > > > > > > > > > > Right. > > > > > > > > > > > > > > > > > > > > There certainly are some subtle issues with regard to pausing/waking > > > > > > > queues when switching between modes. > > > > > > > > > > > > What are "subtle issues" and if there are any, we find them. > > > > > > > > > > A single runtime test is not sufficient to exercise all edge cases. > > > > > > > > > > Please don't leave it to reviewers to establish the correctness of a > > > > > patch. > > > > > > > > +1 > > > > > > > > And instead of trying to do this, it would be much better to optimize > > > > the NAPI performance. Then we can drop the orphan mode. > > > > > > "To address your problem, optimize our code to the level which we > > > couldn't achieve in more than 10 years". > > > > Last time QE didn't report any issue for TCP. For others, the code > > might just need some optimization if it really matters, it's just > > because nobody has worked on this part in the past years. > > You think nobody worked on performance of virtio net because nobody > could bother? No, I just describe what I've seen from the list. No patches were posted for performance optimization in recent years. > I think it's just micro optimized to a level where > progress is difficult. > > Maybe. Let me clarify what I meant. If people can help to optimize the NAPI to be as fast as orphans or something like 80%-90% of orphans. It should be sufficient to remove the orphan path completely. This allows a lot of good features to be built easily on top. To me, it seems fine to remove it even now as it breaks socket accounting which is actually a bug and NAPI has been enabled for years but if you don't wish to do that, that's fine. 
We can wait for the coalescing and dim to be used for a while and revisit this. Thanks
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 10614e9f7cad..12f8e1f9971c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) return 0; } -static int virtnet_should_update_vq_weight(int dev_flags, int weight, - int vq_weight, bool *should_update) +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart, + u32 qend, u32 tx_frames) { - if (weight ^ vq_weight) { - if (dev_flags & IFF_UP) - return -EBUSY; - *should_update = true; - } + struct net_device *dev = vi->dev; + int new_weight, cur_weight; + struct netdev_queue *txq; + struct send_queue *sq; - return 0; + new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0; + for (; qstart < qend; qstart++) { + sq = &vi->sq[qstart]; + cur_weight = sq->napi.weight; + if (!(new_weight ^ cur_weight)) + continue; + + if (!(dev->flags & IFF_UP)) { + sq->napi.weight = new_weight; + continue; + } + + if (cur_weight) + virtnet_napi_tx_disable(&sq->napi); + + txq = netdev_get_tx_queue(dev, qstart); + __netif_tx_lock_bh(txq); + sq->napi.weight = new_weight; + __netif_tx_unlock_bh(txq); + + if (!cur_weight) + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); + } } static int virtnet_set_coalesce(struct net_device *dev, @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); - int ret, queue_number, napi_weight; - bool update_napi = false; - - /* Can't change NAPI weight if the link is up */ - napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; - for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, - vi->sq[queue_number].napi.weight, - &update_napi); - if (ret) - return ret; - - if (update_napi) { - /* All queues that belong to [queue_number, vi->max_queue_pairs] will be - * updated for the sake of simplicity, which might not be necessary - */ - break; - } - } + int ret; + + /* Param tx_frames can be used to switch napi_tx */ + virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs, + ec->tx_max_coalesced_frames); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) ret = virtnet_send_notf_coal_cmds(vi, ec); @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev, if (ret) return ret; - if (update_napi) { - for (; queue_number < vi->max_queue_pairs; queue_number++) - vi->sq[queue_number].napi.weight = napi_weight; - } - return ret; } @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { struct virtnet_info *vi = netdev_priv(dev); - int ret, napi_weight; - bool update_napi = false; + int ret; if (queue >= vi->max_queue_pairs) return -EINVAL; - /* Can't change NAPI weight if the link is up */ - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; - ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, - vi->sq[queue].napi.weight, - &update_napi); - if (ret) - return ret; + /* Param tx_frames can be used to switch napi_tx */ + virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev, if (ret) return ret; - if (update_napi) - vi->sq[queue].napi.weight = napi_weight; - return 0; }