
[net-next,V5,3/3] tun: rx batching

Message ID 1484722923-7698-4-git-send-email-jasowang@redhat.com (mailing list archive)
State New, archived

Commit Message

Jason Wang Jan. 18, 2017, 7:02 a.m. UTC
We can only process one packet at a time during sendmsg(). This often
leads to poor cache utilization under heavy load. So this patch does
some batching during rx before submitting packets to the host network
stack. This is done by accepting MSG_MORE as a hint from the sendmsg()
caller: if it is set, the packet is batched temporarily in a linked
list, and all batched packets are submitted at once when MSG_MORE is
cleared.
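
To make the flush policy concrete, here is a minimal standalone model of
the decision tun_rx_batched() makes (an illustrative sketch only, not
part of the patch: the sk_write_queue is reduced to a counter, and
locking and skb handling are omitted):

#include <stdbool.h>
#include <stdio.h>

static unsigned int queued;	/* stands in for skb_queue_len(sk_write_queue) */

/* Mirrors tun_rx_batched(): returns how many packets are handed to
 * netif_receive_skb() when one more packet arrives. */
static unsigned int rx_one(unsigned int rx_batched, bool more)
{
	if (!rx_batched || (!more && queued == 0))
		return 1;			/* fast path: no batching */

	if (!more || queued == rx_batched) {	/* flush queue plus this skb */
		unsigned int n = queued + 1;

		queued = 0;
		return n;
	}

	queued++;				/* defer while MSG_MORE is set */
	return 0;
}

int main(void)
{
	/* Ten packets with MSG_MORE set on all but the last, rx-frames = 4:
	 * prints a flush of five at packet 4 and again at packet 9. */
	for (int i = 0; i < 10; i++) {
		unsigned int n = rx_one(4, i < 9);

		if (n)
			printf("packet %d: flushed batch of %u\n", i, n);
	}
	return 0;
}

Note that the queue is spliced out either when it already holds
rx_batched deferred packets or when a packet arrives without MSG_MORE,
so at most rx_batched + 1 packets reach the stack per flush.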

Tests were done with pktgen (burst=128) in a guest over mlx4 (noqueue) on the host:

                                 Mpps  +/-%
    rx-frames = 0                0.91  +0%
    rx-frames = 4                1.00  +9.8%
    rx-frames = 8                1.00  +9.8%
    rx-frames = 16               1.01  +10.9%
    rx-frames = 32               1.07  +17.5%
    rx-frames = 48               1.07  +17.5%
    rx-frames = 64               1.08  +18.6%
    rx-frames = 64 (no MSG_MORE) 0.91  +0%

The user is allowed to change the number of batched packets per device
through ethtool -C rx-frames. NAPI_POLL_WEIGHT is used as the upper
limit to prevent bh from being disabled for too long.
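
As a usage illustration (not part of the patch), the new coalescing ops
can be driven with "ethtool -C tap0 rx-frames 128" or programmatically
through the standard SIOCETHTOOL ioctl, as sketched below; the interface
name tap0 and the value 128 are only examples, and error handling is
omitted:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_coalesce ec = { .cmd = ETHTOOL_SCOALESCE };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ - 1);	/* example device */
	ifr.ifr_data = (void *)&ec;

	ec.rx_max_coalesced_frames = 128;	/* above NAPI_POLL_WEIGHT (64) */
	ioctl(fd, SIOCETHTOOL, &ifr);		/* tun_set_coalesce() clamps */

	ec.cmd = ETHTOOL_GCOALESCE;
	ioctl(fd, SIOCETHTOOL, &ifr);		/* read the value back */
	printf("rx-frames now %u\n", ec.rx_max_coalesced_frames);

	close(fd);
	return 0;
}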

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 6 deletions(-)

Comments

Michael S. Tsirkin Jan. 18, 2017, 5:03 p.m. UTC | #1
On Wed, Jan 18, 2017 at 03:02:03PM +0800, Jason Wang wrote:
> We can only process one packet at a time during sendmsg(). This often
> leads to poor cache utilization under heavy load. So this patch does
> some batching during rx before submitting packets to the host network
> stack. This is done by accepting MSG_MORE as a hint from the sendmsg()
> caller: if it is set, the packet is batched temporarily in a linked
> list, and all batched packets are submitted at once when MSG_MORE is
> cleared.
> 
> Tests were done with pktgen (burst=128) in a guest over mlx4 (noqueue) on the host:
> 
>                                  Mpps  +/-%
>     rx-frames = 0                0.91  +0%
>     rx-frames = 4                1.00  +9.8%
>     rx-frames = 8                1.00  +9.8%
>     rx-frames = 16               1.01  +10.9%
>     rx-frames = 32               1.07  +17.5%
>     rx-frames = 48               1.07  +17.5%
>     rx-frames = 64               1.08  +18.6%
>     rx-frames = 64 (no MSG_MORE) 0.91  +0%
> 
> The user is allowed to change the number of batched packets per device
> through ethtool -C rx-frames. NAPI_POLL_WEIGHT is used as the upper
> limit to prevent bh from being disabled for too long.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>



Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8c1d3bd..13890ac 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -218,6 +218,7 @@ struct tun_struct {
 	struct list_head disabled;
 	void *security;
 	u32 flow_count;
+	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 };
 
@@ -522,6 +523,7 @@ static void tun_queue_purge(struct tun_file *tfile)
 	while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
 		kfree_skb(skb);
 
+	skb_queue_purge(&tfile->sk.sk_write_queue);
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
@@ -1139,10 +1141,46 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
 	return skb;
 }
 
+static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
+			   struct sk_buff *skb, int more)
+{
+	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+	struct sk_buff_head process_queue;
+	u32 rx_batched = tun->rx_batched;
+	bool rcv = false;
+
+	if (!rx_batched || (!more && skb_queue_empty(queue))) {
+		local_bh_disable();
+		netif_receive_skb(skb);
+		local_bh_enable();
+		return;
+	}
+
+	spin_lock(&queue->lock);
+	if (!more || skb_queue_len(queue) == rx_batched) {
+		__skb_queue_head_init(&process_queue);
+		skb_queue_splice_tail_init(queue, &process_queue);
+		rcv = true;
+	} else {
+		__skb_queue_tail(queue, skb);
+	}
+	spin_unlock(&queue->lock);
+
+	if (rcv) {
+		struct sk_buff *nskb;
+
+		local_bh_disable();
+		while ((nskb = __skb_dequeue(&process_queue)))
+			netif_receive_skb(nskb);
+		netif_receive_skb(skb);
+		local_bh_enable();
+	}
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, struct iov_iter *from,
-			    int noblock)
+			    int noblock, bool more)
 {
 	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
 	struct sk_buff *skb;
@@ -1283,9 +1321,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = skb_get_hash(skb);
 #ifndef CONFIG_4KSTACKS
-	local_bh_disable();
-	netif_receive_skb(skb);
-	local_bh_enable();
+	tun_rx_batched(tun, tfile, skb, more);
 #else
 	netif_rx_ni(skb);
 #endif
@@ -1311,7 +1347,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (!tun)
 		return -EBADFD;
 
-	result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
+	result = tun_get_user(tun, tfile, NULL, from,
+			      file->f_flags & O_NONBLOCK, false);
 
 	tun_put(tun);
 	return result;
@@ -1569,7 +1606,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		return -EBADFD;
 
 	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
-			   m->msg_flags & MSG_DONTWAIT);
+			   m->msg_flags & MSG_DONTWAIT,
+			   m->msg_flags & MSG_MORE);
 	tun_put(tun);
 	return ret;
 }
@@ -1770,6 +1808,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		tun->align = NET_SKB_PAD;
 		tun->filter_attached = false;
 		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
+		tun->rx_batched = 0;
 
 		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
 		if (!tun->pcpu_stats) {
@@ -2438,6 +2477,29 @@ static void tun_set_msglevel(struct net_device *dev, u32 value)
 #endif
 }
 
+static int tun_get_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *ec)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	ec->rx_max_coalesced_frames = tun->rx_batched;
+
+	return 0;
+}
+
+static int tun_set_coalesce(struct net_device *dev,
+			    struct ethtool_coalesce *ec)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
+		tun->rx_batched = NAPI_POLL_WEIGHT;
+	else
+		tun->rx_batched = ec->rx_max_coalesced_frames;
+
+	return 0;
+}
+
 static const struct ethtool_ops tun_ethtool_ops = {
 	.get_settings	= tun_get_settings,
 	.get_drvinfo	= tun_get_drvinfo,
@@ -2445,6 +2507,8 @@ static const struct ethtool_ops tun_ethtool_ops = {
 	.set_msglevel	= tun_set_msglevel,
 	.get_link	= ethtool_op_get_link,
 	.get_ts_info	= ethtool_op_get_ts_info,
+	.get_coalesce   = tun_get_coalesce,
+	.set_coalesce   = tun_set_coalesce,
 };
 
 static int tun_queue_resize(struct tun_struct *tun)