
[RFC,bpf-next,1/9] net: introduce biased busy-polling

Message ID: 20201028133437.212503-2-bjorn.topel@gmail.com
State: RFC
Delegated to: BPF
Series: Introduce biased busy-polling

Commit Message

Björn Töpel Oct. 28, 2020, 1:34 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

This change adds a new NAPI mode, called biased busy-polling, which is
an extension to the existing busy-polling mode. The new mode is
enabled at the socket layer, where a socket setting this option
"promises" to busy-poll the NAPI context via a system call. When this
mode is enabled, the NAPI context operates with interrupts disabled.
The kernel verifies that the busy-polling promise is kept via an
internal watchdog; if the socket stops performing busy-polling, the
mode is disabled. The watchdog timeout is currently 200 ms.

Biased busy-polling follows the same mechanism as the existing
busy-polling: the napi_id is reported to the socket via the skbuff.
Later commits will extend napi_id reporting to XDP, so that it works
correctly with XDP sockets.

Let us walk through a flow of execution:

1. A socket sets the new SO_BIAS_BUSY_POLL socket option to true. The
   socket now shows an intent of doing busy-polling. No data has been
   received on the socket yet, so the napi_id of the socket is still 0
   (invalid). As for regular busy-polling, the SO_BUSY_POLL option
   also has to be non-zero for biased busy-polling.

2. Data is received on the socket changing the napi_id to non-zero.

3. The socket does a system call that has the busy-polling logic wired
   up, e.g. recvfrom() for UDP sockets. The NAPI context is now marked
   for biased busy-polling and the kernel watchdog is armed. If the
   NAPI context is already running, it will try to finish as soon as
   possible and move to busy-polling. If the NAPI context is not
   running, the NAPI poll function for the corresponding napi_id is
   executed.

4. Go to step 3, or wait until the watchdog times out.

Given the nature of busy-polling, this mode only makes sense for
non-blocking sockets.
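
As a minimal user-space sketch of the flow above (illustration only:
the port number and busy-poll budget are arbitrary, and the
SO_BIAS_BUSY_POLL value is taken from this series since it is not in
released kernel headers; both options need CAP_NET_ADMIN):

  #include <errno.h>
  #include <netinet/in.h>
  #include <stdio.h>
  #include <sys/socket.h>
  #include <unistd.h>

  #ifndef SO_BUSY_POLL
  #define SO_BUSY_POLL 46
  #endif
  #ifndef SO_BIAS_BUSY_POLL
  #define SO_BIAS_BUSY_POLL 69	/* value added by this patch */
  #endif

  int main(void)
  {
  	struct sockaddr_in addr = {
  		.sin_family = AF_INET,
  		.sin_port = htons(7777),
  		.sin_addr.s_addr = htonl(INADDR_ANY),
  	};
  	int one = 1, usecs = 100;
  	char buf[2048];
  	int fd;

  	fd = socket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0);
  	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
  		return 1;

  	/* Step 1: promise to busy-poll. A non-zero SO_BUSY_POLL budget
  	 * is also required.
  	 */
  	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) ||
  	    setsockopt(fd, SOL_SOCKET, SO_BIAS_BUSY_POLL, &one, sizeof(one)))
  		perror("setsockopt");

  	/* Steps 2-4: keep calling recvfrom(). Once data has arrived and
  	 * the napi_id is known, every call busy-polls the NAPI context
  	 * and re-arms the 200 ms watchdog.
  	 */
  	for (;;) {
  		ssize_t n = recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);

  		if (n < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
  			break;
  	}

  	close(fd);
  	return 0;
  }

The promise only holds as long as the socket keeps busy-polling; if
the loop above stops for longer than the watchdog timeout, the NAPI
context falls back to interrupt-driven operation.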

When the NAPI context is in biased busy-polling mode, it will not
allow the NAPI to be scheduled via the
napi_schedule_prep()/napi_scheduleXXX() combo.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 +
 arch/mips/include/uapi/asm/socket.h   |  2 +
 arch/parisc/include/uapi/asm/socket.h |  2 +
 arch/sparc/include/uapi/asm/socket.h  |  2 +
 include/linux/netdevice.h             | 33 +++++-----
 include/net/busy_poll.h               | 17 ++++-
 include/net/sock.h                    |  3 +
 include/uapi/asm-generic/socket.h     |  2 +
 net/core/dev.c                        | 89 +++++++++++++++++++++++++--
 net/core/sock.c                       |  9 +++
 10 files changed, 140 insertions(+), 21 deletions(-)

Patch

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index de6c4df61082..0f776668fb09 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -124,6 +124,8 @@ 
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_BIAS_BUSY_POLL	69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index d0a9ed2ca2d6..d23984731504 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -135,6 +135,8 @@ 
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_BIAS_BUSY_POLL	69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 10173c32195e..49469713ed2a 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -116,6 +116,8 @@ 
 
 #define SO_DETACH_REUSEPORT_BPF 0x4042
 
+#define SO_BIAS_BUSY_POLL	0x4043
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 8029b681fc7c..009aba6f7a54 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -117,6 +117,8 @@ 
 
 #define SO_DETACH_REUSEPORT_BPF  0x0047
 
+#define SO_BIAS_BUSY_POLL	 0x0048
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 964b494b0e8d..9bdc84d3d6b8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -344,29 +344,32 @@  struct napi_struct {
 	struct list_head	rx_list; /* Pending GRO_NORMAL skbs */
 	int			rx_count; /* length of rx_list */
 	struct hrtimer		timer;
+	struct hrtimer		bp_watchdog;
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
 };
 
 enum {
-	NAPI_STATE_SCHED,	/* Poll is scheduled */
-	NAPI_STATE_MISSED,	/* reschedule a napi */
-	NAPI_STATE_DISABLE,	/* Disable pending */
-	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
-	NAPI_STATE_LISTED,	/* NAPI added to system lists */
-	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
-	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_SCHED,		/* Poll is scheduled */
+	NAPI_STATE_MISSED,		/* reschedule a napi */
+	NAPI_STATE_DISABLE,		/* Disable pending */
+	NAPI_STATE_NPSVC,		/* Netpoll - don't dequeue from poll_list */
+	NAPI_STATE_LISTED,		/* NAPI added to system lists */
+	NAPI_STATE_NO_BUSY_POLL,	/* Do not add in napi_hash, no busy polling */
+	NAPI_STATE_IN_BUSY_POLL,	/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_BIAS_BUSY_POLL,	/* biased busy-polling */
 };
 
 enum {
-	NAPIF_STATE_SCHED	 = BIT(NAPI_STATE_SCHED),
-	NAPIF_STATE_MISSED	 = BIT(NAPI_STATE_MISSED),
-	NAPIF_STATE_DISABLE	 = BIT(NAPI_STATE_DISABLE),
-	NAPIF_STATE_NPSVC	 = BIT(NAPI_STATE_NPSVC),
-	NAPIF_STATE_LISTED	 = BIT(NAPI_STATE_LISTED),
-	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
-	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_SCHED	   = BIT(NAPI_STATE_SCHED),
+	NAPIF_STATE_MISSED	   = BIT(NAPI_STATE_MISSED),
+	NAPIF_STATE_DISABLE	   = BIT(NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	   = BIT(NAPI_STATE_NPSVC),
+	NAPIF_STATE_LISTED	   = BIT(NAPI_STATE_LISTED),
+	NAPIF_STATE_NO_BUSY_POLL   = BIT(NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL   = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_BIAS_BUSY_POLL = BIT(NAPI_STATE_BIAS_BUSY_POLL),
 };
 
 enum gro_result {
@@ -555,6 +558,8 @@  static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
 	return true;
 }
 
+void napi_bias_busy_poll(unsigned int napi_id);
+
 enum netdev_queue_state_t {
 	__QUEUE_STATE_DRV_XOFF,
 	__QUEUE_STATE_STACK_XOFF,
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b001fa91c14e..9738923ed17b 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -23,6 +23,9 @@ 
  */
 #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
 
+/* Biased busy-poll watchdog timeout in ms */
+#define BIASED_BUSY_POLL_TIMEOUT 200
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 
 struct napi_struct;
@@ -99,13 +102,25 @@  static inline bool sk_busy_loop_timeout(struct sock *sk,
 	return true;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void __sk_bias_busy_poll(struct sock *sk, unsigned int napi_id)
+{
+	if (likely(!READ_ONCE(sk->sk_bias_busy_poll)))
+		return;
+
+	napi_bias_busy_poll(napi_id);
+}
+#endif
+
 static inline void sk_busy_loop(struct sock *sk, int nonblock)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
 
-	if (napi_id >= MIN_NAPI_ID)
+	if (napi_id >= MIN_NAPI_ID) {
+		__sk_bias_busy_poll(sk, napi_id);
 		napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+	}
 #endif
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index a5c6ae78df77..cf71834fb601 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -479,6 +479,9 @@  struct sock {
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
 	kuid_t			sk_uid;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	u8			sk_bias_busy_poll;
+#endif
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1..8a2b37ccd9d5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -119,6 +119,8 @@ 
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_BIAS_BUSY_POLL	69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/dev.c b/net/core/dev.c
index 9499a414d67e..a29e4c4a35f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6378,6 +6378,9 @@  bool napi_schedule_prep(struct napi_struct *n)
 		val = READ_ONCE(n->state);
 		if (unlikely(val & NAPIF_STATE_DISABLE))
 			return false;
+		if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+			return false;
+
 		new = val | NAPIF_STATE_SCHED;
 
 		/* Sets STATE_MISSED bit if STATE_SCHED was already set
@@ -6458,12 +6461,14 @@  bool napi_complete_done(struct napi_struct *n, int work_done)
 
 		/* If STATE_MISSED was set, leave STATE_SCHED set,
 		 * because we will call napi->poll() one more time.
-		 * This C code was suggested by Alexander Duyck to help gcc.
 		 */
-		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
-						    NAPIF_STATE_SCHED;
+		if (val & NAPIF_STATE_MISSED && !(val & NAPIF_STATE_BIAS_BUSY_POLL))
+			new |= NAPIF_STATE_SCHED;
 	} while (cmpxchg(&n->state, val, new) != val);
 
+	if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+		return false;
+
 	if (unlikely(val & NAPIF_STATE_MISSED)) {
 		__napi_schedule(n);
 		return false;
@@ -6497,6 +6502,20 @@  static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 {
 	int rc;
 
+	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+	local_bh_disable();
+	/* If we're biased towards busy poll, clear the sched flags,
+	 * so that we can enter again.
+	 */
+	if (READ_ONCE(napi->state) & NAPIF_STATE_BIAS_BUSY_POLL) {
+		netpoll_poll_unlock(have_poll_lock);
+		napi_complete(napi);
+		__kfree_skb_flush();
+		local_bh_enable();
+		return;
+	}
+
 	/* Busy polling means there is a high chance device driver hard irq
 	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 	 * set in napi_schedule_prep().
@@ -6507,9 +6526,6 @@  static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 	 * to perform these two clear_bit()
 	 */
 	clear_bit(NAPI_STATE_MISSED, &napi->state);
-	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
-
-	local_bh_disable();
 
 	/* All we really want here is to re-enable device interrupts.
 	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
@@ -6569,6 +6585,11 @@  void napi_busy_loop(unsigned int napi_id,
 				goto count;
 			have_poll_lock = netpoll_poll_lock(napi);
 			napi_poll = napi->poll;
+			if (val & NAPIF_STATE_BIAS_BUSY_POLL) {
+				hrtimer_start(&napi->bp_watchdog,
+					      ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+					      HRTIMER_MODE_REL_PINNED);
+			}
 		}
 		work = napi_poll(napi, BUSY_POLL_BUDGET);
 		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
@@ -6652,6 +6673,53 @@  static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+static enum hrtimer_restart napi_biased_busy_poll_watchdog(struct hrtimer *timer)
+{
+	struct napi_struct *napi;
+	unsigned long val, new;
+
+	napi = container_of(timer, struct napi_struct, bp_watchdog);
+
+	do {
+		val = READ_ONCE(napi->state);
+		if (WARN_ON_ONCE(!(val & NAPIF_STATE_BIAS_BUSY_POLL)))
+			return HRTIMER_NORESTART;
+
+		new = val & ~NAPIF_STATE_BIAS_BUSY_POLL;
+	} while (cmpxchg(&napi->state, val, new) != val);
+
+	if (!napi_disable_pending(napi) &&
+	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+		__napi_schedule_irqoff(napi);
+
+	return HRTIMER_NORESTART;
+}
+
+void napi_bias_busy_poll(unsigned int napi_id)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct napi_struct *napi;
+	unsigned long val, new;
+
+	napi = napi_by_id(napi_id);
+	if (!napi)
+		return;
+
+	do {
+		val = READ_ONCE(napi->state);
+		if (val & NAPIF_STATE_BIAS_BUSY_POLL)
+			return;
+
+		new = val | NAPIF_STATE_BIAS_BUSY_POLL;
+	} while (cmpxchg(&napi->state, val, new) != val);
+
+	hrtimer_start(&napi->bp_watchdog, ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+		      HRTIMER_MODE_REL_PINNED);
+#endif
+}
+EXPORT_SYMBOL(napi_bias_busy_poll);
+
+
 static void init_gro_hash(struct napi_struct *napi)
 {
 	int i;
@@ -6673,6 +6741,8 @@  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	INIT_HLIST_NODE(&napi->napi_hash_node);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
+	hrtimer_init(&napi->bp_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	napi->bp_watchdog.function = napi_biased_busy_poll_watchdog;
 	init_gro_hash(napi);
 	napi->skb = NULL;
 	INIT_LIST_HEAD(&napi->rx_list);
@@ -6704,7 +6774,9 @@  void napi_disable(struct napi_struct *n)
 		msleep(1);
 
 	hrtimer_cancel(&n->timer);
+	hrtimer_cancel(&n->bp_watchdog);
 
+	clear_bit(NAPI_STATE_BIAS_BUSY_POLL, &n->state);
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
@@ -6767,6 +6839,11 @@  static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 	if (likely(work < weight))
 		goto out_unlock;
 
+	if (unlikely(n->state & NAPIF_STATE_BIAS_BUSY_POLL)) {
+		napi_complete(n);
+		goto out_unlock;
+	}
+
 	/* Drivers must not modify the NAPI state if they
 	 * consume the entire weight.  In such cases this code
 	 * still "owns" the NAPI instance and therefore can
diff --git a/net/core/sock.c b/net/core/sock.c
index 727ea1cc633c..686eb5549b79 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1159,6 +1159,12 @@  int sock_setsockopt(struct socket *sock, int level, int optname,
 				sk->sk_ll_usec = val;
 		}
 		break;
+	case SO_BIAS_BUSY_POLL:
+		if (valbool && !capable(CAP_NET_ADMIN))
+			ret = -EPERM;
+		else
+			sk->sk_bias_busy_poll = valbool;
+		break;
 #endif
 
 	case SO_MAX_PACING_RATE:
@@ -1523,6 +1529,9 @@  int sock_getsockopt(struct socket *sock, int level, int optname,
 	case SO_BUSY_POLL:
 		v.val = sk->sk_ll_usec;
 		break;
+	case SO_BIAS_BUSY_POLL:
+		v.val = sk->sk_bias_busy_poll;
+		break;
 #endif
 
 	case SO_MAX_PACING_RATE: