@@ -124,6 +124,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_BIAS_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
@@ -135,6 +135,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_BIAS_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
@@ -116,6 +116,8 @@
#define SO_DETACH_REUSEPORT_BPF 0x4042
+#define SO_BIAS_BUSY_POLL 0x4043
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
@@ -117,6 +117,8 @@
#define SO_DETACH_REUSEPORT_BPF 0x0047
+#define SO_BIAS_BUSY_POLL 0x0048
+
#if !defined(__KERNEL__)
@@ -344,29 +344,32 @@ struct napi_struct {
struct list_head rx_list; /* Pending GRO_NORMAL skbs */
int rx_count; /* length of rx_list */
struct hrtimer timer;
+ struct hrtimer bp_watchdog;
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
};
enum {
- NAPI_STATE_SCHED, /* Poll is scheduled */
- NAPI_STATE_MISSED, /* reschedule a napi */
- NAPI_STATE_DISABLE, /* Disable pending */
- NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
- NAPI_STATE_LISTED, /* NAPI added to system lists */
- NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
- NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_SCHED, /* Poll is scheduled */
+ NAPI_STATE_MISSED, /* reschedule a napi */
+ NAPI_STATE_DISABLE, /* Disable pending */
+ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
+ NAPI_STATE_LISTED, /* NAPI added to system lists */
+ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
+ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_BIAS_BUSY_POLL, /* biased busy-polling */
};
enum {
- NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
- NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
- NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
- NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
- NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
- NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
- NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
+ NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
+ NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
+ NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
+ NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
+ NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+ NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_BIAS_BUSY_POLL = BIT(NAPI_STATE_BIAS_BUSY_POLL),
};
enum gro_result {
@@ -555,6 +558,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
return true;
}
+void napi_bias_busy_poll(unsigned int napi_id);
+
enum netdev_queue_state_t {
__QUEUE_STATE_DRV_XOFF,
__QUEUE_STATE_STACK_XOFF,
@@ -23,6 +23,9 @@
*/
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
+/* Biased busy-poll watchdog timeout in ms */
+#define BIASED_BUSY_POLL_TIMEOUT 200
+
#ifdef CONFIG_NET_RX_BUSY_POLL
struct napi_struct;
@@ -99,13 +102,25 @@ static inline bool sk_busy_loop_timeout(struct sock *sk,
return true;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void __sk_bias_busy_poll(struct sock *sk, unsigned int napi_id)
+{
+ if (likely(!READ_ONCE(sk->sk_bias_busy_poll)))
+ return;
+
+ napi_bias_busy_poll(napi_id);
+}
+#endif
+
static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
- if (napi_id >= MIN_NAPI_ID)
+ if (napi_id >= MIN_NAPI_ID) {
+ __sk_bias_busy_poll(sk, napi_id);
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+ }
#endif
}
@@ -479,6 +479,9 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ u8 sk_bias_busy_poll;
+#endif
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
@@ -119,6 +119,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_BIAS_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
@@ -6378,6 +6378,9 @@ bool napi_schedule_prep(struct napi_struct *n)
val = READ_ONCE(n->state);
if (unlikely(val & NAPIF_STATE_DISABLE))
return false;
+ if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+ return false;
+
new = val | NAPIF_STATE_SCHED;
/* Sets STATE_MISSED bit if STATE_SCHED was already set
@@ -6458,12 +6461,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
- * This C code was suggested by Alexander Duyck to help gcc.
*/
- new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
- NAPIF_STATE_SCHED;
+ if ((val & NAPIF_STATE_MISSED) && !(val & NAPIF_STATE_BIAS_BUSY_POLL))
+ new |= NAPIF_STATE_SCHED;
} while (cmpxchg(&n->state, val, new) != val);
+ if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+ return false;
+
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
return false;
@@ -6497,6 +6502,20 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
int rc;
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+ /* If we're biased towards busy polling, clear NAPI_STATE_SCHED via
+ * napi_complete() so that the busy loop can take ownership of this
+ * NAPI context again.
+ */
+ if (READ_ONCE(napi->state) & NAPIF_STATE_BIAS_BUSY_POLL) {
+ netpoll_poll_unlock(have_poll_lock);
+ napi_complete(napi);
+ __kfree_skb_flush();
+ local_bh_enable();
+ return;
+ }
+
/* Busy polling means there is a high chance device driver hard irq
* could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
* set in napi_schedule_prep().
@@ -6507,9 +6526,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
* to perform these two clear_bit()
*/
clear_bit(NAPI_STATE_MISSED, &napi->state);
- clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
-
- local_bh_disable();
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
@@ -6569,6 +6585,11 @@ void napi_busy_loop(unsigned int napi_id,
goto count;
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
+ if (val & NAPIF_STATE_BIAS_BUSY_POLL) {
+ hrtimer_start(&napi->bp_watchdog,
+ ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+ HRTIMER_MODE_REL_PINNED);
+ }
}
work = napi_poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
@@ -6652,6 +6673,53 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static enum hrtimer_restart napi_biased_busy_poll_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+ unsigned long val, new;
+
+ napi = container_of(timer, struct napi_struct, bp_watchdog);
+
+ do {
+ val = READ_ONCE(napi->state);
+ if (WARN_ON_ONCE(!(val & NAPIF_STATE_BIAS_BUSY_POLL)))
+ return HRTIMER_NORESTART;
+
+ new = val & ~NAPIF_STATE_BIAS_BUSY_POLL;
+ } while (cmpxchg(&napi->state, val, new) != val);
+
+ if (!napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ __napi_schedule_irqoff(napi);
+
+ return HRTIMER_NORESTART;
+}
+
+void napi_bias_busy_poll(unsigned int napi_id)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ struct napi_struct *napi;
+ unsigned long val, new;
+
+ /* napi_by_id() walks an RCU-protected hash, so hold the RCU read
+ * lock across the lookup and the state update.
+ */
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ goto out;
+
+ do {
+ val = READ_ONCE(napi->state);
+ if (val & NAPIF_STATE_BIAS_BUSY_POLL)
+ goto out;
+
+ new = val | NAPIF_STATE_BIAS_BUSY_POLL;
+ } while (cmpxchg(&napi->state, val, new) != val);
+
+ hrtimer_start(&napi->bp_watchdog, ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+ HRTIMER_MODE_REL_PINNED);
+out:
+ rcu_read_unlock();
+#endif
+}
+EXPORT_SYMBOL(napi_bias_busy_poll);
+
static void init_gro_hash(struct napi_struct *napi)
{
int i;
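
For illustration, a hypothetical additional in-kernel caller of the exported helper could mirror what __sk_bias_busy_poll() does on the busy-poll path; example_bias_rx_path() below is a sketch only and not part of this patch:

#include <net/busy_poll.h>
#include <net/sock.h>

/* Hypothetical caller (illustration only): bias the NAPI context that
 * delivers traffic to this socket, mirroring __sk_bias_busy_poll().
 */
static void example_bias_rx_path(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

	if (napi_id >= MIN_NAPI_ID)
		napi_bias_busy_poll(napi_id);
#endif
}
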
@@ -6673,6 +6741,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
+ hrtimer_init(&napi->bp_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->bp_watchdog.function = napi_biased_busy_poll_watchdog;
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -6704,7 +6774,9 @@ void napi_disable(struct napi_struct *n)
msleep(1);
hrtimer_cancel(&n->timer);
+ hrtimer_cancel(&n->bp_watchdog);
+ clear_bit(NAPI_STATE_BIAS_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6767,6 +6839,11 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
if (likely(work < weight))
goto out_unlock;
+ if (unlikely(n->state & NAPIF_STATE_BIAS_BUSY_POLL)) {
+ napi_complete(n);
+ goto out_unlock;
+ }
+
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
@@ -1159,6 +1159,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_ll_usec = val;
}
break;
+ case SO_BIAS_BUSY_POLL:
+ if (valbool && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ WRITE_ONCE(sk->sk_bias_busy_poll, valbool);
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_BIAS_BUSY_POLL:
+ v.val = sk->sk_bias_busy_poll;
+ break;
#endif
case SO_MAX_PACING_RATE:
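
For completeness, a hedged userspace sketch (not part of this patch) showing how an application could opt in to biased busy polling; the fallback defines match the asm-generic values, and enabling SO_BIAS_BUSY_POLL requires CAP_NET_ADMIN as enforced in sock_setsockopt():

/* Userspace illustration: enable busy polling on a UDP socket and bias
 * its NAPI context towards it. Requires CAP_NET_ADMIN.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL 50
#endif
#ifndef SO_BIAS_BUSY_POLL
#define SO_BIAS_BUSY_POLL 69	/* asm-generic value from this patch */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int usecs = 50;		/* SO_BUSY_POLL budget in microseconds */
	int one = 1;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)))
		perror("SO_BUSY_POLL");

	if (setsockopt(fd, SOL_SOCKET, SO_BIAS_BUSY_POLL, &one, sizeof(one)))
		perror("SO_BIAS_BUSY_POLL");

	/* Blocking receives on fd now busy poll and, via __sk_bias_busy_poll(),
	 * bias the corresponding NAPI context towards busy polling.
	 */
	close(fd);
	return 0;
}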