@@ -1070,6 +1070,17 @@ udp_rmem_min - INTEGER
udp_wmem_min - INTEGER
UDP does not have tx memory accounting and this tunable has no effect.
+udp_backpressure_interval - INTEGER
+ The time interval (in milliseconds) at which a UDP socket that is
+ overlimit under Qdisc backpressure tries to increase its effective
+ send buffer size. A longer interval typically results in a lower
+ packet drop rate, but also makes it slower for overlimit UDP sockets
+ to recover from backpressure when TC egress becomes idle.
+
+ 0 to disable Qdisc backpressure for UDP sockets.
+
+ Default: 100
+
RAW variables
=============
@@ -86,6 +86,9 @@ struct udp_sock {
/* This field is dirtied by udp_recvmsg() */
int forward_deficit;
+
+ /* Qdisc backpressure timer */
+ struct timer_list backpressure_timer;
};
#define UDP_MAX_SEGMENTS (1 << 6UL)
@@ -182,6 +182,7 @@ struct netns_ipv4 {
int sysctl_udp_wmem_min;
int sysctl_udp_rmem_min;
+ int sysctl_udp_backpressure_interval;
u8 sysctl_fib_notify_on_flag_change;
@@ -279,6 +279,7 @@ int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
+void udp_backpressure(struct sock *sk);
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
@@ -2614,7 +2614,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
+ if (refcount_read(&sk->sk_wmem_alloc) < sk_sndbuf_avail(sk))
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2649,7 +2649,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
+ if (sk_wmem_alloc_get(sk) < sk_sndbuf_avail(sk))
break;
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -1337,6 +1337,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE
},
+ {
+ .procname = "udp_backpressure_interval",
+ .data = &init_net.ipv4.sysctl_udp_backpressure_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
{
.procname = "fib_notify_on_flag_change",
.data = &init_net.ipv4.sysctl_fib_notify_on_flag_change,
@@ -110,6 +110,7 @@
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
+#include <net/sock.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
#include <net/udp_tunnel.h>
@@ -1614,10 +1615,73 @@ void udp_destruct_sock(struct sock *sk)
}
EXPORT_SYMBOL_GPL(udp_destruct_sock);
+static inline int udp_backpressure_interval_get(struct sock *sk)
+{
+ return READ_ONCE(sock_net(sk)->ipv4.sysctl_udp_backpressure_interval);
+}
+
+static inline void udp_reset_backpressure_timer(struct sock *sk,
+ unsigned long expires)
+{
+ sk_reset_timer(sk, &udp_sk(sk)->backpressure_timer, expires);
+}
+
+static void udp_backpressure_timer(struct timer_list *t)
+{
+ struct udp_sock *up = from_timer(up, t, backpressure_timer);
+ int interval, sndbuf, overlimits;
+ struct sock *sk = &up->inet.sk;
+
+ interval = udp_backpressure_interval_get(sk);
+ if (!interval) {
+ /* Qdisc backpressure has been turned off */
+ WRITE_ONCE(sk->sk_overlimits, 0);
+ goto out;
+ }
+
+ sndbuf = READ_ONCE(sk->sk_sndbuf);
+ overlimits = READ_ONCE(sk->sk_overlimits);
+
+ /* sndbuf - overlimits_new == 2 * (sndbuf - overlimits_old) */
+ overlimits = min_t(int, overlimits, sndbuf - SOCK_MIN_SNDBUF);
+ overlimits = max_t(int, (2 * overlimits) - sndbuf, 0);
+ WRITE_ONCE(sk->sk_overlimits, overlimits);
+
+ if (overlimits > 0)
+ udp_reset_backpressure_timer(sk, jiffies + interval);
+
+out:
+ sock_put(sk);
+}
+
+void udp_backpressure(struct sock *sk)
+{
+ int interval, sndbuf, overlimits;
+
+ interval = udp_backpressure_interval_get(sk);
+ if (!interval) /* Qdisc backpressure is off */
+ return;
+
+ sndbuf = READ_ONCE(sk->sk_sndbuf);
+ overlimits = READ_ONCE(sk->sk_overlimits);
+
+ /* sndbuf - overlimits_new == 1/2 * (sndbuf - overlimits_old) */
+ overlimits = min_t(int, overlimits, sndbuf - SOCK_MIN_SNDBUF);
+ overlimits += (sndbuf - overlimits) >> 1;
+ WRITE_ONCE(sk->sk_overlimits, overlimits);
+
+ if (overlimits > 0)
+ udp_reset_backpressure_timer(sk, jiffies + interval);
+}
+EXPORT_SYMBOL_GPL(udp_backpressure);
+
int udp_init_sock(struct sock *sk)
{
- skb_queue_head_init(&udp_sk(sk)->reader_queue);
+ struct udp_sock *up = udp_sk(sk);
+
+ skb_queue_head_init(&up->reader_queue);
sk->sk_destruct = udp_destruct_sock;
+ timer_setup(&up->backpressure_timer, udp_backpressure_timer, 0);
return 0;
}
EXPORT_SYMBOL_GPL(udp_init_sock);
@@ -2653,6 +2717,7 @@ void udp_destroy_sock(struct sock *sk)
/* protects from races with udp_abort() */
sock_set_flag(sk, SOCK_DEAD);
udp_flush_pending_frames(sk);
+ sk_stop_timer(sk, &up->backpressure_timer);
unlock_sock_fast(sk, slow);
if (static_branch_unlikely(&udp_encap_needed_key)) {
if (up->encap_type) {
@@ -2946,6 +3011,7 @@ struct proto udp_prot = {
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
+ .backpressure = udp_backpressure,
.memory_allocated = &udp_memory_allocated,
.per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
@@ -3268,6 +3334,7 @@ static int __net_init udp_sysctl_init(struct net *net)
{
net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
+ net->ipv4.sysctl_udp_backpressure_interval = msecs_to_jiffies(100);
#ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0;
@@ -1735,7 +1735,7 @@ struct proto udpv6_prot = {
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
-
+ .backpressure = udp_backpressure,
.memory_allocated = &udp_memory_allocated,
.per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,