@@ -306,7 +306,10 @@ struct tcp_sock {
u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */
unused2:4;
u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
+ prev_ecnfield:2,/* ECN bits from the previous segment */
+ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
est_ecnfield:2;/* ECN field for AccECN delivered estimates */
+ u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */
u32 app_limited; /* limited until "delivered" reaches this val */
u32 rcv_wnd; /* Current receiver window */
/*
@@ -139,6 +139,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_ecn;
u8 sysctl_tcp_ecn_option;
+ u8 sysctl_tcp_ecn_option_beacon;
u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl;
@@ -1067,6 +1067,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
__tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
__tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
tp->accecn_minlen = 0;
+ tp->accecn_opt_demand = 0;
tp->est_ecnfield = 0;
}
@@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
+ {
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
{
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
@@ -3366,6 +3366,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->wait_third_ack = 0;
tp->accecn_fail_mode = 0;
tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -5077,6 +5079,7 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
@@ -5084,7 +5087,7 @@ static void __init tcp_struct_check(void)
/* 32bit arches with 8byte alignment on u64 fields might need padding
* before tcp_clock_cache.
*/
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 122 + 6);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 130 + 6);
/* RX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
@@ -467,6 +467,7 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
default:
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
+ tp->accecn_opt_demand = 2;
if (INET_ECN_is_ce(ip_dsfield) &&
tcp_accecn_validate_syn_feedback(sk, ace,
tp->syn_ect_snt)) {
@@ -487,6 +488,7 @@ static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
} else {
tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
INET_ECN_MASK;
+ tp->prev_ecnfield = tp->syn_ect_rcv;
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
}
}
@@ -6279,6 +6281,7 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb,
u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
u8 is_ce = INET_ECN_is_ce(ecnfield);
struct tcp_sock *tp = tcp_sk(sk);
+ bool ecn_edge;
if (!INET_ECN_is_not_ect(ecnfield)) {
u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
@@ -6292,9 +6295,36 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb,
if (payload_len > 0) {
u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
+ u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
+
tp->received_ecn_bytes[ecnfield - 1] += payload_len;
tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
minlen);
+
+ /* Demand AccECN option at least every 2^22 bytes to
+ * avoid overflowing the ECN byte counters.
+ */
+ if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
+ ~((1 << 22) - 1)) {
+ u8 opt_demand = max_t(u8, 1,
+ tp->accecn_opt_demand);
+
+ tp->accecn_opt_demand = opt_demand;
+ }
+ }
+ }
+
+ ecn_edge = tp->prev_ecnfield != ecnfield;
+ if (ecn_edge || is_ce) {
+ tp->prev_ecnfield = ecnfield;
+ /* Demand Accurate ECN change-triggered ACKs. Two ACKs are
+ * demanded to indicate unambiguously the ecnfield value
+ * in the latter ACK.
+ */
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (ecn_edge)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ tp->accecn_opt_demand = 2;
}
}
}
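
The non-obvious piece of the hunk above is the XOR-and-mask test on the received byte counter: bits 22 and above of received_ecn_bytes[] change only when the counter crosses a multiple of 2^22, so XORing the old and new values and masking off the low 22 bits is non-zero exactly when such a boundary has been crossed, at which point an option is demanded to keep the byte counters from overflowing, per the comment. A minimal user-space sketch of the same check (the helper name and test values are illustrative, not part of the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True when the counter crossed a multiple of 2^22 between old and new. */
static bool crossed_boundary(uint32_t oldbytes, uint32_t newbytes)
{
	return ((newbytes ^ oldbytes) & ~((1u << 22) - 1)) != 0;
}

int main(void)
{
	printf("%d\n", crossed_boundary(100, 500));         /* 0: same 2^22 window */
	printf("%d\n", crossed_boundary(4194000, 4194400)); /* 1: straddles 2^22   */
	return 0;
}
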
@@ -6427,8 +6457,12 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
- if (tcp_ecn_mode_accecn(tp))
+ if (tcp_ecn_mode_accecn(tp)) {
+ u8 opt_demand = max_t(u8, 1, tp->accecn_opt_demand);
+
send_accecn_reflector = true;
+ tp->accecn_opt_demand = opt_demand;
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -3452,6 +3452,7 @@ static int __net_init tcp_sk_init(struct net *net)
{
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_option = 2;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = 3;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
@@ -498,6 +498,8 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
} else {
tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
@@ -806,8 +806,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
- if (tp)
+ if (tp) {
tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -984,6 +988,18 @@ static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
return size;
}
+static bool tcp_accecn_option_beacon_check(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon)
+ return false;
+
+ return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) *
+ sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon >=
+ (tp->srtt_us >> 3);
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
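
tcp_accecn_option_beacon_check() implements the periodic ("beacon") trigger: tp->srtt_us stores eight times the smoothed RTT, so the comparison delta * beacon >= srtt_us >> 3 becomes true once at least srtt / beacon microseconds have elapsed since the last AccECN option was sent, i.e. roughly sysctl_tcp_ecn_option_beacon options per RTT (the tcp_sk_init() hunk above defaults this to 3), and returning false when the sysctl is zero disables the beacon entirely. A standalone sketch of the same arithmetic, with hypothetical helper and parameter names:

#include <stdbool.h>
#include <stdint.h>

/* now_us/last_opt_us are timestamps in microseconds; srtt_shifted mirrors
 * tp->srtt_us, which stores 8 * (smoothed RTT in microseconds).
 */
static bool accecn_beacon_due(uint64_t now_us, uint64_t last_opt_us,
			      uint32_t srtt_shifted, uint8_t beacon)
{
	if (!beacon)	/* sysctl_tcp_ecn_option_beacon == 0: beacon disabled */
		return false;
	return (now_us - last_opt_us) * beacon >= (srtt_shifted >> 3);
}

With the default beacon of 3 and a 24 ms smoothed RTT (srtt_shifted = 192000), the check fires once 8 ms have passed since the previous option.
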
@@ -1237,13 +1253,18 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (tcp_ecn_mode_accecn(tp) &&
sock_net(sk)->ipv4.sysctl_tcp_ecn_option) {
- int saving = opts->num_sack_blocks > 0 ? 2 : 0;
- int remaining = MAX_TCP_OPTION_SPACE - size;
-
- opts->ecn_bytes = tp->received_ecn_bytes;
- size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
- remaining,
- saving);
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= 2 ||
+ tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk)) {
+ int saving = opts->num_sack_blocks > 0 ? 2 : 0;
+ int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ opts->ecn_bytes = tp->received_ecn_bytes;
+ size += tcp_options_fit_accecn(opts,
+ tp->accecn_minlen,
+ remaining,
+ saving);
+ }
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
@@ -2959,6 +2980,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* The AccECN option beacon depends on mstamp; it may change the MSS */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
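
Taken together, the last two hunks gate when the full AccECN option is actually emitted and keep the segment size honest when it is: tcp_established_options() only considers the option in AccECN mode with sysctl_tcp_ecn_option non-zero, and then includes it either unconditionally (sysctl >= 2), because the receive path demanded it, or because the beacon interval elapsed; tcp_write_xmit() refreshes mss_now in the beacon case because the option consumes TCP option space and therefore shrinks the payload each segment can carry. A condensed, hypothetical predicate (not part of the patch) summarizing the triggers:

#include <stdbool.h>
#include <stdint.h>

static bool accecn_option_wanted(bool accecn_mode, uint8_t sysctl_ecn_option,
				 uint8_t opt_demand, bool beacon_due)
{
	if (!accecn_mode || !sysctl_ecn_option)
		return false;
	/* 2 and above request the option on every packet; otherwise it is
	 * sent only on demand or when the beacon interval has elapsed.
	 */
	return sysctl_ecn_option >= 2 || opt_demand || beacon_due;
}
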