@@ -753,10 +753,8 @@ struct sk_buff {
int ip_defrag_offset;
};
- union {
- ktime_t tstamp;
- u64 skb_mstamp_ns; /* earliest departure time */
- };
+ ktime_t tstamp;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
@@ -3694,6 +3692,7 @@ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
+void skb_scrub_tstamp(struct sk_buff *skb);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
@@ -3810,7 +3809,7 @@ void skb_init(void);
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
- return skb->tstamp;
+ return ktime_mono_to_real_cond(skb->tstamp);
}
/**
@@ -3825,13 +3824,13 @@ static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
static inline void skb_get_timestamp(const struct sk_buff *skb,
struct __kernel_old_timeval *stamp)
{
- *stamp = ns_to_kernel_old_timeval(skb->tstamp);
+ *stamp = ns_to_kernel_old_timeval(skb_get_ktime(skb));
}
static inline void skb_get_new_timestamp(const struct sk_buff *skb,
struct __kernel_sock_timeval *stamp)
{
- struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 ts = ktime_to_timespec64(skb_get_ktime(skb));
stamp->tv_sec = ts.tv_sec;
stamp->tv_usec = ts.tv_nsec / 1000;
@@ -3840,7 +3839,7 @@ static inline void skb_get_new_timestamp(const struct sk_buff *skb,
static inline void skb_get_timestampns(const struct sk_buff *skb,
struct __kernel_old_timespec *stamp)
{
- struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 ts = ktime_to_timespec64(skb_get_ktime(skb));
stamp->tv_sec = ts.tv_sec;
stamp->tv_nsec = ts.tv_nsec;
@@ -3849,7 +3848,7 @@ static inline void skb_get_timestampns(const struct sk_buff *skb,
static inline void skb_get_new_timestampns(const struct sk_buff *skb,
struct __kernel_timespec *stamp)
{
- struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 ts = ktime_to_timespec64(skb_get_ktime(skb));
stamp->tv_sec = ts.tv_sec;
stamp->tv_nsec = ts.tv_nsec;
@@ -3857,12 +3856,12 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb,
static inline void __net_timestamp(struct sk_buff *skb)
{
- skb->tstamp = ktime_get_real();
+ skb->tstamp = ktime_get_ns();
}
static inline ktime_t net_timedelta(ktime_t t)
{
- return ktime_sub(ktime_get_real(), t);
+ return ktime_sub(ktime_get_ns(), t);
}
static inline ktime_t net_invalid_timestamp(void)
@@ -4726,7 +4725,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
#ifdef CONFIG_NET_REDIRECT
skb->from_ingress = from_ingress;
if (skb->from_ingress)
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
#endif
}
@@ -149,6 +149,11 @@ static inline ktime_t ktime_mono_to_real(ktime_t mono)
return ktime_mono_to_any(mono, TK_OFFS_REAL);
}
+static inline ktime_t ktime_mono_to_real_cond(ktime_t mono)
+{
+ return mono ? ktime_mono_to_any(mono, TK_OFFS_REAL) : 0;
+}
+
static inline u64 ktime_get_ns(void)
{
return ktime_to_ns(ktime_get());
@@ -184,7 +184,7 @@ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
*offload);
void taprio_offload_free(struct tc_taprio_qopt_offload *offload);
-/* Ensure skb_mstamp_ns, which might have been populated with the txtime, is
+/* Ensure tstamp, which might have been populated with the txtime, is
* not mistaken for a software timestamp, because this will otherwise prevent
* the dispatch of hardware timestamps to the socket.
*/
@@ -2603,7 +2603,7 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
- ktime_t kt = skb->tstamp;
+ ktime_t kt = skb_get_ktime(skb);
struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
/*
@@ -2640,7 +2640,7 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
__sock_recv_ts_and_drops(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
- sock_write_timestamp(sk, skb->tstamp);
+ sock_write_timestamp(sk, skb_get_ktime(skb));
else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
sock_write_timestamp(sk, 0);
}
@@ -810,13 +810,13 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
- return tcp_ns_to_ts(skb->skb_mstamp_ns);
+ return tcp_ns_to_ts(skb->tstamp);
}
/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
- return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
+ return div_u64(skb->tstamp, NSEC_PER_USEC);
}
@@ -862,7 +862,7 @@ struct tcp_skb_cb {
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
#define TCPCB_LOST 0x04 /* SKB is lost */
#define TCPCB_TAGBITS 0x07 /* All tag bits */
-#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */
+#define TCPCB_REPAIRED 0x10 /* SKB repaired (no tstamp) */
#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
TCPCB_REPAIRED)
@@ -2395,7 +2395,7 @@ static inline void tcp_add_tx_delay(struct sk_buff *skb,
const struct tcp_sock *tp)
{
if (static_branch_unlikely(&tcp_tx_delay_enabled))
- skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
+ skb->tstamp += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
}
/* Compute Earliest Departure Time for some control packets
@@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
net, sk, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
@@ -2108,7 +2108,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
dev_xmit_recursion_inc();
ret = dev_queue_xmit(skb);
@@ -2177,7 +2177,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
@@ -2275,7 +2275,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
@@ -4794,7 +4794,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (hwtstamps)
*skb_hwtstamps(skb) = *hwtstamps;
else
- skb->tstamp = ktime_get_real();
+ __net_timestamp(skb);
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
@@ -5291,6 +5291,14 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
}
EXPORT_SYMBOL(skb_try_coalesce);
+void skb_scrub_tstamp(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
+ skb->tstamp = 0;
+}
+
/**
* skb_scrub_packet - scrub an skb
*
@@ -5324,7 +5332,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
ipvs_reset(skb);
skb->mark = 0;
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
@@ -79,7 +79,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(opt->optlen))
ip_forward_options(skb);
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
return dst_output(net, sk, skb);
}
@@ -1289,7 +1289,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
copy = size_goal;
/* All packets are restored as if they have
- * already been sent. skb_mstamp_ns isn't set to
+ * already been sent. skb->tstamp isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
@@ -1754,7 +1754,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping_internal *tss)
{
if (skb->tstamp)
- tss->ts[0] = ktime_to_timespec64(skb->tstamp);
+ tss->ts[0] = ktime_to_timespec64(skb_get_ktime(skb));
else
tss->ts[0] = (struct timespec64) {0};
@@ -3928,7 +3928,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
max_t(int, 0, tp->write_seq - tp->snd_nxt));
- nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
+ nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->tstamp,
TCP_NLA_PAD);
if (ack_skb)
nla_put_u8(stats, TCP_NLA_TTL,
@@ -1253,7 +1253,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+ skb->tstamp = tp->tcp_wstamp_ns;
if (clone_it) {
oskb = skb;
@@ -1391,7 +1391,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
- /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
+ /* Leave earliest departure time in skb->tstamp */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -2615,8 +2615,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int limit;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
- /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ /* "skb->tstamp" is used as a start point for the retransmit timer */
+ skb->tstamp = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
@@ -3541,11 +3541,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
- skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
+ skb->tstamp = cookie_init_timestamp(req, now);
else
#endif
{
- skb->skb_mstamp_ns = now;
+ skb->tstamp = now;
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
@@ -3594,7 +3594,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
synack_type, &opts);
- skb->skb_mstamp_ns = now;
+ skb->tstamp = now;
tcp_add_tx_delay(skb, tp);
return skb;
@@ -3771,7 +3771,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
+ syn->tstamp = syn_data->tstamp;
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -440,7 +440,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
}
#endif
- skb->tstamp = 0;
+ skb_scrub_tstamp(skb);
return dst_output(net, sk, skb);
}
@@ -871,7 +871,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
memset(&tss, 0, sizeof(tss));
if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
- ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
+ ktime_to_timespec64_cond(skb_get_ktime(skb), tss.ts + 0))
empty = 0;
if (shhwtstamps &&
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
@@ -470,12 +470,9 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
len = svc_addr_len(svc_addr(rqstp));
rqstp->rq_addrlen = len;
- if (skb->tstamp == 0) {
- skb->tstamp = ktime_get_real();
- /* Don't enable netstamp, sunrpc doesn't
- need that much accuracy */
- }
- sock_write_timestamp(svsk->sk_sk, skb->tstamp);
+ if (skb->tstamp == 0)
+ __net_timestamp(skb);
+ sock_write_timestamp(svsk->sk_sk, skb_get_ktime(skb));
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
len = skb->len;

The TX skb->skb_mstamp_ns is used as the EDT (Earliest Departure Time) in
TCP.  skb->skb_mstamp_ns is a union member of skb->tstamp.  The mono clock
is used in EDT.

The EDT will be lost when the packet is forwarded.  For example, the TCP
sender is in its own netns and its packet will eventually be sent out from
the host eth0 like this:

                     (skb->tstamp = 0)
                            vv
tcp-sender => veth@netns => veth@hostns(rx: skb->tstamp = real_clock) => fq@eth0
                                                                      ^^
                                                             (skb->tstamp = 0)

veth@netns forward to veth@hostns:
    (a) skb->tstamp is reset to 0

RX in veth@hostns:
    (b) skb->tstamp is stamped with the real clock if skb->tstamp is 0.
        Thus, the earlier skb->tstamp reset to 0 is needed in (a).

veth@hostns forward to fq@eth0:
    (c) skb->tstamp is reset back to 0 again because fq uses the mono clock.

Resetting the skb->skb_mstamp_ns marked by the tcp-sender@netns leads to
the unstable TCP throughput issue described by Daniel in [0].

We also have a use case where a bpf program runs at ingress@veth@hostns to
set an EDT in skb->tstamp in order to limit the bandwidth usage of a
particular netns.  This EDT (set by the bpf@ingress@veth@hostns) currently
also gets reset in step (c) described above.

This patch uses the mono clock instead of the real clock for the RX
timestamp.  Before exposing the RX timestamp to userspace, skb_get_ktime()
is used to convert it from the mono clock to the real clock.  When
forwarding a packet, instead of always resetting skb->tstamp, this patch
only resets it when skb->tstamp was set by userspace (tested with
sock_flag(skb->sk, SOCK_TXTIME)).  This is done in a new helper,
skb_scrub_tstamp().

Not all real-clock RX usages of skb->tstamp have been audited and changed
yet, hence the RFC.  The RX timestamp has been in the real clock for a long
time and I have likely missed some of the reasons for that, so feedback
before converting the remaining RX usages would be useful.

[0] (slide 22): https://linuxplumbersconf.org/event/11/contributions/953/attachments/867/1658/LPC_2021_BPF_Datapath_Extensions.pdf

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/skbuff.h      | 23 +++++++++++------------
 include/linux/timekeeping.h |  5 +++++
 include/net/pkt_sched.h     |  2 +-
 include/net/sock.h          |  4 ++--
 include/net/tcp.h           |  8 ++++----
 net/bridge/br_forward.c     |  2 +-
 net/core/filter.c           |  6 +++---
 net/core/skbuff.c           | 12 ++++++++++--
 net/ipv4/ip_forward.c       |  2 +-
 net/ipv4/tcp.c              |  6 +++---
 net/ipv4/tcp_output.c       | 16 ++++++++--------
 net/ipv6/ip6_output.c       |  2 +-
 net/socket.c                |  2 +-
 net/sunrpc/svcsock.c        |  9 +++------
 14 files changed, 54 insertions(+), 45 deletions(-)
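
For reviewers, below the cut: a minimal user-space sketch (not part of this
patch; the helper names are made up for illustration) of the existing
SO_TXTIME uapi.  A socket that has done this setsockopt() is exactly the
sock_flag(sk, SOCK_TXTIME) case that skb_scrub_tstamp() treats as a
user-supplied skb->tstamp and still resets on forward; its per-packet
SCM_TXTIME value is what fq later reads from skb->tstamp as the EDT.

/*
 * Illustrative only; not introduced by this patch.
 * enable_txtime()/send_at() are hypothetical helper names.
 */
#include <linux/net_tstamp.h>	/* struct sock_txtime */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

#ifndef SO_TXTIME
#define SO_TXTIME	61
#define SCM_TXTIME	SO_TXTIME
#endif

/* Opt the socket into user-controlled departure times (sets SOCK_TXTIME). */
static int enable_txtime(int fd)
{
	struct sock_txtime cfg = {
		.clockid = CLOCK_MONOTONIC,	/* fq pacing expects the mono clock */
		.flags	 = 0,
	};

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}

/* Send one packet with an explicit departure time (ns, CLOCK_MONOTONIC). */
static ssize_t send_at(int fd, const void *buf, size_t len, uint64_t txtime_ns)
{
	char control[CMSG_SPACE(sizeof(txtime_ns))] = {0};
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	/* Attach the departure time as an SCM_TXTIME control message. */
	cm->cmsg_level	= SOL_SOCKET;
	cm->cmsg_type	= SCM_TXTIME;
	cm->cmsg_len	= CMSG_LEN(sizeof(txtime_ns));
	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

	return sendmsg(fd, &msg, 0);
}

With fq on the egress device, the mono-clock EDT set this way travels in
skb->tstamp to the qdisc; the point of the skb_scrub_tstamp() change is that
an intermediate forward no longer wipes skb->tstamp unless the socket really
did opt in as above.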
The TX skb->skb_mstamp_ns is used as the EDT (Earliest Department Time) in TCP. skb->skb_mstamp_ns is an union member of skb->tstamp. Mono clock is used in EDT. The EDT will be lost when the packet is forwarded. For example, the tcp sender is in its own netns and its packet will eventually be sent out from the host eth0 like this: (skb->tstamp = 0) vv tcp-sender => veth@netns => veth@hostns(rx: skb->tstamp = real_clock) => fq@eth0 ^^ (skb->tstamp = 0) veth@netns forward to veth@hostns: (a) skb->tstamp is reset to 0 RX in veth@hostns: (b) skb->tstamp is stamped with real clock if skb->tstamp is 0. Thus, the earlier skb->tstamp reset to 0 is needed in (a). veth@hostns forward to fq@eth0: (c) skb->tstamp is reset back to 0 again because fq is using mono clock. Resetting the skb->skb_mstamp_ns marked by the tcp-sender@netns leads to unstable TCP throughput issue described by Daniel in [0]. We also have a use case that a bpf runs at ingress@veth@hostns to set EDT in skb->tstamp to limit the bandwidth usage of a particular netns. This EDT (set by the bpf@ingress@veth@hostns) currently also gets reset in step (c) as described above. This patch tries to use mono clock instead of real clock in the RX timestamp. Before exposing the RX timestamp to the userspace, skb_get_ktime() is used to convert the mono clock to real clock. When forwarding a packet, instead of always resetting skb->tstamp, this patch only resets when the skb->tstamp is marked by userspace (by testing sock_flag(skb->sk, SOCK_TXTIME)). It is done in a new helper skb_scrub_tstamp(). Not all RX real clock usages in skb->tstamp have been audited and changed, so under RFC. The RX timestamp has been in real clock for a long time and it is likely I missed some of the reasons, so it will be useful to get feedback before converting the remaining RX usages. [0] (slide 22): https://linuxplumbersconf.org/event/11/contributions/953/attachments/867/1658/LPC_2021_BPF_Datapath_Extensions.pdf Signed-off-by: Martin KaFai Lau <kafai@fb.com> --- include/linux/skbuff.h | 23 +++++++++++------------ include/linux/timekeeping.h | 5 +++++ include/net/pkt_sched.h | 2 +- include/net/sock.h | 4 ++-- include/net/tcp.h | 8 ++++---- net/bridge/br_forward.c | 2 +- net/core/filter.c | 6 +++--- net/core/skbuff.c | 12 ++++++++++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/tcp.c | 6 +++--- net/ipv4/tcp_output.c | 16 ++++++++-------- net/ipv6/ip6_output.c | 2 +- net/socket.c | 2 +- net/sunrpc/svcsock.c | 9 +++------ 14 files changed, 54 insertions(+), 45 deletions(-)