
[v4,net-next,3/8] net: Set skb->mono_delivery_time and clear it after sch_handle_ingress()

Message ID 20220211071251.887078-1-kafai@fb.com
State Changes Requested
Delegated to: Netdev Maintainers
Series Preserve mono delivery time (EDT) in skb->tstamp

Checks

Context                  Check     Description
netdev/tree_selection    success   Clearly marked for net-next
netdev/apply             fail      Patch does not apply to net-next

Commit Message

Martin KaFai Lau Feb. 11, 2022, 7:12 a.m. UTC
This patch sets the skb->mono_delivery_time bit to flag that skb->tstamp
holds the mono delivery_time (EDT) instead of the (rcv) timestamp.

skb_clear_delivery_time() is added to clear the delivery_time and set
skb->tstamp back to the (rcv) timestamp.  It is called after
sch_handle_ingress() so that a tc-bpf prog can still use bpf_redirect_*()
to forward the skb to the egress of another iface while keeping the EDT
delivery_time.
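
As an illustration, a minimal tc-bpf sketch of such a redirect (not part
of this patch; TARGET_IFINDEX is a hypothetical iface index chosen for
the example):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define TARGET_IFINDEX 5	/* hypothetical egress iface */

SEC("tc")
int redirect_keep_edt(struct __sk_buff *skb)
{
	/* skb->tstamp still holds the mono delivery_time (EDT) here,
	 * because skb_clear_delivery_time() runs after
	 * sch_handle_ingress().  The EDT therefore survives the
	 * redirect and fq on the target iface can keep pacing it.
	 */
	return bpf_redirect(TARGET_IFINDEX, 0 /* to egress */);
}

char _license[] SEC("license") = "GPL";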

The next patch will postpone the skb_clear_delivery_time() until the
stack learns that the skb is being delivered locally.  That will also
make the other kernel forwarding paths (ip[6]_forward) able to keep
the delivery_time.  Thus, like the previous patches on using the
skb->mono_delivery_time bit, the skb_clear_delivery_time() call is not
guarded by CONFIG_NET_INGRESS, to avoid too much code churn within
this set.

Before sch_handle_ingress(), another case that needs to clear the
delivery_time is network tapping (e.g. af_packet by tcpdump).  Regardless
of tapping at the ingress or egress, the tapped skb is received by the
af_packet socket, so it is ingress to the af_packet socket and the
(rcv) timestamp is expected.

When tapping at egress, dev_queue_xmit_nit() is used.  It already
expects that skb->tstamp may hold a delivery_time, so it does
skb_clone()+net_timestamp_set() to ensure the cloned skb has the
(rcv) timestamp before passing it to the af_packet sk.  This patch
only adds the clearing of the skb->mono_delivery_time bit in
net_timestamp_set().
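
For reference, a simplified sketch of that clone+timestamp pattern in
dev_queue_xmit_nit() (the function name deliver_to_tap and the reduced
signature are illustrative; tap-list iteration and other details are
omitted):

static void deliver_to_tap(struct sk_buff *skb, struct packet_type *ptype,
			   struct net_device *orig_dev)
{
	struct sk_buff *skb2;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (!skb2)
		return;

	/* Overwrite any egress delivery_time with the (rcv) timestamp;
	 * with this patch it also clears skb->mono_delivery_time.
	 */
	net_timestamp_set(skb2);

	ptype->func(skb2, skb->dev, ptype, orig_dev);
}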

When tapping at ingress, the code currently expects skb->tstamp to be
either 0 or the (rcv) timestamp.  Meaning, the tapping-at-ingress path
already expects that skb->tstamp could be 0, and it will get the (rcv)
timestamp by ktime_get_real() when needed.

There are two cases for tapping at ingress:

In one case, af_packet queues the skb to its sk_receive_queue.  The skb
is either not shared or a new clone is created.  skb_clear_delivery_time()
is called to clear the delivery_time (if any) before the skb is queued to
the sk_receive_queue.

In the other case, the ingress skb is directly copied to the rx_ring
and tpacket_get_timestamp() is used to get the (rcv) timestamp.
tpacket_get_timestamp() now uses skb_tstamp(), which checks the
skb->mono_delivery_time bit before returning skb->tstamp.  As mentioned
earlier, tapping at ingress already expects that the skb may not have
the (rcv) timestamp (because no sk has asked for it) and handles this
case by directly calling ktime_get_real().
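
A sketch of that fallback as it sits in tpacket_rcv() (simplified from
the kernel source; ts_status, ts and po come from the surrounding
function):

	/* If no usable software timestamp is found (skb_tstamp()
	 * returned 0 because the tstamp held a delivery_time, or no sk
	 * asked for timestamping), take the (rcv) time directly.
	 */
	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		ktime_get_real_ts64(&ts);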

In __skb_tstamp_tx(), the egress skb is cloned and the clone is queued
to the sk_error_queue.  The outgoing skb may carry the mono delivery_time
while the (rcv) timestamp is expected for the clone, so the
skb->mono_delivery_time bit is also cleared from the clone.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/skbuff.h | 27 +++++++++++++++++++++++++--
 net/core/dev.c         |  5 ++++-
 net/core/skbuff.c      |  6 ++++--
 net/ipv4/ip_output.c   |  3 +--
 net/packet/af_packet.c |  4 +++-
 5 files changed, 37 insertions(+), 8 deletions(-)

Comments

Daniel Borkmann Feb. 15, 2022, 9:04 p.m. UTC | #1
On 2/11/22 8:12 AM, Martin KaFai Lau wrote:
[...]
> +
> +DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);
> +
> +/* It is used in the ingress path to clear the delivery_time.
> + * If needed, set the skb->tstamp to the (rcv) timestamp.
> + */
> +static inline void skb_clear_delivery_time(struct sk_buff *skb)
> +{
> +	if (unlikely(skb->mono_delivery_time)) {
> +		skb->mono_delivery_time = 0;
> +		if (static_branch_unlikely(&netstamp_needed_key))
> +			skb->tstamp = ktime_get_real();
> +		else
> +			skb->tstamp = 0;
> +	}
>   }
>   
>   static inline void skb_clear_tstamp(struct sk_buff *skb)
> @@ -3946,6 +3961,14 @@ static inline void skb_clear_tstamp(struct sk_buff *skb)
>   	skb->tstamp = 0;
>   }
>   
> +static inline ktime_t skb_tstamp(const struct sk_buff *skb)
> +{
> +	if (unlikely(skb->mono_delivery_time))
> +		return 0;
> +
> +	return skb->tstamp;
> +}
> +
>   static inline u8 skb_metadata_len(const struct sk_buff *skb)
>   {

Just a small nit, but I don't think the conditional for
skb->mono_delivery_time should be marked unlikely(), here and in other
patches as well.  For container workloads this is very likely.
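
For reference, the helper with the annotation dropped as suggested would
look like this (a sketch of the proposed change, not the posted code):

static inline void skb_clear_delivery_time(struct sk_buff *skb)
{
	if (skb->mono_delivery_time) {
		skb->mono_delivery_time = 0;
		if (static_branch_unlikely(&netstamp_needed_key))
			skb->tstamp = ktime_get_real();
		else
			skb->tstamp = 0;
	}
}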

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 07e618f8b41a..0e09e75fa787 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3934,8 +3934,23 @@  static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
 					 bool mono)
 {
 	skb->tstamp = kt;
-	/* Setting mono_delivery_time will be enabled later */
-	skb->mono_delivery_time = 0;
+	skb->mono_delivery_time = kt && mono;
+}
+
+DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);
+
+/* It is used in the ingress path to clear the delivery_time.
+ * If needed, set the skb->tstamp to the (rcv) timestamp.
+ */
+static inline void skb_clear_delivery_time(struct sk_buff *skb)
+{
+	if (unlikely(skb->mono_delivery_time)) {
+		skb->mono_delivery_time = 0;
+		if (static_branch_unlikely(&netstamp_needed_key))
+			skb->tstamp = ktime_get_real();
+		else
+			skb->tstamp = 0;
+	}
 }
 
 static inline void skb_clear_tstamp(struct sk_buff *skb)
@@ -3946,6 +3961,14 @@  static inline void skb_clear_tstamp(struct sk_buff *skb)
 	skb->tstamp = 0;
 }
 
+static inline ktime_t skb_tstamp(const struct sk_buff *skb)
+{
+	if (unlikely(skb->mono_delivery_time))
+		return 0;
+
+	return skb->tstamp;
+}
+
 static inline u8 skb_metadata_len(const struct sk_buff *skb)
 {
 	return skb_shinfo(skb)->meta_len;
diff --git a/net/core/dev.c b/net/core/dev.c
index f5ef51601081..f41707ab2fb9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2000,7 +2000,8 @@  void net_dec_egress_queue(void)
 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
 
-static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
 #ifdef CONFIG_JUMP_LABEL
 static atomic_t netstamp_needed_deferred;
 static atomic_t netstamp_wanted;
@@ -2061,6 +2062,7 @@  EXPORT_SYMBOL(net_disable_timestamp);
 static inline void net_timestamp_set(struct sk_buff *skb)
 {
 	skb->tstamp = 0;
+	skb->mono_delivery_time = 0;
 	if (static_branch_unlikely(&netstamp_needed_key))
 		__net_timestamp(skb);
 }
@@ -5220,6 +5222,7 @@  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 			goto out;
 	}
 #endif
+	skb_clear_delivery_time(skb);
 	skb_reset_redirect(skb);
 skip_classify:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3e3da8fdf8f5..93dc763da8cb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4817,10 +4817,12 @@  void __skb_tstamp_tx(struct sk_buff *orig_skb,
 		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
 	}
 
-	if (hwtstamps)
+	if (hwtstamps) {
 		*skb_hwtstamps(skb) = *hwtstamps;
-	else
+	} else {
 		skb->tstamp = ktime_get_real();
+		skb->mono_delivery_time = 0;
+	}
 
 	__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7af5d1849bc9..bfe08feb5d82 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1728,8 +1728,7 @@  void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 								arg->csum));
 		nskb->ip_summed = CHECKSUM_NONE;
-		/* Setting mono_delivery_time will be enabled later */
-		nskb->mono_delivery_time = 0;
+		nskb->mono_delivery_time = !!transmit_time;
 		ip_push_pending_frames(sk, &fl4);
 	}
 out:
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ab87f22cc7ec..1b93ce1a5600 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -460,7 +460,7 @@  static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
 		return TP_STATUS_TS_RAW_HARDWARE;
 
 	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
-	    ktime_to_timespec64_cond(skb->tstamp, ts))
+	    ktime_to_timespec64_cond(skb_tstamp(skb), ts))
 		return TP_STATUS_TS_SOFTWARE;
 
 	return 0;
@@ -2199,6 +2199,7 @@  static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	spin_lock(&sk->sk_receive_queue.lock);
 	po->stats.stats1.tp_packets++;
 	sock_skb_set_dropcount(sk, skb);
+	skb_clear_delivery_time(skb);
 	__skb_queue_tail(&sk->sk_receive_queue, skb);
 	spin_unlock(&sk->sk_receive_queue.lock);
 	sk->sk_data_ready(sk);
@@ -2377,6 +2378,7 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	po->stats.stats1.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
+		skb_clear_delivery_time(copy_skb);
 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
 	}
 	spin_unlock(&sk->sk_receive_queue.lock);