[net-next,5/8] gve: Add optional metadata descriptor type GVE_TXD_MTD

Message ID 20211216004652.1021911-6-jeroendb@google.com (mailing list archive)
State Accepted
Commit 497dbb2b97a0642ecd478d441643bf26804d5f96
Delegated to: Netdev Maintainers
Series: gve improvements

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 3 maintainers not CCed: csully@google.com bcf@google.com xliutaox@google.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Jeroen de Borst Dec. 16, 2021, 12:46 a.m. UTC
From: Willem de Bruijn <willemb@google.com>

Allow drivers to pass metadata along with packet data to the device.
Introduce a new metadata descriptor type

* GVE_TXD_MTD

This descriptor is optional. If present, it immediately follows the
packet descriptor and precedes the segment descriptor(s).

This descriptor may be repeated: multiple metadata descriptors may
follow the packet descriptor. There is no immediate use for this; it
is for future proofing. At present devices allow only one MTD
descriptor. The resulting ring layout is sketched below.

The lower four bits of the type_flags field encode GVE_TXD_MTD.
The upper four bits of the type_flags field encode a *sub*type.
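
In the driver the two halves are simply OR-ed together when the
descriptor is filled, as gve_tx_fill_mtd_desc() in the patch below
does:

  /* type and subtype share the one-byte type_flags field */
  mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;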

Introduce one such metadata descriptor subtype

* GVE_MTD_SUBTYPE_PATH

This shares path information with the device for network failure
discovery and robust response:

Linux derives the IPv6 flowlabel and ECMP multipath hash from
sk->sk_txhash, and updates this field on error with
sk_rethink_txhash. Allow the host stack to do the same. Pass the
tx_hash value if set. Also communicate whether the path hash is set,
or more exactly, what its type is. Define two common types (the
selection is sketched after the list):

  GVE_MTD_PATH_HASH_NONE
  GVE_MTD_PATH_HASH_L4
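
A minimal sketch of that selection, mirroring the !!skb->l4_hash test
in the patch below (the helper name is illustrative, not part of the
patch; the driver only emits an MTD descriptor at all when
skb->l4_hash is set):

  /* Illustrative: pick the MTD hash type for an outgoing skb. */
  static u8 gve_mtd_hash_type(const struct sk_buff *skb)
  {
  	if (skb->l4_hash)
  		return GVE_MTD_PATH_HASH_L4;
  	return GVE_MTD_PATH_HASH_NONE;
  }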

Concrete examples of error conditions that are resolved are
mentioned in the commits that add sk_rethink_txhash calls, such as
commit 7788174e8726 ("tcp: change IPv6 flow-label upon receiving
spurious retransmission").

Experimental results mirror what the theory suggests: where IPv6
FlowLabel is included in path selection (e.g., LAG/ECMP), flowlabel
rotation on TCP timeout avoids the vast majority of TCP disconnects
that would otherwise have occurred during link failures in long-haul
backbones, when an alternative path is available.

Rotation can be applied to various bad-connection signals, such as
timeouts and spurious retransmissions. In aggregate, such flow-level
signals can help locate network issues. Define initial common states
(a hypothetical mapping is sketched after the list):

  GVE_MTD_PATH_STATE_DEFAULT
  GVE_MTD_PATH_STATE_TIMEOUT
  GVE_MTD_PATH_STATE_CONGESTION
  GVE_MTD_PATH_STATE_RETRANSMIT
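
This patch itself only ever reports GVE_MTD_PATH_STATE_DEFAULT (see
gve_tx_fill_mtd_desc() below). A hypothetical future mapping from
stack signals to these states, with entirely made-up SIGNAL_* names,
might look like:

  /* Hypothetical, not in this patch: map a bad-connection
   * signal to an MTD path state. SIGNAL_* names are invented.
   */
  static u8 gve_mtd_path_state(int signal)
  {
  	switch (signal) {
  	case SIGNAL_TIMEOUT:
  		return GVE_MTD_PATH_STATE_TIMEOUT;
  	case SIGNAL_CONGESTION:
  		return GVE_MTD_PATH_STATE_CONGESTION;
  	case SIGNAL_SPURIOUS_RETRANSMIT:
  		return GVE_MTD_PATH_STATE_RETRANSMIT;
  	default:
  		return GVE_MTD_PATH_STATE_DEFAULT;
  	}
  }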

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David Awogbemila <awogbemila@google.com>
---
 drivers/net/ethernet/google/gve/gve.h      |  1 +
 drivers/net/ethernet/google/gve/gve_desc.h | 20 ++++++
 drivers/net/ethernet/google/gve/gve_tx.c   | 73 ++++++++++++++++------
 3 files changed, 74 insertions(+), 20 deletions(-)

Patch

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index b6bd8f679127..ed43b8ece5a2 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -229,6 +229,7 @@  struct gve_rx_ring {
 /* A TX desc ring entry */
 union gve_tx_desc {
 	struct gve_tx_pkt_desc pkt; /* first desc for a packet */
+	struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */
 	struct gve_tx_seg_desc seg; /* subsequent descs for a packet */
 };
 
diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h
index 4d225a18d8ce..f4ae9e19b844 100644
--- a/drivers/net/ethernet/google/gve/gve_desc.h
+++ b/drivers/net/ethernet/google/gve/gve_desc.h
@@ -33,6 +33,14 @@  struct gve_tx_pkt_desc {
 	__be64	seg_addr;  /* Base address (see note) of this segment */
 } __packed;
 
+struct gve_tx_mtd_desc {
+	u8      type_flags;     /* type is lower 4 bits, subtype upper  */
+	u8      path_state;     /* state is lower 4 bits, hash type upper */
+	__be16  reserved0;
+	__be32  path_hash;
+	__be64  reserved1;
+} __packed;
+
 struct gve_tx_seg_desc {
 	u8	type_flags;	/* type is lower 4 bits, flags upper	*/
 	u8	l3_offset;	/* TSO: 2 byte units to start of IPH	*/
@@ -46,6 +54,7 @@  struct gve_tx_seg_desc {
 #define	GVE_TXD_STD		(0x0 << 4) /* Std with Host Address	*/
 #define	GVE_TXD_TSO		(0x1 << 4) /* TSO with Host Address	*/
 #define	GVE_TXD_SEG		(0x2 << 4) /* Seg with Host Address	*/
+#define	GVE_TXD_MTD		(0x3 << 4) /* Metadata			*/
 
 /* GVE Transmit Descriptor Flags for Std Pkts */
 #define	GVE_TXF_L4CSUM	BIT(0)	/* Need csum offload */
@@ -54,6 +63,17 @@  struct gve_tx_seg_desc {
 /* GVE Transmit Descriptor Flags for TSO Segs */
 #define	GVE_TXSF_IPV6	BIT(1)	/* IPv6 TSO */
 
+/* GVE Transmit Descriptor Options for MTD Segs */
+#define GVE_MTD_SUBTYPE_PATH		0
+
+#define GVE_MTD_PATH_STATE_DEFAULT	0
+#define GVE_MTD_PATH_STATE_TIMEOUT	1
+#define GVE_MTD_PATH_STATE_CONGESTION	2
+#define GVE_MTD_PATH_STATE_RETRANSMIT	3
+
+#define GVE_MTD_PATH_HASH_NONE         (0x0 << 4)
+#define GVE_MTD_PATH_HASH_L4           (0x1 << 4)
+
 /* GVE Receive Packet Descriptor */
 /* The start of an ethernet packet comes 2 bytes into the rx buffer.
  * gVNIC adds this padding so that both the DMA and the L3/4 protocol header
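
Given the gve_tx_mtd_desc layout above, decoding the metadata is two
nibble masks plus a byte-order conversion. A minimal illustrative
sketch (not part of the patch; the masks follow the shifts in the
GVE_MTD_* macros):

  /* Illustrative decode of a filled gve_tx_mtd_desc */
  u8 state      = mtd->path_state & 0x0f;  /* GVE_MTD_PATH_STATE_* */
  u8 hash_type  = mtd->path_state & 0xf0;  /* GVE_MTD_PATH_HASH_*  */
  u32 path_hash = be32_to_cpu(mtd->path_hash);
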
diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c
index a9cb241fedf4..4888bf05fbed 100644
--- a/drivers/net/ethernet/google/gve/gve_tx.c
+++ b/drivers/net/ethernet/google/gve/gve_tx.c
@@ -296,11 +296,14 @@  static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
 	return bytes;
 }
 
-/* The most descriptors we could need is MAX_SKB_FRAGS + 3 : 1 for each skb frag,
- * +1 for the skb linear portion, +1 for when tcp hdr needs to be in separate descriptor,
- * and +1 if the payload wraps to the beginning of the FIFO.
+/* The most descriptors we could need is MAX_SKB_FRAGS + 4 :
+ * 1 for each skb frag
+ * 1 for the skb linear portion
+ * 1 for when tcp hdr needs to be in separate descriptor
+ * 1 if the payload wraps to the beginning of the FIFO
+ * 1 for metadata descriptor
  */
-#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 3)
+#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4)
 static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
 {
 	if (info->skb) {
@@ -395,6 +398,19 @@  static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
 	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
 }
 
+static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
+				 struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));
+
+	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
+	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
+				   GVE_MTD_PATH_HASH_L4;
+	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
+	mtd_desc->mtd.reserved0 = 0;
+	mtd_desc->mtd.reserved1 = 0;
+}
+
 static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
 				 struct sk_buff *skb, bool is_gso,
 				 u16 len, u64 addr)
@@ -426,6 +442,7 @@  static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
 	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
 	union gve_tx_desc *pkt_desc, *seg_desc;
 	struct gve_tx_buffer_state *info;
+	int mtd_desc_nr = !!skb->l4_hash;
 	bool is_gso = skb_is_gso(skb);
 	u32 idx = tx->req & tx->mask;
 	int payload_iov = 2;
@@ -457,7 +474,7 @@  static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
 					   &info->iov[payload_iov]);
 
 	gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
-			     1 + payload_nfrags, hlen,
+			     1 + mtd_desc_nr + payload_nfrags, hlen,
 			     info->iov[hdr_nfrags - 1].iov_offset);
 
 	skb_copy_bits(skb, 0,
@@ -468,8 +485,13 @@  static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
 				info->iov[hdr_nfrags - 1].iov_len);
 	copy_offset = hlen;
 
+	if (mtd_desc_nr) {
+		next_idx = (tx->req + 1) & tx->mask;
+		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
+	}
+
 	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
-		next_idx = (tx->req + 1 + i - payload_iov) & tx->mask;
+		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
 		seg_desc = &tx->desc[next_idx];
 
 		gve_tx_fill_seg_desc(seg_desc, skb, is_gso,
@@ -485,16 +507,17 @@  static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
 		copy_offset += info->iov[i].iov_len;
 	}
 
-	return 1 + payload_nfrags;
+	return 1 + mtd_desc_nr + payload_nfrags;
 }
 
 static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
 				  struct sk_buff *skb)
 {
 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
-	int hlen, payload_nfrags, l4_hdr_offset;
-	union gve_tx_desc *pkt_desc, *seg_desc;
+	int hlen, num_descriptors, l4_hdr_offset;
+	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
 	struct gve_tx_buffer_state *info;
+	int mtd_desc_nr = !!skb->l4_hash;
 	bool is_gso = skb_is_gso(skb);
 	u32 idx = tx->req & tx->mask;
 	u64 addr;
@@ -523,23 +546,30 @@  static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
 	dma_unmap_len_set(info, len, len);
 	dma_unmap_addr_set(info, dma, addr);
 
-	payload_nfrags = shinfo->nr_frags;
+	num_descriptors = 1 + shinfo->nr_frags;
+	if (hlen < len)
+		num_descriptors++;
+	if (mtd_desc_nr)
+		num_descriptors++;
+
+	gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
+			     num_descriptors, hlen, addr);
+
+	if (mtd_desc_nr) {
+		idx = (idx + 1) & tx->mask;
+		mtd_desc = &tx->desc[idx];
+		gve_tx_fill_mtd_desc(mtd_desc, skb);
+	}
+
 	if (hlen < len) {
 		/* For gso the rest of the linear portion of the skb needs to
 		 * be in its own descriptor.
 		 */
-		payload_nfrags++;
-		gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
-				     1 + payload_nfrags, hlen, addr);
-
 		len -= hlen;
 		addr += hlen;
-		idx = (tx->req + 1) & tx->mask;
+		idx = (idx + 1) & tx->mask;
 		seg_desc = &tx->desc[idx];
 		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
-	} else {
-		gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
-				     1 + payload_nfrags, hlen, addr);
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++) {
@@ -560,11 +590,14 @@  static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
 		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
 	}
 
-	return 1 + payload_nfrags;
+	return num_descriptors;
 
 unmap_drop:
-	i += (payload_nfrags == shinfo->nr_frags ? 1 : 2);
+	i += num_descriptors - shinfo->nr_frags;
 	while (i--) {
+		/* Skip metadata descriptor, if set */
+		if (i == 1 && mtd_desc_nr == 1)
+			continue;
 		idx--;
 		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
 	}
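
A note on the reworked unmap_drop path: the MTD descriptor is counted
in num_descriptors but, unlike the packet and segment descriptors,
gve_tx_fill_mtd_desc() creates no DMA mapping for it, so the unwind
must skip it. Assuming hlen == len (no extra linear segment), a
failure while mapping frag f leaves i == f, and then

  i += num_descriptors - shinfo->nr_frags;  /* + pkt desc + optional mtd */

makes the countdown cover every descriptor written so far; counting
from the packet descriptor at i == 0, the i == 1 slot is exactly the
optional MTD descriptor, which is the one skipped.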