
[net-next,v14,06/13] rtase: Implement .ndo_start_xmit function

Message ID 20231208094733.1671296-7-justinlai0215@realtek.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series Add Realtek automotive PCIe driver

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 304 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Justin Lai Dec. 8, 2023, 9:47 a.m. UTC
Implement the .ndo_start_xmit function, which fills the information of
the packet to be transmitted into the tx descriptors; the hardware then
transmits the packet according to those descriptors.
In addition, implement the tx_handler function to reclaim completed tx
descriptors so that they can be reused.

Signed-off-by: Justin Lai <justinlai0215@realtek.com>
---
 .../net/ethernet/realtek/rtase/rtase_main.c   | 286 ++++++++++++++++++
 1 file changed, 286 insertions(+)
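
For orientation, the flow the commit message describes follows the standard .ndo_start_xmit contract: map the packet, fill a descriptor, make those writes visible before handing ownership to the hardware, ring the doorbell, and return NETDEV_TX_OK (NETDEV_TX_BUSY only on the exceptional ring-full path). The sketch below is a generic outline under those assumptions, not this driver's code; my_priv, my_ring and the my_* helpers are hypothetical placeholders.

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct my_priv *tp = netdev_priv(dev);          /* hypothetical private data */
        struct my_ring *ring = &tp->tx_ring[skb_get_queue_mapping(skb)];
        dma_addr_t mapping;

        /* The queue should have been stopped before the ring got this full,
         * so NETDEV_TX_BUSY is the exceptional path, not the normal one.
         */
        if (unlikely(!my_tx_avail(ring)))
                return NETDEV_TX_BUSY;

        /* Linear data only; scatter-gather fragments are omitted for brevity. */
        mapping = dma_map_single(&tp->pdev->dev, skb->data,
                                 skb_headlen(skb), DMA_TO_DEVICE);
        if (unlikely(dma_mapping_error(&tp->pdev->dev, mapping))) {
                dev_kfree_skb_any(skb);                 /* drop on mapping failure */
                dev->stats.tx_dropped++;
                return NETDEV_TX_OK;
        }

        my_fill_tx_desc(ring, skb, mapping);            /* hypothetical: write len/addr/flags */
        wmb();                                          /* descriptor fields before the OWN bit */
        my_set_desc_own(ring);                          /* hand the descriptor to the hardware */
        skb_tx_timestamp(skb);
        my_kick_hw(tp, ring);                           /* doorbell: tell the NIC to fetch it */

        return NETDEV_TX_OK;
}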

Comments

Jakub Kicinski Dec. 12, 2023, 7:32 p.m. UTC | #1
On Fri, 8 Dec 2023 17:47:26 +0800 Justin Lai wrote:
> +static int tx_handler(struct rtase_ring *ring, int budget)

I don't see how this is called, the way you split the submission makes
it a bit hard to review, oh well. Anyway - if you pass the NAPI budget
here - that's not right, it may be 0, and you'd loop forever.
For Tx - you should try to reap some fixed number of packets, say 128,
the budget is for Rx, not for Tx.

> +	const struct rtase_private *tp = ring->ivec->tp;
> +	struct net_device *dev = tp->dev;
> +	int workdone = 0;
> +	u32 dirty_tx;
> +	u32 tx_left;
> +
> +	dirty_tx = ring->dirty_idx;
> +	tx_left = READ_ONCE(ring->cur_idx) - dirty_tx;
> +
> +	while (tx_left > 0) {
> +		u32 entry = dirty_tx % NUM_DESC;
> +		struct tx_desc *desc = ring->desc +
> +				       sizeof(struct tx_desc) * entry;
> +		u32 len = ring->mis.len[entry];
> +		u32 status;
> +
> +		status = le32_to_cpu(desc->opts1);
> +
> +		if (status & DESC_OWN)
> +			break;
> +
> +		rtase_unmap_tx_skb(tp->pdev, len, desc);
> +		ring->mis.len[entry] = 0;
> +		if (ring->skbuff[entry]) {
> +			dev_consume_skb_any(ring->skbuff[entry]);

napi_consume_skb, assuming you call this from NAPI

> +			ring->skbuff[entry] = NULL;
> +		}
> +
> +		dev->stats.tx_bytes += len;
> +		dev->stats.tx_packets++;
> +		dirty_tx++;
> +		tx_left--;
> +		workdone++;
> +
> +		if (workdone == budget)
> +			break;
> +	}
> +
> +	if (ring->dirty_idx != dirty_tx) {
> +		WRITE_ONCE(ring->dirty_idx, dirty_tx);
> +
> +		if (__netif_subqueue_stopped(dev, ring->index) &&
> +		    rtase_tx_avail(ring))
> +			netif_start_subqueue(dev, ring->index);

Please use the start / stop macros from include/net/netdev_queues.h
I'm pretty sure the current code is racy.

> +		if (ring->cur_idx != dirty_tx)
> +			rtase_w8(tp, RTASE_TPPOLL, BIT(ring->index));
> +	}
> +
> +	return workdone;
> +}

> +	/* multiqueues */
> +	q_idx = skb_get_queue_mapping(skb);
> +	ring = &tp->tx_ring[q_idx];

As Paolo pointed out elsewhere you seem to only support one queue.
Remove this indirection, please, and always use queue 0, otherwise
it's a bit confusing.
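
To make the two suggestions above concrete, a completion loop shaped the way Jakub describes might look roughly as follows. RTASE_TX_BUDGET and my_unmap_and_count() are illustrative placeholders rather than the driver's identifiers; the points shown are that the reap cap is a fixed driver constant instead of the NAPI budget, and that napi_consume_skb() replaces dev_consume_skb_any() (a zero second argument tells it the caller is not in NAPI context).

#define RTASE_TX_BUDGET 128     /* fixed Tx reap cap, per the review; illustrative value */

static int my_tx_handler(struct rtase_ring *ring, int napi_budget)
{
        u32 dirty = ring->dirty_idx;
        int done = 0;

        while (done < RTASE_TX_BUDGET && READ_ONCE(ring->cur_idx) != dirty) {
                u32 entry = dirty % NUM_DESC;
                struct tx_desc *desc = ring->desc + sizeof(*desc) * entry;

                if (le32_to_cpu(desc->opts1) & DESC_OWN)        /* hardware not done yet */
                        break;

                my_unmap_and_count(ring, entry);        /* hypothetical: dma unmap + stats */

                if (ring->skbuff[entry]) {
                        /* Batched free when called from NAPI; napi_budget == 0
                         * means we were not called from the NAPI poll loop.
                         */
                        napi_consume_skb(ring->skbuff[entry], napi_budget);
                        ring->skbuff[entry] = NULL;
                }

                dirty++;
                done++;
        }

        WRITE_ONCE(ring->dirty_idx, dirty);     /* publish progress to the xmit path */
        return done;
}
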
Justin Lai Dec. 14, 2023, 1 p.m. UTC | #2
> 
> On Fri, 8 Dec 2023 17:47:26 +0800 Justin Lai wrote:
> > +static int tx_handler(struct rtase_ring *ring, int budget)
> 
> I don't see how this is called, the way you split the submission makes it a bit
> hard to review, oh well. Anyway - if you pass the NAPI budget here - that's not
> right, it may be 0, and you'd loop forever.
> For Tx - you should try to reap some fixed number of packets, say 128, the
> budget is for Rx, not for Tx.

Even if the budget is 0, this function will not loop forever, it will just run all tx_left.
Or what changes would you like us to make?
> 
> > +     const struct rtase_private *tp = ring->ivec->tp;
> > +     struct net_device *dev = tp->dev;
> > +     int workdone = 0;
> > +     u32 dirty_tx;
> > +     u32 tx_left;
> > +
> > +     dirty_tx = ring->dirty_idx;
> > +     tx_left = READ_ONCE(ring->cur_idx) - dirty_tx;
> > +
> > +     while (tx_left > 0) {
> > +             u32 entry = dirty_tx % NUM_DESC;
> > +             struct tx_desc *desc = ring->desc +
> > +                                    sizeof(struct tx_desc) * entry;
> > +             u32 len = ring->mis.len[entry];
> > +             u32 status;
> > +
> > +             status = le32_to_cpu(desc->opts1);
> > +
> > +             if (status & DESC_OWN)
> > +                     break;
> > +
> > +             rtase_unmap_tx_skb(tp->pdev, len, desc);
> > +             ring->mis.len[entry] = 0;
> > +             if (ring->skbuff[entry]) {
> > +                     dev_consume_skb_any(ring->skbuff[entry]);
> 
> napi_consume_skb, assuming you call this from NAPI

Ok, I will modify it.
> 
> > +                     ring->skbuff[entry] = NULL;
> > +             }
> > +
> > +             dev->stats.tx_bytes += len;
> > +             dev->stats.tx_packets++;
> > +             dirty_tx++;
> > +             tx_left--;
> > +             workdone++;
> > +
> > +             if (workdone == budget)
> > +                     break;
> > +     }
> > +
> > +     if (ring->dirty_idx != dirty_tx) {
> > +             WRITE_ONCE(ring->dirty_idx, dirty_tx);
> > +
> > +             if (__netif_subqueue_stopped(dev, ring->index) &&
> > +                 rtase_tx_avail(ring))
> > +                     netif_start_subqueue(dev, ring->index);
> 
> Please use the start / stop macros from include/net/netdev_queues.h I'm
> pretty sure the current code is racy.
> 

Ok, I will use the macros from include/net/netdev_queues.h.
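
For reference, the helpers in include/net/netdev_queues.h pair the stop check in the xmit path with the wake check in the completion path and contain the memory barriers that the open-coded __netif_subqueue_stopped()/netif_start_subqueue() sequence lacks. A rough sketch of how they are usually wired up is below; the thresholds and my_tx_free_descs() are illustrative (the macros want a count of free descriptors, whereas rtase_tx_avail() as posted returns a bool), and the exact macro names and argument order should be double-checked against the header. Note that netif_txq_completed_wake() does BQL accounting internally, so the xmit side also has to report bytes with netdev_tx_sent_queue().

        /* xmit path, after the descriptors are filled and cur_idx advanced */
        struct netdev_queue *txq = netdev_get_tx_queue(dev, ring->index);

        netdev_tx_sent_queue(txq, skb->len);            /* BQL accounting for this packet */
        netif_txq_maybe_stop(txq,
                             my_tx_free_descs(ring),    /* free descriptors left (a count) */
                             MAX_SKB_FRAGS + 1,         /* stop when fewer than this remain */
                             MAX_SKB_FRAGS * 2);        /* restart threshold */

        /* completion path, after dirty_idx has been advanced */
        netif_txq_completed_wake(txq,
                                 pkts, bytes,           /* what this poll just reclaimed */
                                 my_tx_free_descs(ring),
                                 MAX_SKB_FRAGS * 2);
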
Jakub Kicinski Dec. 14, 2023, 5:24 p.m. UTC | #3
On Thu, 14 Dec 2023 13:00:29 +0000 JustinLai0215 wrote:
> > I don't see how this is called, the way you split the submission makes it a bit
> > hard to review, oh well. Anyway - if you pass the NAPI budget here - that's not
> > right, it may be 0, and you'd loop forever.
> > For Tx - you should try to reap some fixed number of packets, say 128, the
> > budget is for Rx, not for Tx.  
> 
> Even if the budget is 0, this function will not loop forever, it will just run all tx_left.
> Or what changes would you like us to make?

Ah, good point. It does seem a little accidental to me :S
In that case perhaps always consume all completed packets?
@budget should not constrain Tx completions directly, see:
https://www.kernel.org/doc/html/next/networking/napi.html
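
In other words, the usual shape of the poll function lets Tx cleanup run independently of @budget while only Rx work is bounded by it; a generic sketch with hypothetical helper names:

static int my_napi_poll(struct napi_struct *napi, int budget)
{
        struct my_int_vector *ivec = container_of(napi, struct my_int_vector, napi);
        int work_done;

        /* Reclaim completed Tx descriptors; @budget does not constrain this
         * (reap everything, or use a fixed driver-side cap).  The budget is
         * only forwarded so napi_consume_skb() knows the calling context.
         */
        my_tx_handler(ivec->tx_ring, budget);

        /* Rx work is what the budget actually limits. */
        work_done = my_rx_handler(ivec->rx_ring, budget);

        /* work_done < budget is false when budget == 0 (Tx-only poll), so
         * NAPI is never completed in that case; otherwise re-arm the IRQ.
         */
        if (work_done < budget && napi_complete_done(napi, work_done))
                my_enable_irq(ivec);

        return work_done;
}
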
Justin Lai Dec. 18, 2023, 10:27 a.m. UTC | #4
> 
> > +             if (ring->cur_idx != dirty_tx)
> > +                     rtase_w8(tp, RTASE_TPPOLL, BIT(ring->index));
> > +     }
> > +
> > +     return workdone;
> > +}
> 
> > +     /* multiqueues */
> > +     q_idx = skb_get_queue_mapping(skb);
> > +     ring = &tp->tx_ring[q_idx];
> 
> As Paolo pointed out elsewhere you seem to only support one queue.
> Remove this indirection, please, and always use queue 0, otherwise it's a bit
> confusing.

Hi Jakub,

This device supports Multi-Queue.
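
For context on the queue question: the queue index the stack stores in the skb is always below the count the driver registers with netif_set_real_num_tx_queues(), so the mapping in the xmit path only matters once more than one Tx queue is actually exposed to the stack. Roughly, with hypothetical names:

        /* probe: netdev allocated with alloc_etherdev_mq(sizeof(*tp), MY_MAX_TXQ) */
        ret = netif_set_real_num_tx_queues(dev, tp->num_tx_queues);
        if (ret < 0)
                return ret;

        /* xmit: the stack has already picked a queue in [0, real_num_tx_queues) */
        ring = &tp->tx_ring[skb_get_queue_mapping(skb)];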

Patch

diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c
index f6edb0f0a323..eee792ea4760 100644
--- a/drivers/net/ethernet/realtek/rtase/rtase_main.c
+++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c
@@ -251,6 +251,68 @@  static void rtase_mark_to_asic(union rx_desc *desc, u32 rx_buf_sz)
 		   cpu_to_le32(DESC_OWN | eor | rx_buf_sz));
 }
 
+static bool rtase_tx_avail(struct rtase_ring *ring)
+{
+	u32 avail_num = READ_ONCE(ring->dirty_idx) + NUM_DESC -
+			READ_ONCE(ring->cur_idx);
+
+	return avail_num > MAX_SKB_FRAGS;
+}
+
+static int tx_handler(struct rtase_ring *ring, int budget)
+{
+	const struct rtase_private *tp = ring->ivec->tp;
+	struct net_device *dev = tp->dev;
+	int workdone = 0;
+	u32 dirty_tx;
+	u32 tx_left;
+
+	dirty_tx = ring->dirty_idx;
+	tx_left = READ_ONCE(ring->cur_idx) - dirty_tx;
+
+	while (tx_left > 0) {
+		u32 entry = dirty_tx % NUM_DESC;
+		struct tx_desc *desc = ring->desc +
+				       sizeof(struct tx_desc) * entry;
+		u32 len = ring->mis.len[entry];
+		u32 status;
+
+		status = le32_to_cpu(desc->opts1);
+
+		if (status & DESC_OWN)
+			break;
+
+		rtase_unmap_tx_skb(tp->pdev, len, desc);
+		ring->mis.len[entry] = 0;
+		if (ring->skbuff[entry]) {
+			dev_consume_skb_any(ring->skbuff[entry]);
+			ring->skbuff[entry] = NULL;
+		}
+
+		dev->stats.tx_bytes += len;
+		dev->stats.tx_packets++;
+		dirty_tx++;
+		tx_left--;
+		workdone++;
+
+		if (workdone == budget)
+			break;
+	}
+
+	if (ring->dirty_idx != dirty_tx) {
+		WRITE_ONCE(ring->dirty_idx, dirty_tx);
+
+		if (__netif_subqueue_stopped(dev, ring->index) &&
+		    rtase_tx_avail(ring))
+			netif_start_subqueue(dev, ring->index);
+
+		if (ring->cur_idx != dirty_tx)
+			rtase_w8(tp, RTASE_TPPOLL, BIT(ring->index));
+	}
+
+	return workdone;
+}
+
 static void rtase_tx_desc_init(struct rtase_private *tp, u16 idx)
 {
 	struct rtase_ring *ring = &tp->tx_ring[idx];
@@ -995,6 +1057,229 @@  static int rtase_close(struct net_device *dev)
 	return 0;
 }
 
+static u32 rtase_tx_vlan_tag(const struct rtase_private *tp,
+			     const struct sk_buff *skb)
+{
+	return (skb_vlan_tag_present(skb)) ?
+		(TX_VLAN_TAG | swab16(skb_vlan_tag_get(skb))) : 0x00;
+}
+
+static u32 rtase_tx_csum(struct sk_buff *skb, const struct net_device *dev)
+{
+	u32 csum_cmd = 0;
+	u8 ip_protocol;
+
+	switch (vlan_get_protocol(skb)) {
+	case htons(ETH_P_IP):
+		csum_cmd = TX_IPCS_C;
+		ip_protocol = ip_hdr(skb)->protocol;
+		break;
+
+	case htons(ETH_P_IPV6):
+		csum_cmd = TX_IPV6F_C;
+		ip_protocol = ipv6_hdr(skb)->nexthdr;
+		break;
+
+	default:
+		ip_protocol = IPPROTO_RAW;
+		break;
+	}
+
+	if (ip_protocol == IPPROTO_TCP)
+		csum_cmd |= TX_TCPCS_C;
+	else if (ip_protocol == IPPROTO_UDP)
+		csum_cmd |= TX_UDPCS_C;
+	else
+		WARN_ON_ONCE(1);
+
+	csum_cmd |= u32_encode_bits(skb_transport_offset(skb), TCPHO_MASK);
+
+	return csum_cmd;
+}
+
+static int rtase_xmit_frags(struct rtase_ring *ring, struct sk_buff *skb,
+			    u32 opts1, u32 opts2)
+{
+	const struct skb_shared_info *info = skb_shinfo(skb);
+	const struct rtase_private *tp = ring->ivec->tp;
+	const u8 nr_frags = info->nr_frags;
+	struct tx_desc *txd = NULL;
+	u32 cur_frag, entry;
+
+	entry = ring->cur_idx;
+	for (cur_frag = 0; cur_frag < nr_frags; cur_frag++) {
+		const skb_frag_t *frag = &info->frags[cur_frag];
+		dma_addr_t mapping;
+		u32 status, len;
+		void *addr;
+
+		entry = (entry + 1) % NUM_DESC;
+
+		txd = ring->desc + sizeof(struct tx_desc) * entry;
+		len = skb_frag_size(frag);
+		addr = skb_frag_address(frag);
+		mapping = dma_map_single(&tp->pdev->dev, addr, len,
+					 DMA_TO_DEVICE);
+
+		if (unlikely(dma_mapping_error(&tp->pdev->dev, mapping))) {
+			if (unlikely(net_ratelimit()))
+				netdev_err(tp->dev,
+					   "Failed to map TX fragments DMA!\n");
+
+			goto err_out;
+		}
+
+		if (((entry + 1) % NUM_DESC) == 0)
+			status = (opts1 | len | RING_END);
+		else
+			status = opts1 | len;
+
+		if (cur_frag == (nr_frags - 1)) {
+			ring->skbuff[entry] = skb;
+			status |= TX_LAST_FRAG;
+		}
+
+		ring->mis.len[entry] = len;
+		txd->addr = cpu_to_le64(mapping);
+		txd->opts2 = cpu_to_le32(opts2);
+
+		/* make sure the operating fields have been updated */
+		wmb();
+		txd->opts1 = cpu_to_le32(status);
+	}
+
+	return cur_frag;
+
+err_out:
+	rtase_tx_clear_range(ring, ring->cur_idx + 1, cur_frag);
+	return -EIO;
+}
+
+static netdev_tx_t rtase_start_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct rtase_private *tp = netdev_priv(dev);
+	u32 q_idx, entry, len, opts1, opts2;
+	u32 mss = shinfo->gso_size;
+	struct rtase_ring *ring;
+	struct tx_desc *txd;
+	dma_addr_t mapping;
+	bool stop_queue;
+	int frags;
+
+	/* multiqueues */
+	q_idx = skb_get_queue_mapping(skb);
+	ring = &tp->tx_ring[q_idx];
+
+	if (unlikely(!rtase_tx_avail(ring))) {
+		if (net_ratelimit())
+			netdev_err(dev, "BUG! Tx Ring full when queue awake!\n");
+		goto err_stop;
+	}
+
+	entry = ring->cur_idx % NUM_DESC;
+	txd = ring->desc + sizeof(struct tx_desc) * entry;
+
+	opts1 = DESC_OWN;
+	opts2 = rtase_tx_vlan_tag(tp, skb);
+
+	/* tcp segmentation offload (or tcp large send) */
+	if (mss) {
+		if (shinfo->gso_type & SKB_GSO_TCPV4) {
+			opts1 |= GIANT_SEND_V4;
+		} else if (shinfo->gso_type & SKB_GSO_TCPV6) {
+			if (skb_cow_head(skb, 0))
+				goto err_dma_0;
+
+			tcp_v6_gso_csum_prep(skb);
+			opts1 |= GIANT_SEND_V6;
+		} else {
+			WARN_ON_ONCE(1);
+		}
+
+		opts1 |= u32_encode_bits(skb_transport_offset(skb), TCPHO_MASK);
+		opts2 |= u32_encode_bits(mss, MSS_MASK);
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		opts2 |= rtase_tx_csum(skb, dev);
+	}
+
+	frags = rtase_xmit_frags(ring, skb, opts1, opts2);
+	if (unlikely(frags < 0))
+		goto err_dma_0;
+
+	if (frags) {
+		len = skb_headlen(skb);
+		opts1 |= TX_FIRST_FRAG;
+	} else {
+		len = skb->len;
+		ring->skbuff[entry] = skb;
+		opts1 |= TX_FIRST_FRAG | TX_LAST_FRAG;
+	}
+
+	if (((entry + 1) % NUM_DESC) == 0)
+		opts1 |= (len | RING_END);
+	else
+		opts1 |= len;
+
+	mapping = dma_map_single(&tp->pdev->dev, skb->data, len,
+				 DMA_TO_DEVICE);
+
+	if (unlikely(dma_mapping_error(&tp->pdev->dev, mapping))) {
+		if (unlikely(net_ratelimit()))
+			netdev_err(dev, "Failed to map TX DMA!\n");
+
+		goto err_dma_1;
+	}
+
+	ring->mis.len[entry] = len;
+	txd->addr = cpu_to_le64(mapping);
+	txd->opts2 = cpu_to_le32(opts2);
+	txd->opts1 = cpu_to_le32(opts1 & ~DESC_OWN);
+
+	/* make sure the operating fields have been updated */
+	wmb();
+
+	txd->opts1 = cpu_to_le32(opts1);
+
+	skb_tx_timestamp(skb);
+
+	/* tx needs to see descriptor changes before updated cur_idx */
+	smp_wmb();
+
+	WRITE_ONCE(ring->cur_idx, ring->cur_idx + frags + 1);
+
+	stop_queue = !rtase_tx_avail(ring);
+	if (unlikely(stop_queue))
+		netif_stop_subqueue(dev, q_idx);
+
+	/* set polling bit */
+	rtase_w8(tp, RTASE_TPPOLL, BIT(ring->index));
+
+	if (unlikely(stop_queue)) {
+		/* make sure cur_idx and dirty_idx have been updated */
+		smp_rmb();
+		if (rtase_tx_avail(ring))
+			netif_start_subqueue(dev, q_idx);
+	}
+
+	return NETDEV_TX_OK;
+
+err_dma_1:
+	ring->skbuff[entry] = NULL;
+	rtase_tx_clear_range(ring, ring->cur_idx + 1, frags);
+
+err_dma_0:
+	dev->stats.tx_dropped++;
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+
+err_stop:
+	netif_stop_queue(dev);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_BUSY;
+}
+
 static void rtase_enable_eem_write(const struct rtase_private *tp)
 {
 	u8 val;
@@ -1046,6 +1331,7 @@  static void rtase_netpoll(struct net_device *dev)
 static const struct net_device_ops rtase_netdev_ops = {
 	.ndo_open = rtase_open,
 	.ndo_stop = rtase_close,
+	.ndo_start_xmit = rtase_start_xmit,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = rtase_netpoll,
 #endif