diff mbox series

[next-queue,v3,4/4] i40e: Add i40e_napi_poll tracepoint

Message ID 1665099838-94839-5-git-send-email-jdamato@fastly.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series i40e: Add an i40e_napi_poll tracepoint | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Joe Damato Oct. 6, 2022, 11:43 p.m. UTC
Add a tracepoint for i40e_napi_poll that allows users to get detailed
information about the amount of work done. This information can help users
better tune the correct NAPI parameters (like weight and budget), as well
as debug NIC settings like rx-usecs and tx-usecs, etc.

An example of the output from this tracepoint:

$ sudo perf trace -e i40e:i40e_napi_poll -a --call-graph=fp --libtraceevent_print

[..snip..]

388.258 :0/0 i40e:i40e_napi_poll(i40e_napi_poll on dev eth2 q i40e-eth2-TxRx-9 irq 346 irq_mask 00000000,00000000,00000000,00000000,00000000,00800000 curr_cpu 23 budget 64 bpr 64 rx_cleaned 28 tx_cleaned 0 rx_clean_complete 1 tx_clean_complete 1)
	i40e_napi_poll ([i40e])
	i40e_napi_poll ([i40e])
	__napi_poll ([kernel.kallsyms])
	net_rx_action ([kernel.kallsyms])
	__do_softirq ([kernel.kallsyms])
	common_interrupt ([kernel.kallsyms])
	asm_common_interrupt ([kernel.kallsyms])
	intel_idle_irq ([kernel.kallsyms])
	cpuidle_enter_state ([kernel.kallsyms])
	cpuidle_enter ([kernel.kallsyms])
	do_idle ([kernel.kallsyms])
	cpu_startup_entry ([kernel.kallsyms])
	[0x243fd8] ([kernel.kallsyms])
	secondary_startup_64_no_verify ([kernel.kallsyms])

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 drivers/net/ethernet/intel/i40e/i40e_trace.h | 49 ++++++++++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_txrx.c  |  3 ++
 2 files changed, 52 insertions(+)

Comments

Fijalkowski, Maciej Oct. 7, 2022, 8:18 a.m. UTC | #1
On Thu, Oct 06, 2022 at 04:43:58PM -0700, Joe Damato wrote:
> Add a tracepoint for i40e_napi_poll that allows users to get detailed
> information about the amount of work done. This information can help users
> better tune the correct NAPI parameters (like weight and budget), as well
> as debug NIC settings like rx-usecs and tx-usecs, etc.
> 
> An example of the output from this tracepoint:
> 
> $ sudo perf trace -e i40e:i40e_napi_poll -a --call-graph=fp --libtraceevent_print
> 
> [..snip..]
> 
> 388.258 :0/0 i40e:i40e_napi_poll(i40e_napi_poll on dev eth2 q
> i40e-eth2-TxRx-9 irq 346 irq_mask
> 00000000,00000000,00000000,00000000,00000000,00800000 curr_cpu 23 budget
> 64 bpr 64 rx_cleaned 28 tx_cleaned 0 rx_clean_complete 1
> tx_clean_complete 1)

So from AF_XDP POV I won't be using it as I would need some other
information. As I said, we don't work on NAPI budget but rather with the
free ring space and I don't get it here. To me, tx_cleaned is also quite
an incorrect name for the count of descriptors produced to the Tx ring. I
feel like it would be better to stub it for AF_XDP.

As Jesse said previously we probably can followup with AF_XDP specific
tracepoint with tx cleaned/tx transmitted/NAPI budget/AF_XDP budget (free
ring space) if we find the need for it.

That's my 0.02$, I'm not going to hold this set or whatever, I'll leave the
decision to Sridhar & Jesse.

> 	i40e_napi_poll ([i40e])
> 	i40e_napi_poll ([i40e])
> 	__napi_poll ([kernel.kallsyms])
> 	net_rx_action ([kernel.kallsyms])
> 	__do_softirq ([kernel.kallsyms])
> 	common_interrupt ([kernel.kallsyms])
> 	asm_common_interrupt ([kernel.kallsyms])
> 	intel_idle_irq ([kernel.kallsyms])
> 	cpuidle_enter_state ([kernel.kallsyms])
> 	cpuidle_enter ([kernel.kallsyms])
> 	do_idle ([kernel.kallsyms])
> 	cpu_startup_entry ([kernel.kallsyms])
> 	[0x243fd8] ([kernel.kallsyms])
> 	secondary_startup_64_no_verify ([kernel.kallsyms])
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_trace.h | 49 ++++++++++++++++++++++++++++
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c  |  3 ++
>  2 files changed, 52 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h
> index b5b1229..7d7c161 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_trace.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h
> @@ -55,6 +55,55 @@
>   * being built from shared code.
>   */
>  
> +#define NO_DEV "(i40e no_device)"
> +
> +TRACE_EVENT(i40e_napi_poll,
> +
> +	TP_PROTO(struct napi_struct *napi, struct i40e_q_vector *q, int budget,
> +		 int budget_per_ring, unsigned int rx_cleaned, unsigned int tx_cleaned,
> +		 bool rx_clean_complete, bool tx_clean_complete),
> +
> +	TP_ARGS(napi, q, budget, budget_per_ring, rx_cleaned, tx_cleaned,
> +		rx_clean_complete, tx_clean_complete),
> +
> +	TP_STRUCT__entry(
> +		__field(int, budget)
> +		__field(int, budget_per_ring)
> +		__field(unsigned int, rx_cleaned)
> +		__field(unsigned int, tx_cleaned)
> +		__field(int, rx_clean_complete)
> +		__field(int, tx_clean_complete)
> +		__field(int, irq_num)
> +		__field(int, curr_cpu)
> +		__string(qname, q->name)
> +		__string(dev_name, napi->dev ? napi->dev->name : NO_DEV)
> +		__bitmask(irq_affinity,	nr_cpumask_bits)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->budget = budget;
> +		__entry->budget_per_ring = budget_per_ring;
> +		__entry->rx_cleaned = rx_cleaned;
> +		__entry->tx_cleaned = tx_cleaned;
> +		__entry->rx_clean_complete = rx_clean_complete;
> +		__entry->tx_clean_complete = tx_clean_complete;
> +		__entry->irq_num = q->irq_num;
> +		__entry->curr_cpu = get_cpu();
> +		__assign_str(qname, q->name);
> +		__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
> +		__assign_bitmask(irq_affinity, cpumask_bits(&q->affinity_mask),
> +				 nr_cpumask_bits);
> +	),
> +
> +	TP_printk("i40e_napi_poll on dev %s q %s irq %d irq_mask %s curr_cpu %d "
> +		  "budget %d bpr %d rx_cleaned %lu tx_cleaned %lu "
> +		  "rx_clean_complete %d tx_clean_complete %d",
> +		__get_str(dev_name), __get_str(qname), __entry->irq_num,
> +		__get_bitmask(irq_affinity), __entry->curr_cpu, __entry->budget,
> +		__entry->budget_per_ring, __entry->rx_cleaned, __entry->tx_cleaned,
> +		__entry->rx_clean_complete, __entry->tx_clean_complete)
> +);
> +
>  /* Events related to a vsi & ring */
>  DECLARE_EVENT_CLASS(
>  	i40e_tx_template,
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index adf133b..fb9add8 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -2753,6 +2753,9 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
>  			clean_complete = rx_clean_complete = false;
>  	}
>  
> +	trace_i40e_napi_poll(napi, q_vector, budget, budget_per_ring, rx_cleaned,
> +			     tx_cleaned, rx_clean_complete, tx_clean_complete);
> +
>  	/* If work not completed, return budget and polling will return */
>  	if (!clean_complete) {
>  		int cpu_id = smp_processor_id();
> -- 
> 2.7.4
>
Joe Damato Oct. 7, 2022, 5:49 p.m. UTC | #2
On Fri, Oct 07, 2022 at 10:18:41AM +0200, Maciej Fijalkowski wrote:
> On Thu, Oct 06, 2022 at 04:43:58PM -0700, Joe Damato wrote:
> > Add a tracepoint for i40e_napi_poll that allows users to get detailed
> > information about the amount of work done. This information can help users
> > better tune the correct NAPI parameters (like weight and budget), as well
> > as debug NIC settings like rx-usecs and tx-usecs, etc.
> > 
> > An example of the output from this tracepoint:
> > 
> > $ sudo perf trace -e i40e:i40e_napi_poll -a --call-graph=fp --libtraceevent_print
> > 
> > [..snip..]
> > 
> > 388.258 :0/0 i40e:i40e_napi_poll(i40e_napi_poll on dev eth2 q
> > i40e-eth2-TxRx-9 irq 346 irq_mask
> > 00000000,00000000,00000000,00000000,00000000,00800000 curr_cpu 23 budget
> > 64 bpr 64 rx_cleaned 28 tx_cleaned 0 rx_clean_complete 1
> > tx_clean_complete 1)
> 
> So from AF_XDP POV I won't be using it as I would need some other
> information.
>
> As I said, we don't work on NAPI budget but rather with the
> free ring space and I don't get it here. tx_cleaned is also quite
> incorrect name to me for count of produced descriptors to Tx ring. I feel
> like it would be better to stub it for AF_XDP.
>
> As Jesse said previously we probably can followup with AF_XDP specific
> tracepoint with tx cleaned/tx transmitted/NAPI budget/AF_XDP budget (free
> ring space) if we find the need for it.
> 
> That's my 0.02$, I'm not going to hold this set or whatever, I'll leave the
> decision to Sridhar & Jesse.

I'll send the other patchset I've written and tested as an RFC; it doesn't
touch anything in XDP at all and adds the tracepoint in i40e_napi_poll only
if XDP is not enabled.

The code in that branch for i40e_napi_poll looks like this:

+	if (!i40e_enabled_xdp_vsi(vsi))
+		trace_i40e_napi_poll(napi, q_vector, budget, budget_per_ring, rx_cleaned,
+				     tx_cleaned, rx_clean_complete, tx_clean_complete);

The XDP functions are not modified to take any out parameters in that
branch.

In that case, anyone who cares about XDP (when I think about XDP, I feel
nothing, so that probably won't be me) can go back through and add their
own XDP related tracepoint as the 'else' branch and modify all the xdp
related functions and add (very precisely named) out parameters.

As far as I'm concerned: I submit this code simply because it's been very
useful for me to debug i40e performance and to tune settings and I thought
it might be helpful for others.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h
index b5b1229..7d7c161 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_trace.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h
@@ -55,6 +55,55 @@ 
  * being built from shared code.
  */
 
+#define NO_DEV "(i40e no_device)"
+
+TRACE_EVENT(i40e_napi_poll,
+	/* Emitted from i40e_napi_poll(): budgets, cleaned counts, IRQ and CPU. */
+	TP_PROTO(struct napi_struct *napi, struct i40e_q_vector *q, int budget,
+		 int budget_per_ring, unsigned int rx_cleaned, unsigned int tx_cleaned,
+		 bool rx_clean_complete, bool tx_clean_complete),
+
+	TP_ARGS(napi, q, budget, budget_per_ring, rx_cleaned, tx_cleaned,
+		rx_clean_complete, tx_clean_complete),
+
+	TP_STRUCT__entry(
+		__field(int, budget)
+		__field(int, budget_per_ring)
+		__field(unsigned int, rx_cleaned)
+		__field(unsigned int, tx_cleaned)
+		__field(int, rx_clean_complete)
+		__field(int, tx_clean_complete)
+		__field(int, irq_num)
+		__field(int, curr_cpu)
+		__string(qname, q->name)
+		__string(dev_name, napi->dev ? napi->dev->name : NO_DEV)
+		__bitmask(irq_affinity, nr_cpumask_bits)
+	),
+	/* smp_processor_id(), not get_cpu(): nothing here would call put_cpu(). */
+	TP_fast_assign(
+		__entry->budget = budget;
+		__entry->budget_per_ring = budget_per_ring;
+		__entry->rx_cleaned = rx_cleaned;
+		__entry->tx_cleaned = tx_cleaned;
+		__entry->rx_clean_complete = rx_clean_complete;
+		__entry->tx_clean_complete = tx_clean_complete;
+		__entry->irq_num = q->irq_num;
+		__entry->curr_cpu = smp_processor_id();
+		__assign_str(qname, q->name);
+		__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+		__assign_bitmask(irq_affinity, cpumask_bits(&q->affinity_mask),
+				 nr_cpumask_bits);
+	),
+	/* rx_cleaned/tx_cleaned are unsigned int, so print them with %u. */
+	TP_printk("i40e_napi_poll on dev %s q %s irq %d irq_mask %s curr_cpu %d "
+		  "budget %d bpr %d rx_cleaned %u tx_cleaned %u "
+		  "rx_clean_complete %d tx_clean_complete %d",
+		__get_str(dev_name), __get_str(qname), __entry->irq_num,
+		__get_bitmask(irq_affinity), __entry->curr_cpu, __entry->budget,
+		__entry->budget_per_ring, __entry->rx_cleaned, __entry->tx_cleaned,
+		__entry->rx_clean_complete, __entry->tx_clean_complete)
+);
+
 /* Events related to a vsi & ring */
 DECLARE_EVENT_CLASS(
 	i40e_tx_template,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index adf133b..fb9add8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2753,6 +2753,9 @@  int i40e_napi_poll(struct napi_struct *napi, int budget)
 			clean_complete = rx_clean_complete = false;
 	}
 
+	trace_i40e_napi_poll(napi, q_vector, budget, budget_per_ring, rx_cleaned,
+			     tx_cleaned, rx_clean_complete, tx_clean_complete);
+
 	/* If work not completed, return budget and polling will return */
 	if (!clean_complete) {
 		int cpu_id = smp_processor_id();