
[net-next,2/3] net: qualcomm: rmnet: add tx packets aggregation

Message ID 20221109180249.4721-3-dnlplm@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series add tx packets aggregation to ethtool and rmnet

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 4344 this patch: 4347
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang fail Errors and warnings before: 1046 this patch: 1048
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 4530 this patch: 4533
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Daniele Palmas Nov. 9, 2022, 6:02 p.m. UTC
Bidirectional TCP throughput tests through iperf with low-cat
Thread-x based modems showed performance issues both in tx
and rx.

The Windows driver does not show this issue: inspecting USB
packets revealed that the only notable change is the driver
enabling tx packets aggregation.

Tx packets aggregation, by default disabled, requires flag
RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).

The maximum number of aggregated packets and the maximum aggregated
size are by default set to reasonably low values in order to support
the majority of modems.

This implementation is based on patches available in Code Aurora
repositories (msm kernel) whose main authors are

Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Sean Tranchetti <stranche@codeaurora.org>

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
---
 .../ethernet/qualcomm/rmnet/rmnet_config.c    |   5 +
 .../ethernet/qualcomm/rmnet/rmnet_config.h    |  19 ++
 .../ethernet/qualcomm/rmnet/rmnet_handlers.c  |  25 ++-
 .../net/ethernet/qualcomm/rmnet/rmnet_map.h   |   7 +
 .../ethernet/qualcomm/rmnet/rmnet_map_data.c  | 196 ++++++++++++++++++
 include/uapi/linux/if_link.h                  |   1 +
 6 files changed, 251 insertions(+), 2 deletions(-)

Comments

Alexander Lobakin Nov. 10, 2022, 5:32 p.m. UTC | #1
From: Daniele Palmas <dnlplm@gmail.com>
Date: Wed,  9 Nov 2022 19:02:48 +0100

> Bidirectional TCP throughput tests through iperf with low-cat
> Thread-x based modems showed performance issues both in tx
> and rx.
> 
> The Windows driver does not show this issue: inspecting USB
> packets revealed that the only notable change is the driver
> enabling tx packets aggregation.
> 
> Tx packets aggregation, by default disabled, requires flag
> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).
> 
> The maximum number of aggregated packets and the maximum aggregated
> size are by default set to reasonably low values in order to support
> the majority of modems.
> 
> This implementation is based on patches available in Code Aurora
> repositories (msm kernel) whose main authors are
> 
> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> Sean Tranchetti <stranche@codeaurora.org>
> 
> Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
> ---
>  .../ethernet/qualcomm/rmnet/rmnet_config.c    |   5 +
>  .../ethernet/qualcomm/rmnet/rmnet_config.h    |  19 ++
>  .../ethernet/qualcomm/rmnet/rmnet_handlers.c  |  25 ++-
>  .../net/ethernet/qualcomm/rmnet/rmnet_map.h   |   7 +
>  .../ethernet/qualcomm/rmnet/rmnet_map_data.c  | 196 ++++++++++++++++++
>  include/uapi/linux/if_link.h                  |   1 +
>  6 files changed, 251 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> index 27b1663c476e..39d24e07f306 100644
> --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> @@ -12,6 +12,7 @@
>  #include "rmnet_handlers.h"
>  #include "rmnet_vnd.h"
>  #include "rmnet_private.h"
> +#include "rmnet_map.h"
>  
>  /* Local Definitions and Declarations */
>  
> @@ -39,6 +40,8 @@ static int rmnet_unregister_real_device(struct net_device *real_dev)
>  	if (port->nr_rmnet_devs)
>  		return -EINVAL;
>  
> +	rmnet_map_tx_aggregate_exit(port);
> +
>  	netdev_rx_handler_unregister(real_dev);
>  
>  	kfree(port);
> @@ -79,6 +82,8 @@ static int rmnet_register_real_device(struct net_device *real_dev,
>  	for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++)
>  		INIT_HLIST_HEAD(&port->muxed_ep[entry]);
>  
> +	rmnet_map_tx_aggregate_init(port);
> +
>  	netdev_dbg(real_dev, "registered with rmnet\n");
>  	return 0;
>  }
> diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> index 3d3cba56c516..d341df78e411 100644
> --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> @@ -6,6 +6,7 @@
>   */
>  
>  #include <linux/skbuff.h>
> +#include <linux/time.h>
>  #include <net/gro_cells.h>
>  
>  #ifndef _RMNET_CONFIG_H_
> @@ -19,6 +20,12 @@ struct rmnet_endpoint {
>  	struct hlist_node hlnode;
>  };
>  
> +struct rmnet_egress_agg_params {
> +	u16 agg_size;

skbs can now be way longer than 64 Kb.

> +	u16 agg_count;
> +	u64 agg_time_nsec;
> +};
> +
>  /* One instance of this structure is instantiated for each real_dev associated
>   * with rmnet.
>   */

[...]

> @@ -518,3 +519,198 @@ int rmnet_map_process_next_hdr_packet(struct sk_buff *skb,
>  
>  	return 0;
>  }
> +
> +long rmnet_agg_bypass_time __read_mostly = 10000L * NSEC_PER_USEC;

Why __read_mostly if you don't change it anywhere? Could be const.
Why here if you use it only in one file? Could be static there.
Why variable if it could be a definition?
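
Something like this, for instance (untested; the name is just a
suggestion):

	#define RMNET_AGG_BYPASS_TIME_NS	(10000L * NSEC_PER_USEC)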

> +
> +bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
> +{
> +	bool is_icmp = 0;
> +
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		struct iphdr *ip4h = ip_hdr(skb);

[...]

> +static void rmnet_map_flush_tx_packet_work(struct work_struct *work)
> +{
> +	struct sk_buff *skb = NULL;
> +	struct rmnet_port *port;
> +	unsigned long flags;
> +
> +	port = container_of(work, struct rmnet_port, agg_wq);
> +
> +	spin_lock_irqsave(&port->agg_lock, flags);

I don't see aggregation fields used in any hardware interrupt
handlers; are you sure you need _irq*(), not _bh()?
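
If nothing touches these fields from hardirq context, a _bh() sketch
would be:

	spin_lock_bh(&port->agg_lock);
	/* ... */
	spin_unlock_bh(&port->agg_lock);

(no flags variable needed, and interrupts stay unmasked)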

> +	if (likely(port->agg_state == -EINPROGRESS)) {
> +		/* Buffer may have already been shipped out */
> +		if (likely(port->agg_skb)) {
> +			skb = port->agg_skb;
> +			reset_aggr_params(port);
> +		}
> +		port->agg_state = 0;
> +	}
> +
> +	spin_unlock_irqrestore(&port->agg_lock, flags);
> +	if (skb)
> +		dev_queue_xmit(skb);
> +}
> +
> +enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
> +{
> +	struct rmnet_port *port;
> +
> +	port = container_of(t, struct rmnet_port, hrtimer);
> +
> +	schedule_work(&port->agg_wq);
> +
> +	return HRTIMER_NORESTART;
> +}
> +
> +void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port,
> +			    struct net_device *orig_dev)
> +{
> +	struct timespec64 diff, last;
> +	int size = 0;

RCT style?

> +	struct sk_buff *agg_skb;
> +	unsigned long flags;
> +
> +new_packet:
> +	spin_lock_irqsave(&port->agg_lock, flags);
> +	memcpy(&last, &port->agg_last, sizeof(struct timespec64));
> +	ktime_get_real_ts64(&port->agg_last);
> +
> +	if (!port->agg_skb) {
> +		/* Check to see if we should agg first. If the traffic is very
> +		 * sparse, don't aggregate.
> +		 */
> +		diff = timespec64_sub(port->agg_last, last);
> +		size = port->egress_agg_params.agg_size - skb->len;
> +
> +		if (size < 0) {
> +			struct rmnet_priv *priv;
> +
> +			/* dropped */

So if a packet is smaller than the aggregation threshold, you just
drop it? Why, if you could just let it go the "standard" way, like
ICMP does?

> +			dev_kfree_skb_any(skb);
> +			spin_unlock_irqrestore(&port->agg_lock, flags);

You could release this lock a line above, so that
dev_kfree_skb_any() wouldn't be called in the HWIRQ context and
postpone skb freeing via the softnet queue.
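
I.e. a sketch:

			spin_unlock_irqrestore(&port->agg_lock, flags);
			/* freed outside the critical section now */
			dev_kfree_skb_any(skb);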

> +			priv = netdev_priv(orig_dev);
> +			this_cpu_inc(priv->pcpu_stats->stats.tx_drops);
> +
> +			return;
> +		}
> +
> +		if (diff.tv_sec > 0 || diff.tv_nsec > rmnet_agg_bypass_time ||
> +		    size == 0) {
> +			spin_unlock_irqrestore(&port->agg_lock, flags);
> +			skb->protocol = htons(ETH_P_MAP);
> +			dev_queue_xmit(skb);
> +			return;
> +		}
> +
> +		port->agg_skb = skb_copy_expand(skb, 0, size, GFP_ATOMIC);

You could use skb_cow_head(skb, 0) and skip allocating a new skb if
the current one is writable, which usually is the case.
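
A rough, untested sketch of that (reusing @skb in place also means not
freeing it further down):

		if (skb_tailroom(skb) >= size && !skb_cow_head(skb, 0))
			port->agg_skb = skb;
		else
			port->agg_skb = skb_copy_expand(skb, 0, size,
							GFP_ATOMIC);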

> +		if (!port->agg_skb) {
> +			reset_aggr_params(port);
> +			spin_unlock_irqrestore(&port->agg_lock, flags);
> +			skb->protocol = htons(ETH_P_MAP);
> +			dev_queue_xmit(skb);
> +			return;
> +		}
> +		port->agg_skb->protocol = htons(ETH_P_MAP);
> +		port->agg_count = 1;
> +		ktime_get_real_ts64(&port->agg_time);
> +		dev_kfree_skb_any(skb);
> +		goto schedule;
> +	}
> +	diff = timespec64_sub(port->agg_last, port->agg_time);
> +	size = port->egress_agg_params.agg_size - port->agg_skb->len;
> +
> +	if (skb->len > size ||
> +	    diff.tv_sec > 0 || diff.tv_nsec > port->egress_agg_params.agg_time_nsec) {
> +		agg_skb = port->agg_skb;
> +		reset_aggr_params(port);
> +		spin_unlock_irqrestore(&port->agg_lock, flags);
> +		hrtimer_cancel(&port->hrtimer);
> +		dev_queue_xmit(agg_skb);
> +		goto new_packet;
> +	}
> +
> +	skb_put_data(port->agg_skb, skb->data, skb->len);

IIUC, RMNet netdevs support %NETIF_F_SG. Which means you could just
attach skb data as frags to the first skb in the aggregation
session instead of copying the data all the time.
...or even add %NETIF_F_FRAGLIST handling, that would save even more
-- just use skb->frag_list once you run out of skb_shinfo()->frags.
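
An untested fraglist sketch (::agg_tail would be a new field in struct
rmnet_port tracking the last chained skb, and @skb must not be freed
afterwards, of course):

	if (!skb_shinfo(port->agg_skb)->frag_list)
		skb_shinfo(port->agg_skb)->frag_list = skb;
	else
		port->agg_tail->next = skb;
	port->agg_tail = skb;
	port->agg_skb->len += skb->len;
	port->agg_skb->data_len += skb->len;
	port->agg_skb->truesize += skb->truesize;

No copies, and the whole train gets freed together with the head skb.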

> +	port->agg_count++;
> +	dev_kfree_skb_any(skb);
> +
> +	if (port->agg_count == port->egress_agg_params.agg_count ||
> +	    port->agg_skb->len == port->egress_agg_params.agg_size) {

I think ::agg_count and ::agg_size are the thresholds, so the
comparison should be >= I guess (especially ::agg_size which gets
increased by a random value each time, not by 1)?
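
I.e.:

	if (port->agg_count >= port->egress_agg_params.agg_count ||
	    port->agg_skb->len >= port->egress_agg_params.agg_size) {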

> +		agg_skb = port->agg_skb;
> +		reset_aggr_params(port);
> +		spin_unlock_irqrestore(&port->agg_lock, flags);
> +		hrtimer_cancel(&port->hrtimer);
> +		dev_queue_xmit(agg_skb);
> +		return;
> +	}
> +
> +schedule:
> +	if (!hrtimer_active(&port->hrtimer) && port->agg_state != -EINPROGRESS) {
> +		port->agg_state = -EINPROGRESS;
> +		hrtimer_start(&port->hrtimer,
> +			      ns_to_ktime(port->egress_agg_params.agg_time_nsec),
> +			      HRTIMER_MODE_REL);
> +	}
> +	spin_unlock_irqrestore(&port->agg_lock, flags);
> +}
> +
> +void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size,
> +				    u16 count, u32 time)
> +{
> +	unsigned long irq_flags;
> +
> +	spin_lock_irqsave(&port->agg_lock, irq_flags);
> +	port->egress_agg_params.agg_size = size;
> +	port->egress_agg_params.agg_count = count;
> +	port->egress_agg_params.agg_time_nsec = time * NSEC_PER_USEC;
> +	spin_unlock_irqrestore(&port->agg_lock, irq_flags);
> +}
> +
> +void rmnet_map_tx_aggregate_init(struct rmnet_port *port)
> +{
> +	hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	port->hrtimer.function = rmnet_map_flush_tx_packet_queue;
> +	spin_lock_init(&port->agg_lock);
> +	rmnet_map_update_ul_agg_config(port, 4096, 16, 800);
> +	INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work);
> +}
> +
> +void rmnet_map_tx_aggregate_exit(struct rmnet_port *port)
> +{
> +	unsigned long flags;
> +
> +	hrtimer_cancel(&port->hrtimer);
> +	cancel_work_sync(&port->agg_wq);
> +
> +	spin_lock_irqsave(&port->agg_lock, flags);
> +	if (port->agg_state == -EINPROGRESS) {
> +		if (port->agg_skb) {
> +			kfree_skb(port->agg_skb);
> +			reset_aggr_params(port);
> +		}
> +
> +		port->agg_state = 0;
> +	}
> +
> +	spin_unlock_irqrestore(&port->agg_lock, flags);
> +}

Do I get the whole logics correctly, you allocate a new big skb and
just copy several frames into it, then send as one chunk once its
size reaches the threshold? Plus linearize every skb to be able to
do that... That's too much of overhead I'd say, just handle S/G and
fraglists and make long trains of frags from them without copying
anything? Also BQL/DQL already does some sort of aggregation via
::xmit_more, doesn't it? Do you have any performance numbers?

> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index 5e7a1041df3a..09a30e2b29b1 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -1351,6 +1351,7 @@ enum {
>  #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
>  #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
>  #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
> +#define RMNET_FLAGS_EGRESS_AGGREGATION            (1U << 6)

But you could rely on the aggregation parameters passed via Ethtool
to decide whether to enable aggregation or not. If any of them is 0,
it means the aggregation needs to be disabled.
Otherwise, to enable it you need to use 2 utilities: the one that
creates RMNet devices at first and Ethtool after, isn't it too
complicated for no reason?

>  
>  enum {
>  	IFLA_RMNET_UNSPEC,
> -- 
> 2.37.1

Thanks,
Olek
kernel test robot Nov. 10, 2022, 7:09 p.m. UTC | #2
Hi Daniele,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Daniele-Palmas/add-tx-packets-aggregation-to-ethtool-and-rmnet/20221110-021152
patch link:    https://lore.kernel.org/r/20221109180249.4721-3-dnlplm%40gmail.com
patch subject: [PATCH net-next 2/3] net: qualcomm: rmnet: add tx packets aggregation
config: ia64-allyesconfig
compiler: ia64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/ab24bdaf594299d926240cfc1b8e78fa7e9af3b4
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Daniele-Palmas/add-tx-packets-aggregation-to-ethtool-and-rmnet/20221110-021152
        git checkout ab24bdaf594299d926240cfc1b8e78fa7e9af3b4
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=ia64 SHELL=/bin/bash drivers/net/ethernet/qualcomm/rmnet/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:575:22: warning: no previous prototype for 'rmnet_map_flush_tx_packet_queue' [-Wmissing-prototypes]
     575 | enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
         |                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +/rmnet_map_flush_tx_packet_queue +575 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c

   574	
 > 575	enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
   576	{
   577		struct rmnet_port *port;
   578	
   579		port = container_of(t, struct rmnet_port, hrtimer);
   580	
   581		schedule_work(&port->agg_wq);
   582	
   583		return HRTIMER_NORESTART;
   584	}
   585
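
A note on this warning: the function is referenced from another file,
so one way to fix it would be adding the prototype to rmnet_map.h,
e.g.:

	enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t);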
Subash Abhinov Kasiviswanathan (KS) Nov. 11, 2022, 1:17 a.m. UTC | #3
On 11/10/2022 10:32 AM, Alexander Lobakin wrote:
> From: Daniele Palmas <dnlplm@gmail.com>
> Date: Wed,  9 Nov 2022 19:02:48 +0100
> 
>> Bidirectional TCP throughput tests through iperf with low-cat
>> Thread-x based modems showed performance issues both in tx
>> and rx.
>>
>> The Windows driver does not show this issue: inspecting USB
>> packets revealed that the only notable change is the driver
>> enabling tx packets aggregation.
>>
>> Tx packets aggregation, by default disabled, requires flag
>> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).
>>
>> The maximum number of aggregated packets and the maximum aggregated
>> size are by default set to reasonably low values in order to support
>> the majority of modems.
>>
>> This implementation is based on patches available in Code Aurora
>> repositories (msm kernel) whose main authors are
>>
>> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
>> Sean Tranchetti <stranche@codeaurora.org>
>>
>> Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
>> ---
>>   
>> +struct rmnet_egress_agg_params {
>> +	u16 agg_size;
> 
> skbs can now be way longer than 64 Kb.
> 

rmnet devices generally use a standard MTU (around 1500 bytes).
Would it still be possible for >64 KB skbs to be generated, given that
no relevant hw_features flag is set for large transmit offloads?
Alternatively, are you referring to explicit injection of packets, say
via packet sockets?

>> +	u16 agg_count;
>> +	u64 agg_time_nsec;
>> +};
>> +
> Do I get the whole logics correctly, you allocate a new big skb and
> just copy several frames into it, then send as one chunk once its
> size reaches the threshold? Plus linearize every skb to be able to
> do that... That's too much of overhead I'd say, just handle S/G and
> fraglists and make long trains of frags from them without copying
> anything? Also BQL/DQL already does some sort of aggregation via
> ::xmit_more, doesn't it? Do you have any performance numbers?

The difference here is that hardware would use a single descriptor for 
aggregation vs multiple descriptors for scatter gather.

I wonder if this issue is related to pacing though.
Daniele, perhaps you can try this hack without enabling EGRESS 
AGGREGATION and check if you are able to reach the same level of 
performance for your scenario.

--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb)
         struct rmnet_priv *priv;
         u8 mux_id;

-       sk_pacing_shift_update(skb->sk, 8);
+       skb_orphan(skb);

         orig_dev = skb->dev;
         priv = netdev_priv(orig_dev);

> 
>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
>> index 5e7a1041df3a..09a30e2b29b1 100644
>> --- a/include/uapi/linux/if_link.h
>> +++ b/include/uapi/linux/if_link.h
>> @@ -1351,6 +1351,7 @@ enum {
>>   #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
>>   #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
>>   #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
>> +#define RMNET_FLAGS_EGRESS_AGGREGATION            (1U << 6)
> 
> But you could rely on the aggregation parameters passed via Ethtool
> to decide whether to enable aggregation or not. If any of them is 0,
> it means the aggregation needs to be disabled.
> Otherwise, to enable it you need to use 2 utilities: the one that
> creates RMNet devices at first and Ethtool after, isn't it too
> complicated for no reason?

Yes, the EGRESS AGGREGATION parameters can be added as part of the rmnet 
netlink policies.

> 
>>   
>>   enum {
>>   	IFLA_RMNET_UNSPEC,
>> -- 
>> 2.37.1
> 
> Thanks,
> Olek
Jakub Kicinski Nov. 11, 2022, 5:11 p.m. UTC | #4
On Thu, 10 Nov 2022 18:17:09 -0700 Subash Abhinov Kasiviswanathan (KS)
wrote:
> The difference here is that hardware would use a single descriptor for 
> aggregation vs multiple descriptors for scatter gather.
> 
> I wonder if this issue is related to pacing though.
> Daniele, perhaps you can try this hack without enabling EGRESS 
> AGGREGATION and check if you are able to reach the same level of 
> performance for your scenario.
> 
> --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb)
>          struct rmnet_priv *priv;
>          u8 mux_id;
> 
> -       sk_pacing_shift_update(skb->sk, 8);
> +       skb_orphan(skb);
> 
>          orig_dev = skb->dev;
>          priv = netdev_priv(orig_dev);

The pacing shift update is much cleaner than orphaning packets, IMHO.
And the aggregation should hold onto the original skbs (a list?) and
free them once the aggregate was transmitted.
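
A rough approximation of that idea (untested; agg_list would be a new
struct sk_buff_head in struct rmnet_port, and this releases the
segments at flush time rather than on actual completion):

	/* while aggregating, queue instead of copying right away */
	__skb_queue_tail(&port->agg_list, skb);

	/* at flush time, copy and release in one place */
	while ((seg = __skb_dequeue(&port->agg_list))) {
		skb_put_data(agg_skb, seg->data, seg->len);
		dev_consume_skb_any(seg);
	}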
Jakub Kicinski Nov. 11, 2022, 5:14 p.m. UTC | #5
On Wed,  9 Nov 2022 19:02:48 +0100 Daniele Palmas wrote:
> +bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
> +{
> +	bool is_icmp = 0;
> +
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		struct iphdr *ip4h = ip_hdr(skb);
> +
> +		if (ip4h->protocol == IPPROTO_ICMP)
> +			is_icmp = true;
> +	} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +		unsigned int icmp_offset = 0;
> +
> +		if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6)
> +			is_icmp = true;
> +	}
> +
> +	return is_icmp;
> +}

Why this? I don't see it mentioned in the commit message or any code
comment.
Daniele Palmas Nov. 11, 2022, 5:23 p.m. UTC | #6
Hi Alexander,

Il giorno gio 10 nov 2022 alle ore 18:35 Alexander Lobakin
<alexandr.lobakin@intel.com> ha scritto:
>
> From: Daniele Palmas <dnlplm@gmail.com>
> Date: Wed,  9 Nov 2022 19:02:48 +0100
>
> > Bidirectional TCP throughput tests through iperf with low-cat
> > Thread-x based modems showed performance issues both in tx
> > and rx.
> >
> > The Windows driver does not show this issue: inspecting USB
> > packets revealed that the only notable change is the driver
> > enabling tx packets aggregation.
> >
> > Tx packets aggregation, by default disabled, requires flag
> > RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).
> >
> > The maximum number of aggregated packets and the maximum aggregated
> > size are by default set to reasonably low values in order to support
> > the majority of modems.
> >
> > This implementation is based on patches available in Code Aurora
> > repositories (msm kernel) whose main authors are
> >
> > Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> > Sean Tranchetti <stranche@codeaurora.org>
> >
> > Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
> > ---
> >  .../ethernet/qualcomm/rmnet/rmnet_config.c    |   5 +
> >  .../ethernet/qualcomm/rmnet/rmnet_config.h    |  19 ++
> >  .../ethernet/qualcomm/rmnet/rmnet_handlers.c  |  25 ++-
> >  .../net/ethernet/qualcomm/rmnet/rmnet_map.h   |   7 +
> >  .../ethernet/qualcomm/rmnet/rmnet_map_data.c  | 196 ++++++++++++++++++
> >  include/uapi/linux/if_link.h                  |   1 +
> >  6 files changed, 251 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> > index 27b1663c476e..39d24e07f306 100644
> > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> > @@ -12,6 +12,7 @@
> >  #include "rmnet_handlers.h"
> >  #include "rmnet_vnd.h"
> >  #include "rmnet_private.h"
> > +#include "rmnet_map.h"
> >
> >  /* Local Definitions and Declarations */
> >
> > @@ -39,6 +40,8 @@ static int rmnet_unregister_real_device(struct net_device *real_dev)
> >       if (port->nr_rmnet_devs)
> >               return -EINVAL;
> >
> > +     rmnet_map_tx_aggregate_exit(port);
> > +
> >       netdev_rx_handler_unregister(real_dev);
> >
> >       kfree(port);
> > @@ -79,6 +82,8 @@ static int rmnet_register_real_device(struct net_device *real_dev,
> >       for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++)
> >               INIT_HLIST_HEAD(&port->muxed_ep[entry]);
> >
> > +     rmnet_map_tx_aggregate_init(port);
> > +
> >       netdev_dbg(real_dev, "registered with rmnet\n");
> >       return 0;
> >  }
> > diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> > index 3d3cba56c516..d341df78e411 100644
> > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
> > @@ -6,6 +6,7 @@
> >   */
> >
> >  #include <linux/skbuff.h>
> > +#include <linux/time.h>
> >  #include <net/gro_cells.h>
> >
> >  #ifndef _RMNET_CONFIG_H_
> > @@ -19,6 +20,12 @@ struct rmnet_endpoint {
> >       struct hlist_node hlnode;
> >  };
> >
> > +struct rmnet_egress_agg_params {
> > +     u16 agg_size;
>
> skbs can now be way longer than 64 Kb.
>

OK, I can use u32.

For further information, I would like to explain where u16 is coming from.

The value of agg_size is returned by the modem through a QMI request
(WDA Set Data Format): so far, all the modems I've been able to test
have less than 64 KB as the maximum size for the aggregated packets
block, so I thought u16 was enough.

But, to be honest, I can't exclude that there are modems where this
value is > 64 KB, so I think it makes sense to use u32.

> > +     u16 agg_count;
> > +     u64 agg_time_nsec;
> > +};
> > +
> >  /* One instance of this structure is instantiated for each real_dev associated
> >   * with rmnet.
> >   */
>
> [...]
>
> > @@ -518,3 +519,198 @@ int rmnet_map_process_next_hdr_packet(struct sk_buff *skb,
> >
> >       return 0;
> >  }
> > +
> > +long rmnet_agg_bypass_time __read_mostly = 10000L * NSEC_PER_USEC;
>
> Why __read_mostly if you don't change it anywhere? Could be const.
> Why here if you use it only in one file? Could be static there.
> Why variable if it could be a definition?
>

All your remarks make sense.

> > +
> > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
> > +{
> > +     bool is_icmp = 0;
> > +
> > +     if (skb->protocol == htons(ETH_P_IP)) {
> > +             struct iphdr *ip4h = ip_hdr(skb);
>
> [...]
>
> > +static void rmnet_map_flush_tx_packet_work(struct work_struct *work)
> > +{
> > +     struct sk_buff *skb = NULL;
> > +     struct rmnet_port *port;
> > +     unsigned long flags;
> > +
> > +     port = container_of(work, struct rmnet_port, agg_wq);
> > +
> > +     spin_lock_irqsave(&port->agg_lock, flags);
>
> I don't see aggregation fields used in any hardware interrupt
> handlers, are you sure you need _irq*(), not _bh()?
>

I think you are right.

> > +     if (likely(port->agg_state == -EINPROGRESS)) {
> > +             /* Buffer may have already been shipped out */
> > +             if (likely(port->agg_skb)) {
> > +                     skb = port->agg_skb;
> > +                     reset_aggr_params(port);
> > +             }
> > +             port->agg_state = 0;
> > +     }
> > +
> > +     spin_unlock_irqrestore(&port->agg_lock, flags);
> > +     if (skb)
> > +             dev_queue_xmit(skb);
> > +}
> > +
> > +enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
> > +{
> > +     struct rmnet_port *port;
> > +
> > +     port = container_of(t, struct rmnet_port, hrtimer);
> > +
> > +     schedule_work(&port->agg_wq);
> > +
> > +     return HRTIMER_NORESTART;
> > +}
> > +
> > +void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port,
> > +                         struct net_device *orig_dev)
> > +{
> > +     struct timespec64 diff, last;
> > +     int size = 0;
>
> RCT style?
>

Ack.

> > +     struct sk_buff *agg_skb;
> > +     unsigned long flags;
> > +
> > +new_packet:
> > +     spin_lock_irqsave(&port->agg_lock, flags);
> > +     memcpy(&last, &port->agg_last, sizeof(struct timespec64));
> > +     ktime_get_real_ts64(&port->agg_last);
> > +
> > +     if (!port->agg_skb) {
> > +             /* Check to see if we should agg first. If the traffic is very
> > +              * sparse, don't aggregate.
> > +              */
> > +             diff = timespec64_sub(port->agg_last, last);
> > +             size = port->egress_agg_params.agg_size - skb->len;
> > +
> > +             if (size < 0) {
> > +                     struct rmnet_priv *priv;
> > +
> > +                     /* dropped */
>
> So if a packet is smaller than the aggregation threshold, you just
> drop it? Why, if you could just let it go the "standard" way, like
> ICMP does?
>

If skb->len is bigger than the maximum possible aggregated block size,
then the packet is dropped: this is because with some modems, sending
a block that is bigger than the maximum supported size causes the
modem to crash.

> > +                     dev_kfree_skb_any(skb);
> > +                     spin_unlock_irqrestore(&port->agg_lock, flags);
>
> You could release this lock a line above, so that
> dev_kfree_skb_any() wouldn't be called in the HWIRQ context and
> postpone skb freeing via the softnet queue.
>

Ack.

> > +                     priv = netdev_priv(orig_dev);
> > +                     this_cpu_inc(priv->pcpu_stats->stats.tx_drops);
> > +
> > +                     return;
> > +             }
> > +
> > +             if (diff.tv_sec > 0 || diff.tv_nsec > rmnet_agg_bypass_time ||
> > +                 size == 0) {
> > +                     spin_unlock_irqrestore(&port->agg_lock, flags);
> > +                     skb->protocol = htons(ETH_P_MAP);
> > +                     dev_queue_xmit(skb);
> > +                     return;
> > +             }
> > +
> > +             port->agg_skb = skb_copy_expand(skb, 0, size, GFP_ATOMIC);
>
> You could use skb_cow_head(skb, 0) and skip allocating a new skb if
> the current one is writable, which usually is the case.
>

Ok, I will look at how to do that.

> > +             if (!port->agg_skb) {
> > +                     reset_aggr_params(port);
> > +                     spin_unlock_irqrestore(&port->agg_lock, flags);
> > +                     skb->protocol = htons(ETH_P_MAP);
> > +                     dev_queue_xmit(skb);
> > +                     return;
> > +             }
> > +             port->agg_skb->protocol = htons(ETH_P_MAP);
> > +             port->agg_count = 1;
> > +             ktime_get_real_ts64(&port->agg_time);
> > +             dev_kfree_skb_any(skb);
> > +             goto schedule;
> > +     }
> > +     diff = timespec64_sub(port->agg_last, port->agg_time);
> > +     size = port->egress_agg_params.agg_size - port->agg_skb->len;
> > +
> > +     if (skb->len > size ||
> > +         diff.tv_sec > 0 || diff.tv_nsec > port->egress_agg_params.agg_time_nsec) {
> > +             agg_skb = port->agg_skb;
> > +             reset_aggr_params(port);
> > +             spin_unlock_irqrestore(&port->agg_lock, flags);
> > +             hrtimer_cancel(&port->hrtimer);
> > +             dev_queue_xmit(agg_skb);
> > +             goto new_packet;
> > +     }
> > +
> > +     skb_put_data(port->agg_skb, skb->data, skb->len);
>
> IIUC, RMNet netdevs support %NETIF_F_SG. Which means you could just
> attach skb data as frags to the first skb in the aggregation
> session instead of copying the data all the time.
> ...or even add %NETIF_F_FRAGLIST handling, that would save even more
> -- just use skb->frag_list once you run out of skb_shinfo()->frags.
>

I think I get the general idea behind your comment, but honestly I
need to study and work on this a bit more, since I'm missing specific
knowledge on the things you are mentioning.

> > +     port->agg_count++;
> > +     dev_kfree_skb_any(skb);
> > +
> > +     if (port->agg_count == port->egress_agg_params.agg_count ||
> > +         port->agg_skb->len == port->egress_agg_params.agg_size) {
>
> I think ::agg_count and ::agg_size are the thresholds, so the
> comparison should be >= I guess (especially ::agg_size which gets
> increased by a random value each time, not by 1)?
>

Sorry, I'm probably missing something.

port->egress_agg_params.agg_count and port->egress_agg_params.agg_size
are supposed to stay fixed throughout the life of the device, since
they are usually configured during modem initialization with the
values returned by the QMI WDA Set Data Format request.

The number of packets in an aggregated block sent to the modem can't
be > port->egress_agg_params.agg_count and the size of the aggregated
block can't be > port->egress_agg_params.agg_size, otherwise the modem
could crash.

The actual status of the aggregated block is stored in port->agg_count
and port->agg_skb->len. Due to the reason above (modem crashing), the
code should never reach a situation in which port->agg_skb->len >
port->egress_agg_params.agg_size or port->agg_count >
port->egress_agg_params.agg_count, barring possible bugs on my side.

I'm not sure I understand the sentence "especially ::agg_size which
gets increased by a random value each time, not by 1".

> > +             agg_skb = port->agg_skb;
> > +             reset_aggr_params(port);
> > +             spin_unlock_irqrestore(&port->agg_lock, flags);
> > +             hrtimer_cancel(&port->hrtimer);
> > +             dev_queue_xmit(agg_skb);
> > +             return;
> > +     }
> > +
> > +schedule:
> > +     if (!hrtimer_active(&port->hrtimer) && port->agg_state != -EINPROGRESS) {
> > +             port->agg_state = -EINPROGRESS;
> > +             hrtimer_start(&port->hrtimer,
> > +                           ns_to_ktime(port->egress_agg_params.agg_time_nsec),
> > +                           HRTIMER_MODE_REL);
> > +     }
> > +     spin_unlock_irqrestore(&port->agg_lock, flags);
> > +}
> > +
> > +void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size,
> > +                                 u16 count, u32 time)
> > +{
> > +     unsigned long irq_flags;
> > +
> > +     spin_lock_irqsave(&port->agg_lock, irq_flags);
> > +     port->egress_agg_params.agg_size = size;
> > +     port->egress_agg_params.agg_count = count;
> > +     port->egress_agg_params.agg_time_nsec = time * NSEC_PER_USEC;
> > +     spin_unlock_irqrestore(&port->agg_lock, irq_flags);
> > +}
> > +
> > +void rmnet_map_tx_aggregate_init(struct rmnet_port *port)
> > +{
> > +     hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> > +     port->hrtimer.function = rmnet_map_flush_tx_packet_queue;
> > +     spin_lock_init(&port->agg_lock);
> > +     rmnet_map_update_ul_agg_config(port, 4096, 16, 800);
> > +     INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work);
> > +}
> > +
> > +void rmnet_map_tx_aggregate_exit(struct rmnet_port *port)
> > +{
> > +     unsigned long flags;
> > +
> > +     hrtimer_cancel(&port->hrtimer);
> > +     cancel_work_sync(&port->agg_wq);
> > +
> > +     spin_lock_irqsave(&port->agg_lock, flags);
> > +     if (port->agg_state == -EINPROGRESS) {
> > +             if (port->agg_skb) {
> > +                     kfree_skb(port->agg_skb);
> > +                     reset_aggr_params(port);
> > +             }
> > +
> > +             port->agg_state = 0;
> > +     }
> > +
> > +     spin_unlock_irqrestore(&port->agg_lock, flags);
> > +}
>
> Do I get the whole logics correctly, you allocate a new big skb and
> just copy several frames into it, then send as one chunk once its
> size reaches the threshold? Plus linearize every skb to be able to
> do that... That's too much of overhead I'd say, just handle S/G and
> fraglists and make long trains of frags from them without copying
> anything?

Yes, you get the logic right.

And my reply is the same as above: I think I understand your comment
at a high level, but I need to learn how to do that.

> Also BQL/DQL already does some sort of aggregation via
> ::xmit_more, doesn't it? Do you have any performance numbers?
>

I've mainly tested with iperf TCP on the cat. 4 modem where the issue
shows up: there the driver is capable of reaching the limits of the
modem (50 Mbps), but I understand that's quite low.

I've also tried testing with a 5G modem, but I could only test with a
custom UDP loopback mode (no real network or network simulator),
through iperf, something like:

$ iperf --client 192.168.1.172 --udp --interval 10 --port 5001
--bandwidth 600M --parallel 3 --time 10
------------------------------------------------------------
Client connecting to 192.168.1.172, UDP port 5001
Sending 1470 byte datagrams, IPG target: 18.69 us (kalman adjust)
UDP buffer size:  208 KByte (default)
------------------------------------------------------------
[  6] local 192.168.48.173 port 54765 connected with 192.168.1.172 port 5001
[  3] local 192.168.48.173 port 41767 connected with 192.168.1.172 port 5001
[  5] local 192.168.48.173 port 33300 connected with 192.168.1.172 port 5001
[ ID] Interval       Transfer     Bandwidth
[  6]  0.0-10.0 sec   750 MBytes   629 Mbits/sec
[  5]  0.0-10.0 sec   750 MBytes   629 Mbits/sec
[  5]  0.0-10.0 sec   750 MBytes   629 Mbits/sec
[  5] Sent 534987 datagrams
[SUM]  0.0-10.0 sec  2.20 GBytes  1.89 Gbits/sec
[SUM] Sent 1604960 datagrams
[  5] Server Report:
[  5]  0.0-10.2 sec  1.21 GBytes  1.02 Gbits/sec  23.271 ms    0/534988 (0%)
[  5] 0.0000-10.2199 sec  835183 datagrams received out-of-order
[  6]  0.0-10.0 sec   750 MBytes   629 Mbits/sec
[  6] Sent 534990 datagrams
[  6] Server Report:
[  6]  0.0-10.2 sec  1.20 GBytes  1.00 Gbits/sec  14.879 ms    0/534991 (0%)
[  6] 0.0000-10.2320 sec  818506 datagrams received out-of-order
[  3]  0.0-10.0 sec   750 MBytes   629 Mbits/sec
[  3] Sent 534978 datagrams
[  3] Server Report:
[  3]  0.0-10.2 sec  1.20 GBytes  1.01 Gbits/sec  11.631 ms    0/534979 (0%)
[  3] 0.0000-10.2320 sec  824594 datagrams received out-of-order

Not sure how meaningful those numbers are, since I don't have clear
visibility into how the modem implements the loopback.

> > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> > index 5e7a1041df3a..09a30e2b29b1 100644
> > --- a/include/uapi/linux/if_link.h
> > +++ b/include/uapi/linux/if_link.h
> > @@ -1351,6 +1351,7 @@ enum {
> >  #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
> >  #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
> >  #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
> > +#define RMNET_FLAGS_EGRESS_AGGREGATION            (1U << 6)
>
> But you could rely on the aggregation parameters passed via Ethtool
> to decide whether to enable aggregation or not. If any of them is 0,
> it means the aggregation needs to be disabled.
> Otherwise, to enable it you need to use 2 utilities: the one that
> creates RMNet devices at first and Ethtool after, isn't it too
> complicated for no reason?
>

The rationale behind this was to keep a conservative approach by
leaving the tx aggregation disabled by default and at the same time
providing default values for the aggregation parameters.

But I agree with you that it is redundant: I can manage that with just
the value of egress_agg_params.agg_count (1 = no aggregation),
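
e.g. something like (sketch):

	if (port->egress_agg_params.agg_count < 2) {
		/* aggregation disabled */
		skb->protocol = htons(ETH_P_MAP);
		dev_queue_xmit(skb);
		return;
	}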

Thanks for your time and for the review.

Thanks,
Daniele

> >
> >  enum {
> >       IFLA_RMNET_UNSPEC,
> > --
> > 2.37.1
>
> Thanks,
> Olek
Daniele Palmas Nov. 11, 2022, 10 p.m. UTC | #7
Hello Subash,

Il giorno ven 11 nov 2022 alle ore 02:17 Subash Abhinov
Kasiviswanathan (KS) <quic_subashab@quicinc.com> ha scritto:
>
> On 11/10/2022 10:32 AM, Alexander Lobakin wrote:
> > From: Daniele Palmas <dnlplm@gmail.com>
> > Date: Wed,  9 Nov 2022 19:02:48 +0100
> >
> >> Bidirectional TCP throughput tests through iperf with low-cat
> >> Thread-x based modems showed performance issues both in tx
> >> and rx.
> >>
> >> The Windows driver does not show this issue: inspecting USB
> >> packets revealed that the only notable change is the driver
> >> enabling tx packets aggregation.
> >>
> >> Tx packets aggregation, by default disabled, requires flag
> >> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).
> >>
> >> The maximum number of aggregated packets and the maximum aggregated
> >> size are by default set to reasonably low values in order to support
> >> the majority of modems.
> >>
> >> This implementation is based on patches available in Code Aurora
> >> repositories (msm kernel) whose main authors are
> >>
> >> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> >> Sean Tranchetti <stranche@codeaurora.org>
> >>
> >> Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
> >> ---
> >>
> >> +struct rmnet_egress_agg_params {
> >> +    u16 agg_size;
> >
> > skbs can now be way longer than 64 Kb.
> >
>
> rmnet devices generally use a standard MTU (around 1500) size.
> Would it still be possible for >64kb to be generated as no relevant
> hw_features is set for large transmit offloads.
> Alternatively, are you referring to injection of packets explicitly, say
> via packet sockets.
>
> >> +    u16 agg_count;
> >> +    u64 agg_time_nsec;
> >> +};
> >> +
> > Do I get the whole logics correctly, you allocate a new big skb and
> > just copy several frames into it, then send as one chunk once its
> > size reaches the threshold? Plus linearize every skb to be able to
> > do that... That's too much of overhead I'd say, just handle S/G and
> > fraglists and make long trains of frags from them without copying
> > anything? Also BQL/DQL already does some sort of aggregation via
> > ::xmit_more, doesn't it? Do you have any performance numbers?
>
> The difference here is that hardware would use a single descriptor for
> aggregation vs multiple descriptors for scatter gather.
>
> I wonder if this issue is related to pacing though.
> Daniele, perhaps you can try this hack without enabling EGRESS
> AGGREGATION and check if you are able to reach the same level of
> performance for your scenario.
>
> --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb)
>          struct rmnet_priv *priv;
>          u8 mux_id;
>
> -       sk_pacing_shift_update(skb->sk, 8);
> +       skb_orphan(skb);
>
>          orig_dev = skb->dev;
>          priv = netdev_priv(orig_dev);
>

Sure, I'll test that on Monday.

> >
> >> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> >> index 5e7a1041df3a..09a30e2b29b1 100644
> >> --- a/include/uapi/linux/if_link.h
> >> +++ b/include/uapi/linux/if_link.h
> >> @@ -1351,6 +1351,7 @@ enum {
> >>   #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
> >>   #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
> >>   #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
> >> +#define RMNET_FLAGS_EGRESS_AGGREGATION            (1U << 6)
> >
> > But you could rely on the aggregation parameters passed via Ethtool
> > to decide whether to enable aggregation or not. If any of them is 0,
> > it means the aggregation needs to be disabled.
> > Otherwise, to enable it you need to use 2 utilities: the one that
> > creates RMNet devices at first and Ethtool after, isn't it too
> > complicated for no reason?
>
> Yes, the EGRESS AGGREGATION parameters can be added as part of the rmnet
> netlink policies.
>

Ack.

Thanks,
Daniele

> >
> >>
> >>   enum {
> >>      IFLA_RMNET_UNSPEC,
> >> --
> >> 2.37.1
> >
> > Thanks,
> > Olek
Daniele Palmas Nov. 14, 2022, 8:48 a.m. UTC | #8
Hello Subash,

Il giorno ven 11 nov 2022 alle ore 23:00 Daniele Palmas
<dnlplm@gmail.com> ha scritto:
>
> Hello Subash,
>
> Il giorno ven 11 nov 2022 alle ore 02:17 Subash Abhinov
> Kasiviswanathan (KS) <quic_subashab@quicinc.com> ha scritto:
> >
> > On 11/10/2022 10:32 AM, Alexander Lobakin wrote:
> > > From: Daniele Palmas <dnlplm@gmail.com>
> > > Date: Wed,  9 Nov 2022 19:02:48 +0100
> > >
> > >> Bidirectional TCP throughput tests through iperf with low-cat
> > >> Thread-x based modems showed performance issues both in tx
> > >> and rx.
> > >>
> > >> The Windows driver does not show this issue: inspecting USB
> > >> packets revealed that the only notable change is the driver
> > >> enabling tx packets aggregation.
> > >>
> > >> Tx packets aggregation, by default disabled, requires flag
> > >> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).
> > >>
> > >> The maximum number of aggregated packets and the maximum aggregated
> > >> size are by default set to reasonably low values in order to support
> > >> the majority of modems.
> > >>
> > >> This implementation is based on patches available in Code Aurora
> > >> repositories (msm kernel) whose main authors are
> > >>
> > >> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> > >> Sean Tranchetti <stranche@codeaurora.org>
> > >>
> > >> Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
> > >> ---
> > >>
> > >> +struct rmnet_egress_agg_params {
> > >> +    u16 agg_size;
> > >
> > > skbs can now be way longer than 64 Kb.
> > >
> >
> > rmnet devices generally use a standard MTU (around 1500) size.
> > Would it still be possible for >64kb to be generated as no relevant
> > hw_features is set for large transmit offloads.
> > Alternatively, are you referring to injection of packets explicitly, say
> > via packet sockets.
> >
> > >> +    u16 agg_count;
> > >> +    u64 agg_time_nsec;
> > >> +};
> > >> +
> > > Do I get the whole logics correctly, you allocate a new big skb and
> > > just copy several frames into it, then send as one chunk once its
> > > size reaches the threshold? Plus linearize every skb to be able to
> > > do that... That's too much of overhead I'd say, just handle S/G and
> > > fraglists and make long trains of frags from them without copying
> > > anything? Also BQL/DQL already does some sort of aggregation via
> > > ::xmit_more, doesn't it? Do you have any performance numbers?
> >
> > The difference here is that hardware would use a single descriptor for
> > aggregation vs multiple descriptors for scatter gather.
> >
> > I wonder if this issue is related to pacing though.
> > Daniele, perhaps you can try this hack without enabling EGRESS
> > AGGREGATION and check if you are able to reach the same level of
> > performance for your scenario.
> >
> > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> > @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb)
> >          struct rmnet_priv *priv;
> >          u8 mux_id;
> >
> > -       sk_pacing_shift_update(skb->sk, 8);
> > +       skb_orphan(skb);
> >
> >          orig_dev = skb->dev;
> >          priv = netdev_priv(orig_dev);
> >
>
> Sure, I'll test that on Monday.
>

Hello Subash,

This change does not have any effect.

Start the tx test alone:

$ sudo iperf3 -c 172.22.1.201 -t 60 -p 5001 -P 3
Connecting to host 172.22.1.201, port 5001
[  5] local 172.22.1.102 port 45902 connected to 172.22.1.201 port 5001
[  7] local 172.22.1.102 port 45908 connected to 172.22.1.201 port 5001
[  9] local 172.22.1.102 port 45910 connected to 172.22.1.201 port 5001
[ ID] Interval           Transfer     Bitrate         Retr  Cwnd
[  5]   0.00-1.00   sec  1.90 MBytes  15.9 Mbits/sec    0    114 KBytes
[  7]   0.00-1.00   sec  1.07 MBytes  8.94 Mbits/sec    0   71.3 KBytes
[  9]   0.00-1.00   sec  5.02 MBytes  42.1 Mbits/sec    0    344 KBytes
[SUM]   0.00-1.00   sec  7.98 MBytes  66.9 Mbits/sec    0
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]   1.00-2.00   sec  1.30 MBytes  10.9 Mbits/sec    4    104 KBytes
[  7]   1.00-2.00   sec   949 KBytes  7.77 Mbits/sec    6   64.6 KBytes
[  9]   1.00-2.00   sec  3.71 MBytes  31.1 Mbits/sec   20    284 KBytes
[SUM]   1.00-2.00   sec  5.93 MBytes  49.7 Mbits/sec   30
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]   2.00-3.00   sec  1.24 MBytes  10.4 Mbits/sec    0    117 KBytes
[  7]   2.00-3.00   sec   759 KBytes  6.22 Mbits/sec    0   72.7 KBytes
[  9]   2.00-3.00   sec  3.71 MBytes  31.1 Mbits/sec    0    319 KBytes
[SUM]   2.00-3.00   sec  5.68 MBytes  47.7 Mbits/sec    0
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]   3.00-4.00   sec  1.85 MBytes  15.5 Mbits/sec    0    124 KBytes
[  7]   3.00-4.00   sec   949 KBytes  7.77 Mbits/sec    0   76.7 KBytes
[  9]   3.00-4.00   sec  3.09 MBytes  25.9 Mbits/sec    5    246 KBytes
[SUM]   3.00-4.00   sec  5.87 MBytes  49.2 Mbits/sec    5
...

Start the rx test:

$ sudo iperf3 -R -c 172.22.1.201 -t 30 -p 5002
Connecting to host 172.22.1.201, port 5002
Reverse mode, remote host 172.22.1.201 is sending
[  5] local 172.22.1.102 port 46496 connected to 172.22.1.201 port 5002
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  10.4 MBytes  87.4 Mbits/sec
[  5]   1.00-2.00   sec  12.7 MBytes   107 Mbits/sec
[  5]   2.00-3.00   sec  12.6 MBytes   106 Mbits/sec
[  5]   3.00-4.00   sec  12.6 MBytes   105 Mbits/sec
[  5]   4.00-5.00   sec  12.6 MBytes   106 Mbits/sec

Checking the tx path:

[  5]  11.00-12.00  sec   632 KBytes  5.18 Mbits/sec   16   95.5 KBytes
[  7]  11.00-12.00  sec   506 KBytes  4.14 Mbits/sec   16   52.5 KBytes
[  9]  11.00-12.00  sec   759 KBytes  6.21 Mbits/sec   17    143 KBytes
[SUM]  11.00-12.00  sec  1.85 MBytes  15.5 Mbits/sec   49
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]  12.00-13.00  sec   316 KBytes  2.59 Mbits/sec   67   26.9 KBytes
[  7]  12.00-13.00  sec   253 KBytes  2.07 Mbits/sec   44   9.42 KBytes
[  9]  12.00-13.00  sec   759 KBytes  6.22 Mbits/sec   66   94.2 KBytes
[SUM]  12.00-13.00  sec  1.30 MBytes  10.9 Mbits/sec  177
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]  13.00-14.00  sec   316 KBytes  2.59 Mbits/sec   58   10.8 KBytes
[  7]  13.00-14.00  sec   253 KBytes  2.07 Mbits/sec    6   10.8 KBytes
[  9]  13.00-14.00  sec  0.00 Bytes  0.00 bits/sec   91   37.7 KBytes
[SUM]  13.00-14.00  sec   569 KBytes  4.66 Mbits/sec  155
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]  14.00-15.00  sec   316 KBytes  2.59 Mbits/sec    6   8.07 KBytes
[  7]  14.00-15.00  sec   253 KBytes  2.07 Mbits/sec    3   9.42 KBytes
[  9]  14.00-15.00  sec  0.00 Bytes  0.00 bits/sec   53   8.07 KBytes
[SUM]  14.00-15.00  sec   569 KBytes  4.66 Mbits/sec   62

Stop the rx test, the tx becomes again:

[  5]  28.00-29.00  sec  1.85 MBytes  15.5 Mbits/sec    0   96.9 KBytes
[  7]  28.00-29.00  sec  1.98 MBytes  16.6 Mbits/sec    0   96.9 KBytes
[  9]  28.00-29.00  sec  2.22 MBytes  18.7 Mbits/sec    0   96.9 KBytes
[SUM]  28.00-29.00  sec  6.05 MBytes  50.8 Mbits/sec    0
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]  29.00-30.00  sec  2.16 MBytes  18.1 Mbits/sec    0    110 KBytes
[  7]  29.00-30.00  sec  1.98 MBytes  16.6 Mbits/sec    0    110 KBytes
[  9]  29.00-30.00  sec  1.48 MBytes  12.4 Mbits/sec    0    110 KBytes
[SUM]  29.00-30.00  sec  5.62 MBytes  47.1 Mbits/sec    0
- - - - - - - - - - - - - - - - - - - - - - - - -
[  5]  30.00-31.00  sec  1.85 MBytes  15.5 Mbits/sec    0    122 KBytes
[  7]  30.00-31.00  sec  1.85 MBytes  15.5 Mbits/sec    0    122 KBytes
[  9]  30.00-31.00  sec  2.22 MBytes  18.7 Mbits/sec    0    122 KBytes
[SUM]  30.00-31.00  sec  5.93 MBytes  49.7 Mbits/sec    0

I'd also like to highlight that, if I enable tx aggregation, not only
does the above described behavior not happen, but the rx test alone is
also capable of reaching the maximum throughput of the modem:

$ sudo iperf3 -R -c 172.22.1.201 -t 30 -p 5002
Connecting to host 172.22.1.201, port 5002
Reverse mode, remote host 172.22.1.201 is sending
[  5] local 172.22.1.102 port 51422 connected to 172.22.1.201 port 5002
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  15.1 MBytes   126 Mbits/sec
[  5]   1.00-2.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   2.00-3.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   3.00-4.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   4.00-5.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   5.00-6.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   6.00-7.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   7.00-8.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   8.00-9.00   sec  17.2 MBytes   144 Mbits/sec
[  5]   9.00-10.00  sec  17.2 MBytes   144 Mbits/sec
[  5]  10.00-11.00  sec  17.2 MBytes   144 Mbits/sec
^C[  5]  11.00-11.50  sec  8.67 MBytes   144 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-11.50  sec  0.00 Bytes  0.00 bits/sec                  sender
[  5]   0.00-11.50  sec   195 MBytes   143 Mbits/sec                  receiver
iperf3: interrupt - the client has terminated

Thanks,
Daniele

> > >
> > >> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> > >> index 5e7a1041df3a..09a30e2b29b1 100644
> > >> --- a/include/uapi/linux/if_link.h
> > >> +++ b/include/uapi/linux/if_link.h
> > >> @@ -1351,6 +1351,7 @@ enum {
> > >>   #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
> > >>   #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
> > >>   #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
> > >> +#define RMNET_FLAGS_EGRESS_AGGREGATION            (1U << 6)
> > >
> > > But you could rely on the aggregation parameters passed via Ethtool
> > > to decide whether to enable aggregation or not. If any of them is 0,
> > > it means the aggregation needs to be disabled.
> > > Otherwise, to enable it you need to use 2 utilities: the one that
> > > creates RMNet devices at first and Ethtool after, isn't it too
> > > complicated for no reason?
> >
> > Yes, the EGRESS AGGREGATION parameters can be added as part of the rmnet
> > netlink policies.
> >
>
> Ack.
>
> Thanks,
> Daniele
>
> > >
> > >>
> > >>   enum {
> > >>      IFLA_RMNET_UNSPEC,
> > >> --
> > >> 2.37.1
> > >
> > > Thanks,
> > > Olek
Daniele Palmas Nov. 14, 2022, 9:13 a.m. UTC | #9
Hello Jakub,

Il giorno ven 11 nov 2022 alle ore 18:14 Jakub Kicinski
<kuba@kernel.org> ha scritto:
>
> On Wed,  9 Nov 2022 19:02:48 +0100 Daniele Palmas wrote:
> > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
> > +{
> > +     bool is_icmp = 0;
> > +
> > +     if (skb->protocol == htons(ETH_P_IP)) {
> > +             struct iphdr *ip4h = ip_hdr(skb);
> > +
> > +             if (ip4h->protocol == IPPROTO_ICMP)
> > +                     is_icmp = true;
> > +     } else if (skb->protocol == htons(ETH_P_IPV6)) {
> > +             unsigned int icmp_offset = 0;
> > +
> > +             if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6)
> > +                     is_icmp = true;
> > +     }
> > +
> > +     return is_icmp;
> > +}
>
> Why this? I don't see it mention in the commit message or any code
> comment.

This is something I've found in downstream code: with my test setup
and scenario it does not make any difference to the ICMP packet timing
(either with or without throughput tests ongoing), but I don't have
access to all the systems rmnet is used on.

So I'm not sure whether it solves a real issue in other situations.

I can drop it for now; I or someone else can add it back in case a
real issue shows up.

Thanks,
Daniele
Bjørn Mork Nov. 14, 2022, 10:25 a.m. UTC | #10
Daniele Palmas <dnlplm@gmail.com> writes:
> Il giorno ven 11 nov 2022 alle ore 18:14 Jakub Kicinski
> <kuba@kernel.org> ha scritto:
>>
>> On Wed,  9 Nov 2022 19:02:48 +0100 Daniele Palmas wrote:
>> > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
>> > +{
>> > +     bool is_icmp = 0;
>> > +
>> > +     if (skb->protocol == htons(ETH_P_IP)) {
>> > +             struct iphdr *ip4h = ip_hdr(skb);
>> > +
>> > +             if (ip4h->protocol == IPPROTO_ICMP)
>> > +                     is_icmp = true;
>> > +     } else if (skb->protocol == htons(ETH_P_IPV6)) {
>> > +             unsigned int icmp_offset = 0;
>> > +
>> > +             if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6)
>> > +                     is_icmp = true;
>> > +     }
>> > +
>> > +     return is_icmp;
>> > +}
>>
>> Why this? I don't see it mention in the commit message or any code
>> comment.
>
> This is something I've found in downstream code: with my test setup
> and scenario it does not make any difference on the icmp packets
> timing (both with or without throughput tests ongoing), but I don't
> have access to all the systems for which rmnet is used.
>
> So, I'm not sure if it solves a real issue in other situations.
>
> I can move that out and me or someone else will add it again in case
> there will be a real issue to be solved.

It looks like an attempt to "cheat" latency measurements.  I don't think
we should do that.  Aggregation may be necessary to achieve maximum
throughput in a radio network, but has its obvious bufferbloat downside.
Let's not hide that fact.  Users deserve to know, and tune their systems
accordingly.  Things like this will only make that more difficult


Bjørn
Jakub Kicinski Nov. 15, 2022, 12:43 a.m. UTC | #11
On Mon, 14 Nov 2022 11:25:32 +0100 Bjørn Mork wrote:
> It looks like an attempt to "cheat" latency measurements.  I don't think
> we should do that.  Aggregation may be necessary to achieve maximum
> throughput in a radio network, but has its obvious bufferbloat downside.
> Let's not hide that fact.  Users deserve to know, and tune their systems
> accordingly.  Things like this will only make that more difficult.


Daniele Palmas Nov. 16, 2022, 3:19 p.m. UTC | #12
Hello Alexander,

On Thu, Nov 10, 2022 at 6:35 PM Alexander Lobakin
<alexandr.lobakin@intel.com> wrote:
>
> Do I get the whole logics correctly, you allocate a new big skb and
> just copy several frames into it, then send as one chunk once its
> size reaches the threshold? Plus linearize every skb to be able to
> do that... That's too much of overhead I'd say, just handle S/G and
> fraglists and make long trains of frags from them without copying
> anything?

Sorry for my question; I'm certainly lacking some knowledge here, but
I'm trying to understand how I can move forward.

Suppose I'm able to build the aggregated block as a train of
fragments: I then have to send it to the underlying netdevice, which
in my scenario is created by the qmi_wwan driver. I could be wrong,
but my understanding is that it does not support fragments.

And, as far as I know, there's only one other driver in mainline used
with rmnet (mhi_net), and that one does not seem to support them
either.

Does this mean that once I have built the aggregated block out of
fragments it should be converted to a single linear skb before sending?

Thanks,
Daniele
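To make the "train of frags" idea quoted above concrete, here is a minimal sketch, not from any posted patch, of chaining packets onto an aggregate skb's frag_list instead of copying payloads; the rmnet_agg_chain() helper and its tail pointer are hypothetical, and <linux/skbuff.h> context is assumed:

/* Chain each packet on the aggregate skb's frag_list so no payload is
 * copied. Assumes the lower device could advertise NETIF_F_FRAGLIST;
 * otherwise the core has to undo this at xmit time (see below).
 */
static void rmnet_agg_chain(struct sk_buff *agg, struct sk_buff **tail,
			    struct sk_buff *skb)
{
	if (!skb_shinfo(agg)->frag_list)
		skb_shinfo(agg)->frag_list = skb;	/* first chained packet */
	else
		(*tail)->next = skb;			/* append at the tail */
	*tail = skb;

	/* account the chained payload on the head skb */
	agg->len      += skb->len;
	agg->data_len += skb->len;
	agg->truesize += skb->truesize;
}

Whether this wins over the copying approach is exactly the question raised below: it depends on what the underlying netdevice can accept.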
Alexander Lobakin Nov. 16, 2022, 4:20 p.m. UTC | #13
From: Daniele Palmas <dnlplm@gmail.com>
Date: Wed, 16 Nov 2022 16:19:48 +0100

> Hello Alexander,
> 
> On Thu, Nov 10, 2022 at 6:35 PM Alexander Lobakin
> <alexandr.lobakin@intel.com> wrote:
> >
> > Do I get the whole logics correctly, you allocate a new big skb and
> > just copy several frames into it, then send as one chunk once its
> > size reaches the threshold? Plus linearize every skb to be able to
> > do that... That's too much of overhead I'd say, just handle S/G and
> > fraglists and make long trains of frags from them without copying
> > anything?
> 
> Sorry for my question; I'm certainly lacking some knowledge here, but
> I'm trying to understand how I can move forward.
>
> Suppose I'm able to build the aggregated block as a train of
> fragments: I then have to send it to the underlying netdevice, which
> in my scenario is created by the qmi_wwan driver. I could be wrong,
> but my understanding is that it does not support fragments.
>
> And, as far as I know, there's only one other driver in mainline used
> with rmnet (mhi_net), and that one does not seem to support them
> either.
>
> Does this mean that once I have built the aggregated block out of
> fragments it should be converted to a single linear skb before sending?

Ah okay, I'd missed the fact that it's only an intermediate layer and
there's some real device behind it.
If you make an skb with fragments and queue it up to a netdev which
doesn't advertise %NETIF_F_SG, the networking core will take care of
this. It will then form a set of regular skbs and queue them for
sending instead. Sure, there'll be some memcopies, but I can't say
this implementation is better until some stats are provided.

And BTW, as Gal indirectly mentioned, those perf problems belong to
the underlying device, e.g. qmi_wwan and so on; rmnet shouldn't do
anything here. So you could try to implement aggregation there or
whatever you'd like to pick. I'd try to read some specs first and
see whether the qmi_wwan HW is capable of S/G or whether some driver
improvements for Tx could be done there.

> 
> Thanks,
> Daniele

Thanks,
Olek
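For context, the xmit-time fallback Olek describes looks roughly like this; it is a simplified illustration of what validate_xmit_skb() in net/core/dev.c does, not a verbatim excerpt:

/* If the lower netdev lacks scatter/gather support, the core
 * linearizes a nonlinear skb before ndo_start_xmit(), so rmnet would
 * not have to copy payloads itself.
 */
if (skb_is_nonlinear(skb) && !(dev->features & NETIF_F_SG)) {
	if (__skb_linearize(skb))
		goto drop;	/* allocation failed: free the skb, count the drop */
}

In other words, for a non-S/G device the copy cost does not disappear, it just moves from rmnet into the core, where it is paid only when actually required.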
Daniele Palmas Nov. 20, 2022, 9:25 a.m. UTC | #14
On Wed, Nov 16, 2022 at 5:21 PM Alexander Lobakin
<alexandr.lobakin@intel.com> wrote:
>
> From: Daniele Palmas <dnlplm@gmail.com>
> Date: Wed, 16 Nov 2022 16:19:48 +0100
>
> > Hello Alexander,
> >
> > On Thu, Nov 10, 2022 at 6:35 PM Alexander Lobakin
> > <alexandr.lobakin@intel.com> wrote:
> > >
> > > Do I get the whole logics correctly, you allocate a new big skb and
> > > just copy several frames into it, then send as one chunk once its
> > > size reaches the threshold? Plus linearize every skb to be able to
> > > do that... That's too much of overhead I'd say, just handle S/G and
> > > fraglists and make long trains of frags from them without copying
> > > anything?
> >
> > Sorry for my question; I'm certainly lacking some knowledge here, but
> > I'm trying to understand how I can move forward.
> >
> > Suppose I'm able to build the aggregated block as a train of
> > fragments: I then have to send it to the underlying netdevice, which
> > in my scenario is created by the qmi_wwan driver. I could be wrong,
> > but my understanding is that it does not support fragments.
> >
> > And, as far as I know, there's only one other driver in mainline used
> > with rmnet (mhi_net), and that one does not seem to support them
> > either.
> >
> > Does this mean that once I have built the aggregated block out of
> > fragments it should be converted to a single linear skb before sending?
>
> Ah okay, I'd missed the fact that it's only an intermediate layer and
> there's some real device behind it.
> If you make an skb with fragments and queue it up to a netdev which
> doesn't advertise %NETIF_F_SG, the networking core will take care of
> this. It will then form a set of regular skbs and queue them for
> sending instead. Sure, there'll be some memcopies, but I can't say
> this implementation is better until some stats are provided.
>
> And BTW, as Gal indirectly mentioned, those perf problems belong to
> the underlying device, e.g. qmi_wwan and so on; rmnet shouldn't do
> anything here.

Ok, so rmnet would only take care of QMAP rx packet deaggregation and
qmi_wwan of the tx aggregation.

At a conceptual level, implementing tx aggregation in qmi_wwan for
passthrough mode could make sense, since the tx aggregation parameters
belong to the physical device and are shared among the virtual rmnet
netdevices, which can't have different aggr configurations if they
belong to the same physical device.

Bjørn, would this approach be ok for you?

Thanks,
Daniele

> So you could try to implement aggregation there or
> whatever you'd like to pick. I'd try to read some specs first and
> see whether the qmi_wwan HW is capable of S/G or whether some driver
> improvements for Tx could be done there.
>
> >
> > Thanks,
> > Daniele
>
> Thanks,
> Olek
Bjørn Mork Nov. 20, 2022, 9:39 a.m. UTC | #15
Daniele Palmas <dnlplm@gmail.com> writes:

> Ok, so rmnet would only take care of QMAP rx packet deaggregation and
> qmi_wwan of the tx aggregation.
>
> At a conceptual level, implementing tx aggregation in qmi_wwan for
> passthrough mode could make sense, since the tx aggregation parameters
> belong to the physical device and are shared among the virtual rmnet
> netdevices, which can't have different aggr configurations if they
> belong to the same physical device.
>
> Bjørn, would this approach be ok for you?

Sounds good to me, if this can be done within the userspace API
restrictions we've been through.

I assume it's possible to make this Just Work(tm) in qmi_wwan
passthrough mode?  I do not think we want any solution where the user
will have to configure both qmi_wwan and rmnet to make things work
properly.


Bjørn
Daniele Palmas Nov. 20, 2022, 9:52 a.m. UTC | #16
On Sun, Nov 20, 2022 at 10:39 AM Bjørn Mork <bjorn@mork.no> wrote:
>
> Daniele Palmas <dnlplm@gmail.com> writes:
>
> > Ok, so rmnet would only take care of QMAP rx packet deaggregation and
> > qmi_wwan of the tx aggregation.
> >
> > At a conceptual level, implementing tx aggregation in qmi_wwan for
> > passthrough mode could make sense, since the tx aggregation parameters
> > belong to the physical device and are shared among the virtual rmnet
> > netdevices, which can't have different aggr configurations if they
> > belong to the same physical device.
> >
> > Bjørn, would this approach be ok for you?
>
> Sounds good to me, if this can be done within the userspace API
> restrictions we've been through.
>
> I assume it's possible to make this Just Work(tm) in qmi_wwan
> passthrough mode?  I do not think we want any solution where the user
> will have to configure both qmi_wwan and rmnet to make things work
> properly.
>

Yes, I think so: the ethtool configurations would apply to the
qmi_wwan netdevice so that nothing should be done on the rmnet side.

Regards,
Daniele
Subash Abhinov Kasiviswanathan (KS) Nov. 20, 2022, 5:48 p.m. UTC | #17
On 11/20/2022 2:52 AM, Daniele Palmas wrote:
> On Sun, Nov 20, 2022 at 10:39 AM Bjørn Mork <bjorn@mork.no> wrote:
>>
>> Daniele Palmas <dnlplm@gmail.com> writes:
>>
>>> Ok, so rmnet would only take care of QMAP rx packet deaggregation and
>>> qmi_wwan of the tx aggregation.
>>>
>>> At a conceptual level, implementing tx aggregation in qmi_wwan for
>>> passthrough mode could make sense, since the tx aggregation parameters
>>> belong to the physical device and are shared among the virtual rmnet
>>> netdevices, which can't have different aggr configurations if they
>>> belong to the same physical device.
>>>
>>> Bjørn, would this approach be ok for you?
>>
>> Sounds good to me, if this can be done within the userspace API
>> restrictions we've been through.
>>
>> I assume it's possible to make this Just Work(tm) in qmi_wwan
>> passthrough mode?  I do not think we want any solution where the user
>> will have to configure both qmi_wwan and rmnet to make things work
>> properly.
>>
> 
> Yes, I think so: the ethtool configurations would apply to the
> qmi_wwan netdevice so that nothing should be done on the rmnet side.
> 
> Regards,
> Daniele

My only concern with this option is that we would end up
implementing the same tx aggregation logic in the other physical device
drivers - mhi_netdev & ipa. Keeping this tx aggregation logic in rmnet
allows you to leverage it across all these various physical devices.
Daniele Palmas Nov. 21, 2022, 7 a.m. UTC | #18
On Sun, Nov 20, 2022 at 6:48 PM Subash Abhinov
Kasiviswanathan (KS) <quic_subashab@quicinc.com> wrote:
>
> On 11/20/2022 2:52 AM, Daniele Palmas wrote:
> > On Sun, Nov 20, 2022 at 10:39 AM Bjørn Mork <bjorn@mork.no> wrote:
> >>
> >> Daniele Palmas <dnlplm@gmail.com> writes:
> >>
> >>> Ok, so rmnet would only take care of QMAP rx packet deaggregation and
> >>> qmi_wwan of the tx aggregation.
> >>>
> >>> At a conceptual level, implementing tx aggregation in qmi_wwan for
> >>> passthrough mode could make sense, since the tx aggregation parameters
> >>> belong to the physical device and are shared among the virtual rmnet
> >>> netdevices, which can't have different aggr configurations if they
> >>> belong to the same physical device.
> >>>
> >>> Bjørn, would this approach be ok for you?
> >>
> >> Sounds good to me, if this can be done within the userspace API
> >> restrictions we've been through.
> >>
> >> I assume it's possible to make this Just Work(tm) in qmi_wwan
> >> passthrough mode?  I do not think we want any solution where the user
> >> will have to configure both qmi_wwan and rmnet to make things work
> >> properly.
> >>
> >
> > Yes, I think so: the ethtool configurations would apply to the
> > qmi_wwan netdevice so that nothing should be done on the rmnet side.
> >
> > Regards,
> > Daniele
>
> My only concern with this option is that we would end up
> implementing the same tx aggregation logic in the other physical device
> drivers - mhi_netdev & ipa. Keeping this tx aggregation logic in rmnet
> allows you to leverage it across all these various physical devices.

Yes, that's true.

But according to this discussion, the need for tx aggregation seems
relevant just to USB devices (and maybe only to a minority of them,
since so far no one has complained about it missing), doesn't it?

Regards,
Daniele
Subash Abhinov Kasiviswanathan (KS) Nov. 24, 2022, 3:32 a.m. UTC | #19
On 11/21/2022 12:00 AM, Daniele Palmas wrote:
> On Sun, Nov 20, 2022 at 6:48 PM Subash Abhinov
> Kasiviswanathan (KS) <quic_subashab@quicinc.com> wrote:
>>
>> On 11/20/2022 2:52 AM, Daniele Palmas wrote:
>>> On Sun, Nov 20, 2022 at 10:39 AM Bjørn Mork <bjorn@mork.no> wrote:
>>>>
>>>> Daniele Palmas <dnlplm@gmail.com> writes:
>>>>
>>>>> Ok, so rmnet would only take care of QMAP rx packet deaggregation and
>>>>> qmi_wwan of the tx aggregation.
>>>>>
>>>>> At a conceptual level, implementing tx aggregation in qmi_wwan for
>>>>> passthrough mode could make sense, since the tx aggregation parameters
>>>>> belong to the physical device and are shared among the virtual rmnet
>>>>> netdevices, which can't have different aggr configurations if they
>>>>> belong to the same physical device.
>>>>>
>>>>> Bjørn, would this approach be ok for you?
>>>>
>>>> Sounds good to me, if this can be done within the userspace API
>>>> restrictions we've been through.
>>>>
>>>> I assume it's possible to make this Just Work(tm) in qmi_wwan
>>>> passthrough mode?  I do not think we want any solution where the user
>>>> will have to configure both qmi_wwan and rmnet to make things work
>>>> properly.
>>>>
>>>
>>> Yes, I think so: the ethtool configurations would apply to the
>>> qmi_wwan netdevice so that nothing should be done on the rmnet side.
>>>
>>> Regards,
>>> Daniele
>>
>> My only concern with this option is that we would end up
>> implementing the same tx aggregation logic in the other physical device
>> drivers - mhi_netdev & ipa. Keeping this tx aggregation logic in rmnet
>> allows you to leverage it across all these various physical devices.
> 
> Yes, that's true.
> 
> But according to this discussion, the need for tx aggregation seems
> relevant just to USB devices (and maybe only to a minority of them,
> since so far no one has complained about it missing), doesn't it?
> 
> Regards,
> Daniele

Ah, it's more to do with it being future-proof (in case someone does
run into a similar throughput issue on the other platforms).

It also consolidates the functionality within the rmnet driver for both
the tx & rx paths. As you already know, the deaggregation, checksum
offload & demuxing logic are already present in the rmnet rx path. The
complementary muxing & checksum offload functionality is also present
in the rmnet tx path, so I wanted to see if the aggregation
functionality could be added in rmnet.
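For what it's worth, if the aggregation logic does stay in rmnet as suggested above, the ethtool plumbing could look roughly like the following sketch. The callback wiring is an assumption, as is mapping tx-frames/tx-usecs to the count/time parameters and the availability of rmnet_get_port_rtnl() here; rmnet_map_update_ul_agg_config() is from the patch below.

/* Hypothetical ethtool coalesce hook for the rmnet netdevice; the
 * ethtool core would also need supported_coalesce_params set to
 * ETHTOOL_COALESCE_TX_USECS | ETHTOOL_COALESCE_TX_MAX_FRAMES.
 */
static int rmnet_set_coalesce(struct net_device *dev,
			      struct ethtool_coalesce *ec,
			      struct kernel_ethtool_coalesce *kec,
			      struct netlink_ext_ack *extack)
{
	struct rmnet_priv *priv = netdev_priv(dev);
	struct rmnet_port *port = rmnet_get_port_rtnl(priv->real_dev);

	if (!port)
		return -ENODEV;

	/* agg_count is u16 in this series, so reject larger values */
	if (ec->tx_max_coalesced_frames > U16_MAX)
		return -EINVAL;

	rmnet_map_update_ul_agg_config(port,
				       port->egress_agg_params.agg_size,
				       ec->tx_max_coalesced_frames,
				       ec->tx_coalesce_usecs);
	return 0;
}

Userspace configuration would then presumably reduce to something like "ethtool -C <dev> tx-frames 16 tx-usecs 500", with no rmnet-specific knobs needed.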

Patch

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 27b1663c476e..39d24e07f306 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -12,6 +12,7 @@ 
 #include "rmnet_handlers.h"
 #include "rmnet_vnd.h"
 #include "rmnet_private.h"
+#include "rmnet_map.h"
 
 /* Local Definitions and Declarations */
 
@@ -39,6 +40,8 @@  static int rmnet_unregister_real_device(struct net_device *real_dev)
 	if (port->nr_rmnet_devs)
 		return -EINVAL;
 
+	rmnet_map_tx_aggregate_exit(port);
+
 	netdev_rx_handler_unregister(real_dev);
 
 	kfree(port);
@@ -79,6 +82,8 @@  static int rmnet_register_real_device(struct net_device *real_dev,
 	for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++)
 		INIT_HLIST_HEAD(&port->muxed_ep[entry]);
 
+	rmnet_map_tx_aggregate_init(port);
+
 	netdev_dbg(real_dev, "registered with rmnet\n");
 	return 0;
 }
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 3d3cba56c516..d341df78e411 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -6,6 +6,7 @@ 
  */
 
 #include <linux/skbuff.h>
+#include <linux/time.h>
 #include <net/gro_cells.h>
 
 #ifndef _RMNET_CONFIG_H_
@@ -19,6 +20,12 @@  struct rmnet_endpoint {
 	struct hlist_node hlnode;
 };
 
+struct rmnet_egress_agg_params {
+	u16 agg_size;
+	u16 agg_count;
+	u64 agg_time_nsec;
+};
+
 /* One instance of this structure is instantiated for each real_dev associated
  * with rmnet.
  */
@@ -30,6 +37,18 @@  struct rmnet_port {
 	struct hlist_head muxed_ep[RMNET_MAX_LOGICAL_EP];
 	struct net_device *bridge_ep;
 	struct net_device *rmnet_dev;
+
+	/* Egress aggregation information */
+	struct rmnet_egress_agg_params egress_agg_params;
+	/* Protect aggregation related elements */
+	spinlock_t agg_lock;
+	struct sk_buff *agg_skb;
+	int agg_state;
+	u8 agg_count;
+	struct timespec64 agg_time;
+	struct timespec64 agg_last;
+	struct hrtimer hrtimer;
+	struct work_struct agg_wq;
 };
 
 extern struct rtnl_link_ops rmnet_link_ops;
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index a313242a762e..82e2669e3590 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -136,10 +136,15 @@  static int rmnet_map_egress_handler(struct sk_buff *skb,
 {
 	int required_headroom, additional_header_len, csum_type = 0;
 	struct rmnet_map_header *map_header;
+	bool is_icmp = false;
 
 	additional_header_len = 0;
 	required_headroom = sizeof(struct rmnet_map_header);
 
+	if (port->data_format & RMNET_FLAGS_EGRESS_AGGREGATION &&
+	    rmnet_map_tx_agg_skip(skb))
+		is_icmp = true;
+
 	if (port->data_format & RMNET_FLAGS_EGRESS_MAP_CKSUMV4) {
 		additional_header_len = sizeof(struct rmnet_map_ul_csum_header);
 		csum_type = RMNET_FLAGS_EGRESS_MAP_CKSUMV4;
@@ -164,8 +169,18 @@  static int rmnet_map_egress_handler(struct sk_buff *skb,
 
 	map_header->mux_id = mux_id;
 
-	skb->protocol = htons(ETH_P_MAP);
+	if (port->data_format & RMNET_FLAGS_EGRESS_AGGREGATION && !is_icmp) {
+		if (skb_is_nonlinear(skb)) {
+			if (unlikely(__skb_linearize(skb)))
+				goto done;
+		}
+
+		rmnet_map_tx_aggregate(skb, port, orig_dev);
+		return -EINPROGRESS;
+	}
 
+done:
+	skb->protocol = htons(ETH_P_MAP);
 	return 0;
 }
 
@@ -235,6 +250,7 @@  void rmnet_egress_handler(struct sk_buff *skb)
 	struct rmnet_port *port;
 	struct rmnet_priv *priv;
 	u8 mux_id;
+	int err;
 
 	sk_pacing_shift_update(skb->sk, 8);
 
@@ -247,8 +263,13 @@  void rmnet_egress_handler(struct sk_buff *skb)
 	if (!port)
 		goto drop;
 
-	if (rmnet_map_egress_handler(skb, port, mux_id, orig_dev))
+	err = rmnet_map_egress_handler(skb, port, mux_id, orig_dev);
+	if (err == -ENOMEM) {
 		goto drop;
+	} else if (err == -EINPROGRESS) {
+		rmnet_vnd_tx_fixup(skb, orig_dev);
+		return;
+	}
 
 	rmnet_vnd_tx_fixup(skb, orig_dev);
 
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 2b033060fc20..6aefc4e1bf47 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -53,5 +53,12 @@  void rmnet_map_checksum_uplink_packet(struct sk_buff *skb,
 				      struct net_device *orig_dev,
 				      int csum_type);
 int rmnet_map_process_next_hdr_packet(struct sk_buff *skb, u16 len);
+bool rmnet_map_tx_agg_skip(struct sk_buff *skb);
+void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port,
+			    struct net_device *orig_dev);
+void rmnet_map_tx_aggregate_init(struct rmnet_port *port);
+void rmnet_map_tx_aggregate_exit(struct rmnet_port *port);
+void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size,
+				    u16 count, u32 time);
 
 #endif /* _RMNET_MAP_H_ */
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index ba194698cc14..49eeed4a126b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -12,6 +12,7 @@ 
 #include "rmnet_config.h"
 #include "rmnet_map.h"
 #include "rmnet_private.h"
+#include "rmnet_vnd.h"
 
 #define RMNET_MAP_DEAGGR_SPACING  64
 #define RMNET_MAP_DEAGGR_HEADROOM (RMNET_MAP_DEAGGR_SPACING / 2)
@@ -518,3 +519,198 @@  int rmnet_map_process_next_hdr_packet(struct sk_buff *skb,
 
 	return 0;
 }
+
+long rmnet_agg_bypass_time __read_mostly = 10000L * NSEC_PER_USEC;
+
+bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
+{
+	bool is_icmp = 0;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *ip4h = ip_hdr(skb);
+
+		if (ip4h->protocol == IPPROTO_ICMP)
+			is_icmp = true;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		unsigned int icmp_offset = 0;
+
+		if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6)
+			is_icmp = true;
+	}
+
+	return is_icmp;
+}
+
+static void reset_aggr_params(struct rmnet_port *port)
+{
+	port->agg_skb = NULL;
+	port->agg_count = 0;
+	port->agg_state = 0;
+	memset(&port->agg_time, 0, sizeof(struct timespec64));
+}
+
+static void rmnet_map_flush_tx_packet_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	struct rmnet_port *port;
+	unsigned long flags;
+
+	port = container_of(work, struct rmnet_port, agg_wq);
+
+	spin_lock_irqsave(&port->agg_lock, flags);
+	if (likely(port->agg_state == -EINPROGRESS)) {
+		/* Buffer may have already been shipped out */
+		if (likely(port->agg_skb)) {
+			skb = port->agg_skb;
+			reset_aggr_params(port);
+		}
+		port->agg_state = 0;
+	}
+
+	spin_unlock_irqrestore(&port->agg_lock, flags);
+	if (skb)
+		dev_queue_xmit(skb);
+}
+
+enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
+{
+	struct rmnet_port *port;
+
+	port = container_of(t, struct rmnet_port, hrtimer);
+
+	schedule_work(&port->agg_wq);
+
+	return HRTIMER_NORESTART;
+}
+
+void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port,
+			    struct net_device *orig_dev)
+{
+	struct timespec64 diff, last;
+	int size = 0;
+	struct sk_buff *agg_skb;
+	unsigned long flags;
+
+new_packet:
+	spin_lock_irqsave(&port->agg_lock, flags);
+	memcpy(&last, &port->agg_last, sizeof(struct timespec64));
+	ktime_get_real_ts64(&port->agg_last);
+
+	if (!port->agg_skb) {
+		/* Check to see if we should agg first. If the traffic is very
+		 * sparse, don't aggregate.
+		 */
+		diff = timespec64_sub(port->agg_last, last);
+		size = port->egress_agg_params.agg_size - skb->len;
+
+		if (size < 0) {
+			struct rmnet_priv *priv;
+
+			/* dropped */
+			dev_kfree_skb_any(skb);
+			spin_unlock_irqrestore(&port->agg_lock, flags);
+			priv = netdev_priv(orig_dev);
+			this_cpu_inc(priv->pcpu_stats->stats.tx_drops);
+
+			return;
+		}
+
+		if (diff.tv_sec > 0 || diff.tv_nsec > rmnet_agg_bypass_time ||
+		    size == 0) {
+			spin_unlock_irqrestore(&port->agg_lock, flags);
+			skb->protocol = htons(ETH_P_MAP);
+			dev_queue_xmit(skb);
+			return;
+		}
+
+		port->agg_skb = skb_copy_expand(skb, 0, size, GFP_ATOMIC);
+		if (!port->agg_skb) {
+			reset_aggr_params(port);
+			spin_unlock_irqrestore(&port->agg_lock, flags);
+			skb->protocol = htons(ETH_P_MAP);
+			dev_queue_xmit(skb);
+			return;
+		}
+		port->agg_skb->protocol = htons(ETH_P_MAP);
+		port->agg_count = 1;
+		ktime_get_real_ts64(&port->agg_time);
+		dev_kfree_skb_any(skb);
+		goto schedule;
+	}
+	diff = timespec64_sub(port->agg_last, port->agg_time);
+	size = port->egress_agg_params.agg_size - port->agg_skb->len;
+
+	if (skb->len > size ||
+	    diff.tv_sec > 0 || diff.tv_nsec > port->egress_agg_params.agg_time_nsec) {
+		agg_skb = port->agg_skb;
+		reset_aggr_params(port);
+		spin_unlock_irqrestore(&port->agg_lock, flags);
+		hrtimer_cancel(&port->hrtimer);
+		dev_queue_xmit(agg_skb);
+		goto new_packet;
+	}
+
+	skb_put_data(port->agg_skb, skb->data, skb->len);
+	port->agg_count++;
+	dev_kfree_skb_any(skb);
+
+	if (port->agg_count == port->egress_agg_params.agg_count ||
+	    port->agg_skb->len == port->egress_agg_params.agg_size) {
+		agg_skb = port->agg_skb;
+		reset_aggr_params(port);
+		spin_unlock_irqrestore(&port->agg_lock, flags);
+		hrtimer_cancel(&port->hrtimer);
+		dev_queue_xmit(agg_skb);
+		return;
+	}
+
+schedule:
+	if (!hrtimer_active(&port->hrtimer) && port->agg_state != -EINPROGRESS) {
+		port->agg_state = -EINPROGRESS;
+		hrtimer_start(&port->hrtimer,
+			      ns_to_ktime(port->egress_agg_params.agg_time_nsec),
+			      HRTIMER_MODE_REL);
+	}
+	spin_unlock_irqrestore(&port->agg_lock, flags);
+}
+
+void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size,
+				    u16 count, u32 time)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&port->agg_lock, irq_flags);
+	port->egress_agg_params.agg_size = size;
+	port->egress_agg_params.agg_count = count;
+	port->egress_agg_params.agg_time_nsec = time * NSEC_PER_USEC;
+	spin_unlock_irqrestore(&port->agg_lock, irq_flags);
+}
+
+void rmnet_map_tx_aggregate_init(struct rmnet_port *port)
+{
+	hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	port->hrtimer.function = rmnet_map_flush_tx_packet_queue;
+	spin_lock_init(&port->agg_lock);
+	rmnet_map_update_ul_agg_config(port, 4096, 16, 800);
+	INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work);
+}
+
+void rmnet_map_tx_aggregate_exit(struct rmnet_port *port)
+{
+	unsigned long flags;
+
+	hrtimer_cancel(&port->hrtimer);
+	cancel_work_sync(&port->agg_wq);
+
+	spin_lock_irqsave(&port->agg_lock, flags);
+	if (port->agg_state == -EINPROGRESS) {
+		if (port->agg_skb) {
+			kfree_skb(port->agg_skb);
+			reset_aggr_params(port);
+		}
+
+		port->agg_state = 0;
+	}
+
+	spin_unlock_irqrestore(&port->agg_lock, flags);
+}
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5e7a1041df3a..09a30e2b29b1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1351,6 +1351,7 @@  enum {
 #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
 #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
 #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
+#define RMNET_FLAGS_EGRESS_AGGREGATION            (1U << 6)
 
 enum {
 	IFLA_RMNET_UNSPEC,