Message ID | 20221109180249.4721-3-dnlplm@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Series | add tx packets aggregation to ethtool and rmnet |
From: Daniele Palmas <dnlplm@gmail.com> Date: Wed, 9 Nov 2022 19:02:48 +0100 > Bidirectional TCP throughput tests through iperf with low-cat > Thread-x based modems showed performance issues both in tx > and rx. > > The Windows driver does not show this issue: inspecting USB > packets revealed that the only notable change is the driver > enabling tx packets aggregation. > > Tx packets aggregation, by default disabled, requires flag > RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command). > > The maximum number of aggregated packets and the maximum aggregated > size are by default set to reasonably low values in order to support > the majority of modems. > > This implementation is based on patches available in Code Aurora > repositories (msm kernel) whose main authors are > > Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> > Sean Tranchetti <stranche@codeaurora.org> > > Signed-off-by: Daniele Palmas <dnlplm@gmail.com> > --- > .../ethernet/qualcomm/rmnet/rmnet_config.c | 5 + > .../ethernet/qualcomm/rmnet/rmnet_config.h | 19 ++ > .../ethernet/qualcomm/rmnet/rmnet_handlers.c | 25 ++- > .../net/ethernet/qualcomm/rmnet/rmnet_map.h | 7 + > .../ethernet/qualcomm/rmnet/rmnet_map_data.c | 196 ++++++++++++++++++ > include/uapi/linux/if_link.h | 1 + > 6 files changed, 251 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c > index 27b1663c476e..39d24e07f306 100644 > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c > @@ -12,6 +12,7 @@ > #include "rmnet_handlers.h" > #include "rmnet_vnd.h" > #include "rmnet_private.h" > +#include "rmnet_map.h" > > /* Local Definitions and Declarations */ > > @@ -39,6 +40,8 @@ static int rmnet_unregister_real_device(struct net_device *real_dev) > if (port->nr_rmnet_devs) > return -EINVAL; > > + rmnet_map_tx_aggregate_exit(port); > + > netdev_rx_handler_unregister(real_dev); > > kfree(port); > @@ -79,6 +82,8 @@ static int rmnet_register_real_device(struct net_device *real_dev, > for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++) > INIT_HLIST_HEAD(&port->muxed_ep[entry]); > > + rmnet_map_tx_aggregate_init(port); > + > netdev_dbg(real_dev, "registered with rmnet\n"); > return 0; > } > diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h > index 3d3cba56c516..d341df78e411 100644 > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h > @@ -6,6 +6,7 @@ > */ > > #include <linux/skbuff.h> > +#include <linux/time.h> > #include <net/gro_cells.h> > > #ifndef _RMNET_CONFIG_H_ > @@ -19,6 +20,12 @@ struct rmnet_endpoint { > struct hlist_node hlnode; > }; > > +struct rmnet_egress_agg_params { > + u16 agg_size; skbs can now be way longer than 64 Kb. > + u16 agg_count; > + u64 agg_time_nsec; > +}; > + > /* One instance of this structure is instantiated for each real_dev associated > * with rmnet. > */ [...] > @@ -518,3 +519,198 @@ int rmnet_map_process_next_hdr_packet(struct sk_buff *skb, > > return 0; > } > + > +long rmnet_agg_bypass_time __read_mostly = 10000L * NSEC_PER_USEC; Why __read_mostly if you don't change it anywhere? Could be const. Why here if you use it only in one file? Could be static there. Why variable if it could be a definition? 
> + > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb) > +{ > + bool is_icmp = 0; > + > + if (skb->protocol == htons(ETH_P_IP)) { > + struct iphdr *ip4h = ip_hdr(skb); [...] > +static void rmnet_map_flush_tx_packet_work(struct work_struct *work) > +{ > + struct sk_buff *skb = NULL; > + struct rmnet_port *port; > + unsigned long flags; > + > + port = container_of(work, struct rmnet_port, agg_wq); > + > + spin_lock_irqsave(&port->agg_lock, flags); I don't see aggregation fields used in any hardware interrupt handlers, are you sure you need _irq*(), not _bh()? > + if (likely(port->agg_state == -EINPROGRESS)) { > + /* Buffer may have already been shipped out */ > + if (likely(port->agg_skb)) { > + skb = port->agg_skb; > + reset_aggr_params(port); > + } > + port->agg_state = 0; > + } > + > + spin_unlock_irqrestore(&port->agg_lock, flags); > + if (skb) > + dev_queue_xmit(skb); > +} > + > +enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t) > +{ > + struct rmnet_port *port; > + > + port = container_of(t, struct rmnet_port, hrtimer); > + > + schedule_work(&port->agg_wq); > + > + return HRTIMER_NORESTART; > +} > + > +void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port, > + struct net_device *orig_dev) > +{ > + struct timespec64 diff, last; > + int size = 0; RCT style? > + struct sk_buff *agg_skb; > + unsigned long flags; > + > +new_packet: > + spin_lock_irqsave(&port->agg_lock, flags); > + memcpy(&last, &port->agg_last, sizeof(struct timespec64)); > + ktime_get_real_ts64(&port->agg_last); > + > + if (!port->agg_skb) { > + /* Check to see if we should agg first. If the traffic is very > + * sparse, don't aggregate. > + */ > + diff = timespec64_sub(port->agg_last, last); > + size = port->egress_agg_params.agg_size - skb->len; > + > + if (size < 0) { > + struct rmnet_priv *priv; > + > + /* dropped */ So if a packet is smaller than the aggregation threshold, you just drop it? Why, if you could just let it go the "standard" way, like ICMP does? > + dev_kfree_skb_any(skb); > + spin_unlock_irqrestore(&port->agg_lock, flags); You could release this lock a line above, so that dev_kfree_skb_any() wouldn't be called in the HWIRQ context and postpone skb freeing via the softnet queue. > + priv = netdev_priv(orig_dev); > + this_cpu_inc(priv->pcpu_stats->stats.tx_drops); > + > + return; > + } > + > + if (diff.tv_sec > 0 || diff.tv_nsec > rmnet_agg_bypass_time || > + size == 0) { > + spin_unlock_irqrestore(&port->agg_lock, flags); > + skb->protocol = htons(ETH_P_MAP); > + dev_queue_xmit(skb); > + return; > + } > + > + port->agg_skb = skb_copy_expand(skb, 0, size, GFP_ATOMIC); You could use skb_cow_head(skb, 0) and skip allocating a new skb if the current one is writable, which usually is the case. 
> + if (!port->agg_skb) { > + reset_aggr_params(port); > + spin_unlock_irqrestore(&port->agg_lock, flags); > + skb->protocol = htons(ETH_P_MAP); > + dev_queue_xmit(skb); > + return; > + } > + port->agg_skb->protocol = htons(ETH_P_MAP); > + port->agg_count = 1; > + ktime_get_real_ts64(&port->agg_time); > + dev_kfree_skb_any(skb); > + goto schedule; > + } > + diff = timespec64_sub(port->agg_last, port->agg_time); > + size = port->egress_agg_params.agg_size - port->agg_skb->len; > + > + if (skb->len > size || > + diff.tv_sec > 0 || diff.tv_nsec > port->egress_agg_params.agg_time_nsec) { > + agg_skb = port->agg_skb; > + reset_aggr_params(port); > + spin_unlock_irqrestore(&port->agg_lock, flags); > + hrtimer_cancel(&port->hrtimer); > + dev_queue_xmit(agg_skb); > + goto new_packet; > + } > + > + skb_put_data(port->agg_skb, skb->data, skb->len); IIUC, RMNet netdevs support %NETIF_F_SG. Which means you could just attach skb data as frags to the first skb in the aggregation session instead of copying the data all the time. ...or even add %NETIF_F_FRAGLIST handling, that would save even more -- just use skb->frag_list once you run out of skb_shinfo()->frags. > + port->agg_count++; > + dev_kfree_skb_any(skb); > + > + if (port->agg_count == port->egress_agg_params.agg_count || > + port->agg_skb->len == port->egress_agg_params.agg_size) { I think ::agg_count and ::agg_size are the thresholds, so the comparison should be >= I guess (especially ::agg_size which gets increased by a random value each time, not by 1)? > + agg_skb = port->agg_skb; > + reset_aggr_params(port); > + spin_unlock_irqrestore(&port->agg_lock, flags); > + hrtimer_cancel(&port->hrtimer); > + dev_queue_xmit(agg_skb); > + return; > + } > + > +schedule: > + if (!hrtimer_active(&port->hrtimer) && port->agg_state != -EINPROGRESS) { > + port->agg_state = -EINPROGRESS; > + hrtimer_start(&port->hrtimer, > + ns_to_ktime(port->egress_agg_params.agg_time_nsec), > + HRTIMER_MODE_REL); > + } > + spin_unlock_irqrestore(&port->agg_lock, flags); > +} > + > +void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size, > + u16 count, u32 time) > +{ > + unsigned long irq_flags; > + > + spin_lock_irqsave(&port->agg_lock, irq_flags); > + port->egress_agg_params.agg_size = size; > + port->egress_agg_params.agg_count = count; > + port->egress_agg_params.agg_time_nsec = time * NSEC_PER_USEC; > + spin_unlock_irqrestore(&port->agg_lock, irq_flags); > +} > + > +void rmnet_map_tx_aggregate_init(struct rmnet_port *port) > +{ > + hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > + port->hrtimer.function = rmnet_map_flush_tx_packet_queue; > + spin_lock_init(&port->agg_lock); > + rmnet_map_update_ul_agg_config(port, 4096, 16, 800); > + INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work); > +} > + > +void rmnet_map_tx_aggregate_exit(struct rmnet_port *port) > +{ > + unsigned long flags; > + > + hrtimer_cancel(&port->hrtimer); > + cancel_work_sync(&port->agg_wq); > + > + spin_lock_irqsave(&port->agg_lock, flags); > + if (port->agg_state == -EINPROGRESS) { > + if (port->agg_skb) { > + kfree_skb(port->agg_skb); > + reset_aggr_params(port); > + } > + > + port->agg_state = 0; > + } > + > + spin_unlock_irqrestore(&port->agg_lock, flags); > +} Do I get the whole logics correctly, you allocate a new big skb and just copy several frames into it, then send as one chunk once its size reaches the threshold? Plus linearize every skb to be able to do that... 
That's too much of overhead I'd say, just handle S/G and fraglists and make long trains of frags from them without copying anything? Also BQL/DQL already does some sort of aggregation via ::xmit_more, doesn't it? Do you have any performance numbers? > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index 5e7a1041df3a..09a30e2b29b1 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -1351,6 +1351,7 @@ enum { > #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) > #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) > #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) > +#define RMNET_FLAGS_EGRESS_AGGREGATION (1U << 6) But you could rely on the aggregation parameters passed via Ethtool to decide whether to enable aggregation or not. If any of them is 0, it means the aggregation needs to be disabled. Otherwise, to enable it you need to use 2 utilities: the one that creates RMNet devices at first and Ethtool after, isn't it too complicated for no reason? > > enum { > IFLA_RMNET_UNSPEC, > -- > 2.37.1 Thanks, Olek
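To make the S/G / fraglist idea from the review above more concrete, here is a minimal hedged sketch of chaining packets on the head skb instead of copying them; rmnet_agg_chain() and the tail argument are hypothetical names, and MAP framing, locking and the NETIF_F_FRAGLIST capability check are deliberately left out:

	#include <linux/skbuff.h>

	/* Append @skb to the aggregate headed by @head without copying its
	 * payload; the chained skbs are freed together with @head, so no
	 * separate bookkeeping of the originals is needed.
	 */
	static void rmnet_agg_chain(struct sk_buff *head, struct sk_buff **tail,
				    struct sk_buff *skb)
	{
		if (!skb_shinfo(head)->frag_list)
			skb_shinfo(head)->frag_list = skb;	/* first chained packet */
		else
			(*tail)->next = skb;			/* extend the chain */
		*tail = skb;

		/* keep the head's accounting consistent with the chained data */
		head->len      += skb->len;
		head->data_len += skb->len;
		head->truesize += skb->truesize;
	}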
Hi Daniele, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on net-next/master] url: https://github.com/intel-lab-lkp/linux/commits/Daniele-Palmas/add-tx-packets-aggregation-to-ethtool-and-rmnet/20221110-021152 patch link: https://lore.kernel.org/r/20221109180249.4721-3-dnlplm%40gmail.com patch subject: [PATCH net-next 2/3] net: qualcomm: rmnet: add tx packets aggregation config: ia64-allyesconfig compiler: ia64-linux-gcc (GCC) 12.1.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/ab24bdaf594299d926240cfc1b8e78fa7e9af3b4 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Daniele-Palmas/add-tx-packets-aggregation-to-ethtool-and-rmnet/20221110-021152 git checkout ab24bdaf594299d926240cfc1b8e78fa7e9af3b4 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=ia64 SHELL=/bin/bash drivers/net/ethernet/qualcomm/rmnet/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> All warnings (new ones prefixed by >>): >> drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:575:22: warning: no previous prototype for 'rmnet_map_flush_tx_packet_queue' [-Wmissing-prototypes] 575 | enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vim +/rmnet_map_flush_tx_packet_queue +575 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c 574 > 575 enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t) 576 { 577 struct rmnet_port *port; 578 579 port = container_of(t, struct rmnet_port, hrtimer); 580 581 schedule_work(&port->agg_wq); 582 583 return HRTIMER_NORESTART; 584 } 585
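One way to address the warning, assuming the callback is only referenced from rmnet_map_data.c (where rmnet_map_tx_aggregate_init() assigns it), is to make it file-local instead of adding a prototype:

	static enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
	{
		struct rmnet_port *port = container_of(t, struct rmnet_port, hrtimer);

		schedule_work(&port->agg_wq);

		return HRTIMER_NORESTART;
	}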
On 11/10/2022 10:32 AM, Alexander Lobakin wrote: > From: Daniele Palmas <dnlplm@gmail.com> > Date: Wed, 9 Nov 2022 19:02:48 +0100 > >> Bidirectional TCP throughput tests through iperf with low-cat >> Thread-x based modems showed performance issues both in tx >> and rx. >> >> The Windows driver does not show this issue: inspecting USB >> packets revealed that the only notable change is the driver >> enabling tx packets aggregation. >> >> Tx packets aggregation, by default disabled, requires flag >> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command). >> >> The maximum number of aggregated packets and the maximum aggregated >> size are by default set to reasonably low values in order to support >> the majority of modems. >> >> This implementation is based on patches available in Code Aurora >> repositories (msm kernel) whose main authors are >> >> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> >> Sean Tranchetti <stranche@codeaurora.org> >> >> Signed-off-by: Daniele Palmas <dnlplm@gmail.com> >> --- >> >> +struct rmnet_egress_agg_params { >> + u16 agg_size; > > skbs can now be way longer than 64 Kb. > rmnet devices generally use a standard MTU (around 1500) size. Would it still be possible for >64kb to be generated as no relevant hw_features is set for large transmit offloads. Alternatively, are you referring to injection of packets explicitly, say via packet sockets. >> + u16 agg_count; >> + u64 agg_time_nsec; >> +}; >> + > Do I get the whole logics correctly, you allocate a new big skb and > just copy several frames into it, then send as one chunk once its > size reaches the threshold? Plus linearize every skb to be able to > do that... That's too much of overhead I'd say, just handle S/G and > fraglists and make long trains of frags from them without copying > anything? Also BQL/DQL already does some sort of aggregation via > ::xmit_more, doesn't it? Do you have any performance numbers? The difference here is that hardware would use a single descriptor for aggregation vs multiple descriptors for scatter gather. I wonder if this issue is related to pacing though. Daniele, perhaps you can try this hack without enabling EGRESS AGGREGATION and check if you are able to reach the same level of performance for your scenario. --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb) struct rmnet_priv *priv; u8 mux_id; - sk_pacing_shift_update(skb->sk, 8); + skb_orphan(skb); orig_dev = skb->dev; priv = netdev_priv(orig_dev); > >> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h >> index 5e7a1041df3a..09a30e2b29b1 100644 >> --- a/include/uapi/linux/if_link.h >> +++ b/include/uapi/linux/if_link.h >> @@ -1351,6 +1351,7 @@ enum { >> #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) >> #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) >> #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) >> +#define RMNET_FLAGS_EGRESS_AGGREGATION (1U << 6) > > But you could rely on the aggregation parameters passed via Ethtool > to decide whether to enable aggregation or not. If any of them is 0, > it means the aggregation needs to be disabled. > Otherwise, to enable it you need to use 2 utilities: the one that > creates RMNet devices at first and Ethtool after, isn't it too > complicated for no reason? Yes, the EGRESS AGGREGATION parameters can be added as part of the rmnet netlink policies. 
> >> >> enum { >> IFLA_RMNET_UNSPEC, >> -- >> 2.37.1 > > Thanks, > Olek
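As an illustration of the "rmnet netlink policies" remark above, a rough sketch of what additional link attributes might look like; the IFLA_RMNET_AGG_* names and their meaning are hypothetical and not part of the current uAPI:

	enum {
		IFLA_RMNET_UNSPEC,
		IFLA_RMNET_MUX_ID,
		IFLA_RMNET_FLAGS,
		IFLA_RMNET_AGG_COUNT,	/* hypothetical: max packets per aggregate */
		IFLA_RMNET_AGG_SIZE,	/* hypothetical: max bytes per aggregate */
		__IFLA_RMNET_MAX,
	};
	#define IFLA_RMNET_MAX	(__IFLA_RMNET_MAX - 1)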
On Thu, 10 Nov 2022 18:17:09 -0700 Subash Abhinov Kasiviswanathan (KS) wrote: > The difference here is that hardware would use a single descriptor for > aggregation vs multiple descriptors for scatter gather. > > I wonder if this issue is related to pacing though. > Daniele, perhaps you can try this hack without enabling EGRESS > AGGREGATION and check if you are able to reach the same level of > performance for your scenario. > > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c > @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb) > struct rmnet_priv *priv; > u8 mux_id; > > - sk_pacing_shift_update(skb->sk, 8); > + skb_orphan(skb); > > orig_dev = skb->dev; > priv = netdev_priv(orig_dev); The pacing shift update is much cleaner than orphaning packets, IMHO. And the aggregation should hold onto the original skbs (a list?) and free them once the aggregate was transmitted.
On Wed, 9 Nov 2022 19:02:48 +0100 Daniele Palmas wrote: > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb) > +{ > + bool is_icmp = 0; > + > + if (skb->protocol == htons(ETH_P_IP)) { > + struct iphdr *ip4h = ip_hdr(skb); > + > + if (ip4h->protocol == IPPROTO_ICMP) > + is_icmp = true; > + } else if (skb->protocol == htons(ETH_P_IPV6)) { > + unsigned int icmp_offset = 0; > + > + if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6) > + is_icmp = true; > + } > + > + return is_icmp; > +} Why this? I don't see it mention in the commit message or any code comment.
Hi Alexander, Il giorno gio 10 nov 2022 alle ore 18:35 Alexander Lobakin <alexandr.lobakin@intel.com> ha scritto: > > From: Daniele Palmas <dnlplm@gmail.com> > Date: Wed, 9 Nov 2022 19:02:48 +0100 > > > Bidirectional TCP throughput tests through iperf with low-cat > > Thread-x based modems showed performance issues both in tx > > and rx. > > > > The Windows driver does not show this issue: inspecting USB > > packets revealed that the only notable change is the driver > > enabling tx packets aggregation. > > > > Tx packets aggregation, by default disabled, requires flag > > RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command). > > > > The maximum number of aggregated packets and the maximum aggregated > > size are by default set to reasonably low values in order to support > > the majority of modems. > > > > This implementation is based on patches available in Code Aurora > > repositories (msm kernel) whose main authors are > > > > Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> > > Sean Tranchetti <stranche@codeaurora.org> > > > > Signed-off-by: Daniele Palmas <dnlplm@gmail.com> > > --- > > .../ethernet/qualcomm/rmnet/rmnet_config.c | 5 + > > .../ethernet/qualcomm/rmnet/rmnet_config.h | 19 ++ > > .../ethernet/qualcomm/rmnet/rmnet_handlers.c | 25 ++- > > .../net/ethernet/qualcomm/rmnet/rmnet_map.h | 7 + > > .../ethernet/qualcomm/rmnet/rmnet_map_data.c | 196 ++++++++++++++++++ > > include/uapi/linux/if_link.h | 1 + > > 6 files changed, 251 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c > > index 27b1663c476e..39d24e07f306 100644 > > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c > > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c > > @@ -12,6 +12,7 @@ > > #include "rmnet_handlers.h" > > #include "rmnet_vnd.h" > > #include "rmnet_private.h" > > +#include "rmnet_map.h" > > > > /* Local Definitions and Declarations */ > > > > @@ -39,6 +40,8 @@ static int rmnet_unregister_real_device(struct net_device *real_dev) > > if (port->nr_rmnet_devs) > > return -EINVAL; > > > > + rmnet_map_tx_aggregate_exit(port); > > + > > netdev_rx_handler_unregister(real_dev); > > > > kfree(port); > > @@ -79,6 +82,8 @@ static int rmnet_register_real_device(struct net_device *real_dev, > > for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++) > > INIT_HLIST_HEAD(&port->muxed_ep[entry]); > > > > + rmnet_map_tx_aggregate_init(port); > > + > > netdev_dbg(real_dev, "registered with rmnet\n"); > > return 0; > > } > > diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h > > index 3d3cba56c516..d341df78e411 100644 > > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h > > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h > > @@ -6,6 +6,7 @@ > > */ > > > > #include <linux/skbuff.h> > > +#include <linux/time.h> > > #include <net/gro_cells.h> > > > > #ifndef _RMNET_CONFIG_H_ > > @@ -19,6 +20,12 @@ struct rmnet_endpoint { > > struct hlist_node hlnode; > > }; > > > > +struct rmnet_egress_agg_params { > > + u16 agg_size; > > skbs can now be way longer than 64 Kb. > OK, I can use u32. For further information, I would like to explain where u16 is coming from. The value of agg_size is returned by the modem through a qmi request (wda set data format): so far, all the modem I've been able to test have less than 64KB as the maximum size for the aggregated packets block, so I thought u16 was enough. 
But, to be honest, I can't exclude that there are modems where this value is > 64KB, so I think it could make sense using u32. > > + u16 agg_count; > > + u64 agg_time_nsec; > > +}; > > + > > /* One instance of this structure is instantiated for each real_dev associated > > * with rmnet. > > */ > > [...] > > > @@ -518,3 +519,198 @@ int rmnet_map_process_next_hdr_packet(struct sk_buff *skb, > > > > return 0; > > } > > + > > +long rmnet_agg_bypass_time __read_mostly = 10000L * NSEC_PER_USEC; > > Why __read_mostly if you don't change it anywhere? Could be const. > Why here if you use it only in one file? Could be static there. > Why variable if it could be a definition? > All your remarks make sense. > > + > > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb) > > +{ > > + bool is_icmp = 0; > > + > > + if (skb->protocol == htons(ETH_P_IP)) { > > + struct iphdr *ip4h = ip_hdr(skb); > > [...] > > > +static void rmnet_map_flush_tx_packet_work(struct work_struct *work) > > +{ > > + struct sk_buff *skb = NULL; > > + struct rmnet_port *port; > > + unsigned long flags; > > + > > + port = container_of(work, struct rmnet_port, agg_wq); > > + > > + spin_lock_irqsave(&port->agg_lock, flags); > > I don't see aggregation fields used in any hardware interrupt > handlers, are you sure you need _irq*(), not _bh()? > I think you are right. > > + if (likely(port->agg_state == -EINPROGRESS)) { > > + /* Buffer may have already been shipped out */ > > + if (likely(port->agg_skb)) { > > + skb = port->agg_skb; > > + reset_aggr_params(port); > > + } > > + port->agg_state = 0; > > + } > > + > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > + if (skb) > > + dev_queue_xmit(skb); > > +} > > + > > +enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t) > > +{ > > + struct rmnet_port *port; > > + > > + port = container_of(t, struct rmnet_port, hrtimer); > > + > > + schedule_work(&port->agg_wq); > > + > > + return HRTIMER_NORESTART; > > +} > > + > > +void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port, > > + struct net_device *orig_dev) > > +{ > > + struct timespec64 diff, last; > > + int size = 0; > > RCT style? > Ack. > > + struct sk_buff *agg_skb; > > + unsigned long flags; > > + > > +new_packet: > > + spin_lock_irqsave(&port->agg_lock, flags); > > + memcpy(&last, &port->agg_last, sizeof(struct timespec64)); > > + ktime_get_real_ts64(&port->agg_last); > > + > > + if (!port->agg_skb) { > > + /* Check to see if we should agg first. If the traffic is very > > + * sparse, don't aggregate. > > + */ > > + diff = timespec64_sub(port->agg_last, last); > > + size = port->egress_agg_params.agg_size - skb->len; > > + > > + if (size < 0) { > > + struct rmnet_priv *priv; > > + > > + /* dropped */ > > So if a packet is smaller than the aggregation threshold, you just > drop it? Why, if you could just let it go the "standard" way, like > ICMP does? > If skb->len is bigger than the maximum possible aggregated block size, then the packet is dropped: this is because with some modems, sending a block that is bigger than the maximum supported size causes the modem to crash. > > + dev_kfree_skb_any(skb); > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > You could release this lock a line above, so that > dev_kfree_skb_any() wouldn't be called in the HWIRQ context and > postpone skb freeing via the softnet queue. > Ack. 
> > + priv = netdev_priv(orig_dev); > > + this_cpu_inc(priv->pcpu_stats->stats.tx_drops); > > + > > + return; > > + } > > + > > + if (diff.tv_sec > 0 || diff.tv_nsec > rmnet_agg_bypass_time || > > + size == 0) { > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > + skb->protocol = htons(ETH_P_MAP); > > + dev_queue_xmit(skb); > > + return; > > + } > > + > > + port->agg_skb = skb_copy_expand(skb, 0, size, GFP_ATOMIC); > > You could use skb_cow_head(skb, 0) and skip allocating a new skb if > the current one is writable, which usually is the case. > Ok, I will look at how to do that. > > + if (!port->agg_skb) { > > + reset_aggr_params(port); > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > + skb->protocol = htons(ETH_P_MAP); > > + dev_queue_xmit(skb); > > + return; > > + } > > + port->agg_skb->protocol = htons(ETH_P_MAP); > > + port->agg_count = 1; > > + ktime_get_real_ts64(&port->agg_time); > > + dev_kfree_skb_any(skb); > > + goto schedule; > > + } > > + diff = timespec64_sub(port->agg_last, port->agg_time); > > + size = port->egress_agg_params.agg_size - port->agg_skb->len; > > + > > + if (skb->len > size || > > + diff.tv_sec > 0 || diff.tv_nsec > port->egress_agg_params.agg_time_nsec) { > > + agg_skb = port->agg_skb; > > + reset_aggr_params(port); > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > + hrtimer_cancel(&port->hrtimer); > > + dev_queue_xmit(agg_skb); > > + goto new_packet; > > + } > > + > > + skb_put_data(port->agg_skb, skb->data, skb->len); > > IIUC, RMNet netdevs support %NETIF_F_SG. Which means you could just > attach skb data as frags to the first skb in the aggregation > session instead of copying the data all the time. > ...or even add %NETIF_F_FRAGLIST handling, that would save even more > -- just use skb->frag_list once you run out of skb_shinfo()->frags. > I think I get the general idea behind your comment, but honestly I need to study and work on this a bit more, since I'm missing specific knowledge on the things you are mentioning. > > + port->agg_count++; > > + dev_kfree_skb_any(skb); > > + > > + if (port->agg_count == port->egress_agg_params.agg_count || > > + port->agg_skb->len == port->egress_agg_params.agg_size) { > > I think ::agg_count and ::agg_size are the thresholds, so the > comparison should be >= I guess (especially ::agg_size which gets > increased by a random value each time, not by 1)? > Sorry, I'm probably missing something. port->egress_agg_params.agg_count and port->egress_agg_params.agg_size are supposed to stay fixed throughout the life of the device since they are configured usually during modem initialization with the values returned by qmi wda set data format request. The number of packets in an aggregated block sent to the modem can't be > port->egress_agg_params.agg_count and the size of the aggregated block can't be > port->egress_agg_params.agg_size, otherwise the modem could crash. The actual status of the aggregated block is stored in port->agg_count and port->agg_skb->len. Due to the reason above (modem crashing) the code should not arrive in a situation in which port->agg_skb->len > port->egress_agg_params.agg_size or port->agg_count > port->egress_agg_params.agg_count, besides possible bug I could have done. I think I don't understand this sentence "especially ::agg_size which gets increased by a random value each time, not by 1". 
> > + agg_skb = port->agg_skb; > > + reset_aggr_params(port); > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > + hrtimer_cancel(&port->hrtimer); > > + dev_queue_xmit(agg_skb); > > + return; > > + } > > + > > +schedule: > > + if (!hrtimer_active(&port->hrtimer) && port->agg_state != -EINPROGRESS) { > > + port->agg_state = -EINPROGRESS; > > + hrtimer_start(&port->hrtimer, > > + ns_to_ktime(port->egress_agg_params.agg_time_nsec), > > + HRTIMER_MODE_REL); > > + } > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > +} > > + > > +void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size, > > + u16 count, u32 time) > > +{ > > + unsigned long irq_flags; > > + > > + spin_lock_irqsave(&port->agg_lock, irq_flags); > > + port->egress_agg_params.agg_size = size; > > + port->egress_agg_params.agg_count = count; > > + port->egress_agg_params.agg_time_nsec = time * NSEC_PER_USEC; > > + spin_unlock_irqrestore(&port->agg_lock, irq_flags); > > +} > > + > > +void rmnet_map_tx_aggregate_init(struct rmnet_port *port) > > +{ > > + hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); > > + port->hrtimer.function = rmnet_map_flush_tx_packet_queue; > > + spin_lock_init(&port->agg_lock); > > + rmnet_map_update_ul_agg_config(port, 4096, 16, 800); > > + INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work); > > +} > > + > > +void rmnet_map_tx_aggregate_exit(struct rmnet_port *port) > > +{ > > + unsigned long flags; > > + > > + hrtimer_cancel(&port->hrtimer); > > + cancel_work_sync(&port->agg_wq); > > + > > + spin_lock_irqsave(&port->agg_lock, flags); > > + if (port->agg_state == -EINPROGRESS) { > > + if (port->agg_skb) { > > + kfree_skb(port->agg_skb); > > + reset_aggr_params(port); > > + } > > + > > + port->agg_state = 0; > > + } > > + > > + spin_unlock_irqrestore(&port->agg_lock, flags); > > +} > > Do I get the whole logics correctly, you allocate a new big skb and > just copy several frames into it, then send as one chunk once its > size reaches the threshold? Plus linearize every skb to be able to > do that... That's too much of overhead I'd say, just handle S/G and > fraglists and make long trains of frags from them without copying > anything? Yes, you get the logic right. And my reply is like above, I think I understand your comment at a high level, but I need to learn how to do that. > Also BQL/DQL already does some sort of aggregation via > ::xmit_more, doesn't it? Do you have any performance numbers? > I've mainly tested with iperf tcp with the cat. 4 modem in which the issue is showing and the driver is capable of reaching the limits of the modem (50Mbps), but I understand that it's quite low. 
I've tried testing also with a 5G modem, but I could only test with a custom udp loopback mode, no real network or network simulator, through iperf, something like: $ iperf --client 192.168.1.172 --udp --interval 10 --port 5001 --bandwidth 600M --parallel 3 --time 10 ------------------------------------------------------------ Client connecting to 192.168.1.172, UDP port 5001 Sending 1470 byte datagrams, IPG target: 18.69 us (kalman adjust) UDP buffer size: 208 KByte (default) ------------------------------------------------------------ [ 6] local 192.168.48.173 port 54765 connected with 192.168.1.172 port 5001 [ 3] local 192.168.48.173 port 41767 connected with 192.168.1.172 port 5001 [ 5] local 192.168.48.173 port 33300 connected with 192.168.1.172 port 5001 [ ID] Interval Transfer Bandwidth [ 6] 0.0-10.0 sec 750 MBytes 629 Mbits/sec [ 5] 0.0-10.0 sec 750 MBytes 629 Mbits/sec [ 5] 0.0-10.0 sec 750 MBytes 629 Mbits/sec [ 5] Sent 534987 datagrams [SUM] 0.0-10.0 sec 2.20 GBytes 1.89 Gbits/sec [SUM] Sent 1604960 datagrams [ 5] Server Report: [ 5] 0.0-10.2 sec 1.21 GBytes 1.02 Gbits/sec 23.271 ms 0/534988 (0%) [ 5] 0.0000-10.2199 sec 835183 datagrams received out-of-order [ 6] 0.0-10.0 sec 750 MBytes 629 Mbits/sec [ 6] Sent 534990 datagrams [ 6] Server Report: [ 6] 0.0-10.2 sec 1.20 GBytes 1.00 Gbits/sec 14.879 ms 0/534991 (0%) [ 6] 0.0000-10.2320 sec 818506 datagrams received out-of-order [ 3] 0.0-10.0 sec 750 MBytes 629 Mbits/sec [ 3] Sent 534978 datagrams [ 3] Server Report: [ 3] 0.0-10.2 sec 1.20 GBytes 1.01 Gbits/sec 11.631 ms 0/534979 (0%) [ 3] 0.0000-10.2320 sec 824594 datagrams received out-of-order Not sure how much meaningful those are, since I don't have a clear vision on how the modem implements the loopback. > > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > > index 5e7a1041df3a..09a30e2b29b1 100644 > > --- a/include/uapi/linux/if_link.h > > +++ b/include/uapi/linux/if_link.h > > @@ -1351,6 +1351,7 @@ enum { > > #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) > > #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) > > #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) > > +#define RMNET_FLAGS_EGRESS_AGGREGATION (1U << 6) > > But you could rely on the aggregation parameters passed via Ethtool > to decide whether to enable aggregation or not. If any of them is 0, > it means the aggregation needs to be disabled. > Otherwise, to enable it you need to use 2 utilities: the one that > creates RMNet devices at first and Ethtool after, isn't it too > complicated for no reason? > The rationale behind this was to keep a conservative approach by leaving the tx aggregation disabled by default and at the same time provide default values for the aggregation parameters. But I agree with you that it is redundant, I can manage that with just the value of the egress_agg_params.agg_count (1 = no aggregation). Thanks for your time and for the review. Thanks, Daniele > > > > enum { > > IFLA_RMNET_UNSPEC, > > -- > > 2.37.1 > > Thanks, > Olek
Hello Subash, Il giorno ven 11 nov 2022 alle ore 02:17 Subash Abhinov Kasiviswanathan (KS) <quic_subashab@quicinc.com> ha scritto: > > On 11/10/2022 10:32 AM, Alexander Lobakin wrote: > > From: Daniele Palmas <dnlplm@gmail.com> > > Date: Wed, 9 Nov 2022 19:02:48 +0100 > > > >> Bidirectional TCP throughput tests through iperf with low-cat > >> Thread-x based modems showed performance issues both in tx > >> and rx. > >> > >> The Windows driver does not show this issue: inspecting USB > >> packets revealed that the only notable change is the driver > >> enabling tx packets aggregation. > >> > >> Tx packets aggregation, by default disabled, requires flag > >> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command). > >> > >> The maximum number of aggregated packets and the maximum aggregated > >> size are by default set to reasonably low values in order to support > >> the majority of modems. > >> > >> This implementation is based on patches available in Code Aurora > >> repositories (msm kernel) whose main authors are > >> > >> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> > >> Sean Tranchetti <stranche@codeaurora.org> > >> > >> Signed-off-by: Daniele Palmas <dnlplm@gmail.com> > >> --- > >> > >> +struct rmnet_egress_agg_params { > >> + u16 agg_size; > > > > skbs can now be way longer than 64 Kb. > > > > rmnet devices generally use a standard MTU (around 1500) size. > Would it still be possible for >64kb to be generated as no relevant > hw_features is set for large transmit offloads. > Alternatively, are you referring to injection of packets explicitly, say > via packet sockets. > > >> + u16 agg_count; > >> + u64 agg_time_nsec; > >> +}; > >> + > > Do I get the whole logics correctly, you allocate a new big skb and > > just copy several frames into it, then send as one chunk once its > > size reaches the threshold? Plus linearize every skb to be able to > > do that... That's too much of overhead I'd say, just handle S/G and > > fraglists and make long trains of frags from them without copying > > anything? Also BQL/DQL already does some sort of aggregation via > > ::xmit_more, doesn't it? Do you have any performance numbers? > > The difference here is that hardware would use a single descriptor for > aggregation vs multiple descriptors for scatter gather. > > I wonder if this issue is related to pacing though. > Daniele, perhaps you can try this hack without enabling EGRESS > AGGREGATION and check if you are able to reach the same level of > performance for your scenario. > > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c > @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb) > struct rmnet_priv *priv; > u8 mux_id; > > - sk_pacing_shift_update(skb->sk, 8); > + skb_orphan(skb); > > orig_dev = skb->dev; > priv = netdev_priv(orig_dev); > Sure, I'll test that on Monday. > > > >> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > >> index 5e7a1041df3a..09a30e2b29b1 100644 > >> --- a/include/uapi/linux/if_link.h > >> +++ b/include/uapi/linux/if_link.h > >> @@ -1351,6 +1351,7 @@ enum { > >> #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) > >> #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) > >> #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) > >> +#define RMNET_FLAGS_EGRESS_AGGREGATION (1U << 6) > > > > But you could rely on the aggregation parameters passed via Ethtool > > to decide whether to enable aggregation or not. 
If any of them is 0, > > it means the aggregation needs to be disabled. > > Otherwise, to enable it you need to use 2 utilities: the one that > > creates RMNet devices at first and Ethtool after, isn't it too > > complicated for no reason? > > Yes, the EGRESS AGGREGATION parameters can be added as part of the rmnet > netlink policies. > Ack. Thanks, Daniele > > > >> > >> enum { > >> IFLA_RMNET_UNSPEC, > >> -- > >> 2.37.1 > > > > Thanks, > > Olek
Hello Subash, Il giorno ven 11 nov 2022 alle ore 23:00 Daniele Palmas <dnlplm@gmail.com> ha scritto: > > Hello Subash, > > Il giorno ven 11 nov 2022 alle ore 02:17 Subash Abhinov > Kasiviswanathan (KS) <quic_subashab@quicinc.com> ha scritto: > > > > On 11/10/2022 10:32 AM, Alexander Lobakin wrote: > > > From: Daniele Palmas <dnlplm@gmail.com> > > > Date: Wed, 9 Nov 2022 19:02:48 +0100 > > > > > >> Bidirectional TCP throughput tests through iperf with low-cat > > >> Thread-x based modems showed performance issues both in tx > > >> and rx. > > >> > > >> The Windows driver does not show this issue: inspecting USB > > >> packets revealed that the only notable change is the driver > > >> enabling tx packets aggregation. > > >> > > >> Tx packets aggregation, by default disabled, requires flag > > >> RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command). > > >> > > >> The maximum number of aggregated packets and the maximum aggregated > > >> size are by default set to reasonably low values in order to support > > >> the majority of modems. > > >> > > >> This implementation is based on patches available in Code Aurora > > >> repositories (msm kernel) whose main authors are > > >> > > >> Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> > > >> Sean Tranchetti <stranche@codeaurora.org> > > >> > > >> Signed-off-by: Daniele Palmas <dnlplm@gmail.com> > > >> --- > > >> > > >> +struct rmnet_egress_agg_params { > > >> + u16 agg_size; > > > > > > skbs can now be way longer than 64 Kb. > > > > > > > rmnet devices generally use a standard MTU (around 1500) size. > > Would it still be possible for >64kb to be generated as no relevant > > hw_features is set for large transmit offloads. > > Alternatively, are you referring to injection of packets explicitly, say > > via packet sockets. > > > > >> + u16 agg_count; > > >> + u64 agg_time_nsec; > > >> +}; > > >> + > > > Do I get the whole logics correctly, you allocate a new big skb and > > > just copy several frames into it, then send as one chunk once its > > > size reaches the threshold? Plus linearize every skb to be able to > > > do that... That's too much of overhead I'd say, just handle S/G and > > > fraglists and make long trains of frags from them without copying > > > anything? Also BQL/DQL already does some sort of aggregation via > > > ::xmit_more, doesn't it? Do you have any performance numbers? > > > > The difference here is that hardware would use a single descriptor for > > aggregation vs multiple descriptors for scatter gather. > > > > I wonder if this issue is related to pacing though. > > Daniele, perhaps you can try this hack without enabling EGRESS > > AGGREGATION and check if you are able to reach the same level of > > performance for your scenario. > > > > --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c > > +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c > > @@ -236,7 +236,7 @@ void rmnet_egress_handler(struct sk_buff *skb) > > struct rmnet_priv *priv; > > u8 mux_id; > > > > - sk_pacing_shift_update(skb->sk, 8); > > + skb_orphan(skb); > > > > orig_dev = skb->dev; > > priv = netdev_priv(orig_dev); > > > > Sure, I'll test that on Monday. > Hello Subash, This change does not have any effect. 
Start the tx test alone: $ sudo iperf3 -c 172.22.1.201 -t 60 -p 5001 -P 3 Connecting to host 172.22.1.201, port 5001 [ 5] local 172.22.1.102 port 45902 connected to 172.22.1.201 port 5001 [ 7] local 172.22.1.102 port 45908 connected to 172.22.1.201 port 5001 [ 9] local 172.22.1.102 port 45910 connected to 172.22.1.201 port 5001 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 1.90 MBytes 15.9 Mbits/sec 0 114 KBytes [ 7] 0.00-1.00 sec 1.07 MBytes 8.94 Mbits/sec 0 71.3 KBytes [ 9] 0.00-1.00 sec 5.02 MBytes 42.1 Mbits/sec 0 344 KBytes [SUM] 0.00-1.00 sec 7.98 MBytes 66.9 Mbits/sec 0 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 1.00-2.00 sec 1.30 MBytes 10.9 Mbits/sec 4 104 KBytes [ 7] 1.00-2.00 sec 949 KBytes 7.77 Mbits/sec 6 64.6 KBytes [ 9] 1.00-2.00 sec 3.71 MBytes 31.1 Mbits/sec 20 284 KBytes [SUM] 1.00-2.00 sec 5.93 MBytes 49.7 Mbits/sec 30 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 2.00-3.00 sec 1.24 MBytes 10.4 Mbits/sec 0 117 KBytes [ 7] 2.00-3.00 sec 759 KBytes 6.22 Mbits/sec 0 72.7 KBytes [ 9] 2.00-3.00 sec 3.71 MBytes 31.1 Mbits/sec 0 319 KBytes [SUM] 2.00-3.00 sec 5.68 MBytes 47.7 Mbits/sec 0 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 3.00-4.00 sec 1.85 MBytes 15.5 Mbits/sec 0 124 KBytes [ 7] 3.00-4.00 sec 949 KBytes 7.77 Mbits/sec 0 76.7 KBytes [ 9] 3.00-4.00 sec 3.09 MBytes 25.9 Mbits/sec 5 246 KBytes [SUM] 3.00-4.00 sec 5.87 MBytes 49.2 Mbits/sec 5 ... Start the rx test: $ sudo iperf3 -R -c 172.22.1.201 -t 30 -p 5002 Connecting to host 172.22.1.201, port 5002 Reverse mode, remote host 172.22.1.201 is sending [ 5] local 172.22.1.102 port 46496 connected to 172.22.1.201 port 5002 [ ID] Interval Transfer Bitrate [ 5] 0.00-1.00 sec 10.4 MBytes 87.4 Mbits/sec [ 5] 1.00-2.00 sec 12.7 MBytes 107 Mbits/sec [ 5] 2.00-3.00 sec 12.6 MBytes 106 Mbits/sec [ 5] 3.00-4.00 sec 12.6 MBytes 105 Mbits/sec [ 5] 4.00-5.00 sec 12.6 MBytes 106 Mbits/sec Checking the tx path: [ 5] 11.00-12.00 sec 632 KBytes 5.18 Mbits/sec 16 95.5 KBytes [ 7] 11.00-12.00 sec 506 KBytes 4.14 Mbits/sec 16 52.5 KBytes [ 9] 11.00-12.00 sec 759 KBytes 6.21 Mbits/sec 17 143 KBytes [SUM] 11.00-12.00 sec 1.85 MBytes 15.5 Mbits/sec 49 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 12.00-13.00 sec 316 KBytes 2.59 Mbits/sec 67 26.9 KBytes [ 7] 12.00-13.00 sec 253 KBytes 2.07 Mbits/sec 44 9.42 KBytes [ 9] 12.00-13.00 sec 759 KBytes 6.22 Mbits/sec 66 94.2 KBytes [SUM] 12.00-13.00 sec 1.30 MBytes 10.9 Mbits/sec 177 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 13.00-14.00 sec 316 KBytes 2.59 Mbits/sec 58 10.8 KBytes [ 7] 13.00-14.00 sec 253 KBytes 2.07 Mbits/sec 6 10.8 KBytes [ 9] 13.00-14.00 sec 0.00 Bytes 0.00 bits/sec 91 37.7 KBytes [SUM] 13.00-14.00 sec 569 KBytes 4.66 Mbits/sec 155 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 14.00-15.00 sec 316 KBytes 2.59 Mbits/sec 6 8.07 KBytes [ 7] 14.00-15.00 sec 253 KBytes 2.07 Mbits/sec 3 9.42 KBytes [ 9] 14.00-15.00 sec 0.00 Bytes 0.00 bits/sec 53 8.07 KBytes [SUM] 14.00-15.00 sec 569 KBytes 4.66 Mbits/sec 62 Stop the rx test, the tx becomes again: [ 5] 28.00-29.00 sec 1.85 MBytes 15.5 Mbits/sec 0 96.9 KBytes [ 7] 28.00-29.00 sec 1.98 MBytes 16.6 Mbits/sec 0 96.9 KBytes [ 9] 28.00-29.00 sec 2.22 MBytes 18.7 Mbits/sec 0 96.9 KBytes [SUM] 28.00-29.00 sec 6.05 MBytes 50.8 Mbits/sec 0 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 29.00-30.00 sec 2.16 MBytes 18.1 Mbits/sec 0 110 KBytes [ 7] 29.00-30.00 sec 1.98 MBytes 16.6 Mbits/sec 0 110 KBytes [ 9] 29.00-30.00 sec 1.48 MBytes 12.4 Mbits/sec 0 110 KBytes [SUM] 
29.00-30.00 sec 5.62 MBytes 47.1 Mbits/sec 0 - - - - - - - - - - - - - - - - - - - - - - - - - [ 5] 30.00-31.00 sec 1.85 MBytes 15.5 Mbits/sec 0 122 KBytes [ 7] 30.00-31.00 sec 1.85 MBytes 15.5 Mbits/sec 0 122 KBytes [ 9] 30.00-31.00 sec 2.22 MBytes 18.7 Mbits/sec 0 122 KBytes [SUM] 30.00-31.00 sec 5.93 MBytes 49.7 Mbits/sec 0 Just want to highlight also that, if I enable the tx aggregation, not only the above described behavior does not happen, but also the rx test alone is capable to reach the maximum throughput of the modem: $ sudo iperf3 -R -c 172.22.1.201 -t 30 -p 5002 Connecting to host 172.22.1.201, port 5002 Reverse mode, remote host 172.22.1.201 is sending [ 5] local 172.22.1.102 port 51422 connected to 172.22.1.201 port 5002 [ ID] Interval Transfer Bitrate [ 5] 0.00-1.00 sec 15.1 MBytes 126 Mbits/sec [ 5] 1.00-2.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 2.00-3.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 3.00-4.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 4.00-5.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 5.00-6.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 6.00-7.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 7.00-8.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 8.00-9.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 9.00-10.00 sec 17.2 MBytes 144 Mbits/sec [ 5] 10.00-11.00 sec 17.2 MBytes 144 Mbits/sec ^C[ 5] 11.00-11.50 sec 8.67 MBytes 144 Mbits/sec - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate [ 5] 0.00-11.50 sec 0.00 Bytes 0.00 bits/sec sender [ 5] 0.00-11.50 sec 195 MBytes 143 Mbits/sec receiver iperf3: interrupt - the client has terminated Thanks, Daniele > > > > > >> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > > >> index 5e7a1041df3a..09a30e2b29b1 100644 > > >> --- a/include/uapi/linux/if_link.h > > >> +++ b/include/uapi/linux/if_link.h > > >> @@ -1351,6 +1351,7 @@ enum { > > >> #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) > > >> #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) > > >> #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) > > >> +#define RMNET_FLAGS_EGRESS_AGGREGATION (1U << 6) > > > > > > But you could rely on the aggregation parameters passed via Ethtool > > > to decide whether to enable aggregation or not. If any of them is 0, > > > it means the aggregation needs to be disabled. > > > Otherwise, to enable it you need to use 2 utilities: the one that > > > creates RMNet devices at first and Ethtool after, isn't it too > > > complicated for no reason? > > > > Yes, the EGRESS AGGREGATION parameters can be added as part of the rmnet > > netlink policies. > > > > Ack. > > Thanks, > Daniele > > > > > > >> > > >> enum { > > >> IFLA_RMNET_UNSPEC, > > >> -- > > >> 2.37.1 > > > > > > Thanks, > > > Olek
Hello Jakub, Il giorno ven 11 nov 2022 alle ore 18:14 Jakub Kicinski <kuba@kernel.org> ha scritto: > > On Wed, 9 Nov 2022 19:02:48 +0100 Daniele Palmas wrote: > > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb) > > +{ > > + bool is_icmp = 0; > > + > > + if (skb->protocol == htons(ETH_P_IP)) { > > + struct iphdr *ip4h = ip_hdr(skb); > > + > > + if (ip4h->protocol == IPPROTO_ICMP) > > + is_icmp = true; > > + } else if (skb->protocol == htons(ETH_P_IPV6)) { > > + unsigned int icmp_offset = 0; > > + > > + if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6) > > + is_icmp = true; > > + } > > + > > + return is_icmp; > > +} > > Why this? I don't see it mention in the commit message or any code > comment. This is something I've found in downstream code: with my test setup and scenario it does not make any difference on the icmp packets timing (both with or without throughput tests ongoing), but I don't have access to all the systems for which rmnet is used. So, I'm not sure if it solves a real issue in other situations. I can move that out and me or someone else will add it again in case there will be a real issue to be solved. Thanks, Daniele
Daniele Palmas <dnlplm@gmail.com> writes: > Il giorno ven 11 nov 2022 alle ore 18:14 Jakub Kicinski > <kuba@kernel.org> ha scritto: >> >> On Wed, 9 Nov 2022 19:02:48 +0100 Daniele Palmas wrote: >> > +bool rmnet_map_tx_agg_skip(struct sk_buff *skb) >> > +{ >> > + bool is_icmp = 0; >> > + >> > + if (skb->protocol == htons(ETH_P_IP)) { >> > + struct iphdr *ip4h = ip_hdr(skb); >> > + >> > + if (ip4h->protocol == IPPROTO_ICMP) >> > + is_icmp = true; >> > + } else if (skb->protocol == htons(ETH_P_IPV6)) { >> > + unsigned int icmp_offset = 0; >> > + >> > + if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6) >> > + is_icmp = true; >> > + } >> > + >> > + return is_icmp; >> > +} >> >> Why this? I don't see it mention in the commit message or any code >> comment. > > This is something I've found in downstream code: with my test setup > and scenario it does not make any difference on the icmp packets > timing (both with or without throughput tests ongoing), but I don't > have access to all the systems for which rmnet is used. > > So, I'm not sure if it solves a real issue in other situations. > > I can move that out and me or someone else will add it again in case > there will be a real issue to be solved. It looks like an attempt to "cheat" latency measurements. I don't think we should do that. Aggregation may be necessary to achieve maximum throughput in a radio network, but has its obvious bufferbloat downside. Let's not hide that fact. Users deserve to know, and tune their systems accordingly. Things like this will only make that more difficult Bjørn
On Mon, 14 Nov 2022 11:25:32 +0100 Bjørn Mork wrote: > It looks like an attempt to "cheat" latency measurements. I don't think > we should do that. Aggregation may be necessary to achieve maximum > throughput in a radio network, but has its obvious bufferbloat downside. > Let's not hide that fact. Users deserve to know, and tune their systems > accordingly. Things like this will only make that more difficult
Hello Alexander, Il giorno gio 10 nov 2022 alle ore 18:35 Alexander Lobakin <alexandr.lobakin@intel.com> ha scritto: > > Do I get the whole logics correctly, you allocate a new big skb and > just copy several frames into it, then send as one chunk once its > size reaches the threshold? Plus linearize every skb to be able to > do that... That's too much of overhead I'd say, just handle S/G and > fraglists and make long trains of frags from them without copying > anything? sorry for my question, for sure I'm lacking knowledge about this, but I'm trying to understand how I can move forward. Suppose I'm able to build the aggregated block as a train of fragments, then I have to send it to the underlying netdevice that, in my scenario, is created by the qmi_wwan driver: I could be wrong, but my understanding is that it does not support fragments. And, as far as I know, there's only another driver in mainline used with rmnet (mhi_net) and that one also does not seem to support them either. Does this mean that once I have the aggregated block through fragments it should be converted to a single linear skb before sending? Thanks, Daniele
From: Daniele Palmas <dnlplm@gmail.com> Date: Wed, 16 Nov 2022 16:19:48 +0100 > Hello Alexander, > > Il giorno gio 10 nov 2022 alle ore 18:35 Alexander Lobakin > <alexandr.lobakin@intel.com> ha scritto: > > > > Do I get the whole logics correctly, you allocate a new big skb and > > just copy several frames into it, then send as one chunk once its > > size reaches the threshold? Plus linearize every skb to be able to > > do that... That's too much of overhead I'd say, just handle S/G and > > fraglists and make long trains of frags from them without copying > > anything? > > sorry for my question, for sure I'm lacking knowledge about this, but > I'm trying to understand how I can move forward. > > Suppose I'm able to build the aggregated block as a train of > fragments, then I have to send it to the underlying netdevice that, in > my scenario, is created by the qmi_wwan driver: I could be wrong, but > my understanding is that it does not support fragments. > > And, as far as I know, there's only another driver in mainline used > with rmnet (mhi_net) and that one also does not seem to support them > either. > > Does this mean that once I have the aggregated block through fragments > it should be converted to a single linear skb before sending? Ah okay, I've missed the fact it's only an intermediate layer and there's some real device behind it. If you make an skb with fragments and queue it up to a netdev which doesn't advertise %NETIF_F_SG, networking core will take care of this. It will then form a set of regular skbs and queue it for sending instead. Sure, there'll be some memcopies, but I can't say this implementation is better until some stats provided. And BTW, as Gal indirectly mentioned, those perf problems belong to the underlying device, e.g. qmi_wwan and so on, rmnet shouldn't do anything here. So you could try implement aggregation there or whatever you'd like to pick. I'd try to read some specs first and see whether qmi_wwan HW is capable of S/G or whether some driver improvements for Tx could be done there. > > Thanks, > Daniele Thanks, Olek
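To make the point about the non-SG lower device concrete: if the aggregate carries frags or a frag_list but the underlying netdev does not advertise NETIF_F_SG/NETIF_F_FRAGLIST, the core transmit path flattens it before the driver's xmit routine runs, roughly equivalent in effect to the following (an illustration, not the actual core code):

	if (skb_is_nonlinear(skb) && skb_linearize(skb)) {
		kfree_skb(skb);		/* drop only if the linear copy fails */
		return NETDEV_TX_OK;
	}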
Il giorno mer 16 nov 2022 alle ore 17:21 Alexander Lobakin <alexandr.lobakin@intel.com> ha scritto: > > From: Daniele Palmas <dnlplm@gmail.com> > Date: Wed, 16 Nov 2022 16:19:48 +0100 > > > Hello Alexander, > > > > Il giorno gio 10 nov 2022 alle ore 18:35 Alexander Lobakin > > <alexandr.lobakin@intel.com> ha scritto: > > > > > > Do I get the whole logics correctly, you allocate a new big skb and > > > just copy several frames into it, then send as one chunk once its > > > size reaches the threshold? Plus linearize every skb to be able to > > > do that... That's too much of overhead I'd say, just handle S/G and > > > fraglists and make long trains of frags from them without copying > > > anything? > > > > sorry for my question, for sure I'm lacking knowledge about this, but > > I'm trying to understand how I can move forward. > > > > Suppose I'm able to build the aggregated block as a train of > > fragments, then I have to send it to the underlying netdevice that, in > > my scenario, is created by the qmi_wwan driver: I could be wrong, but > > my understanding is that it does not support fragments. > > > > And, as far as I know, there's only another driver in mainline used > > with rmnet (mhi_net) and that one also does not seem to support them > > either. > > > > Does this mean that once I have the aggregated block through fragments > > it should be converted to a single linear skb before sending? > > Ah okay, I've missed the fact it's only an intermediate layer and > there's some real device behind it. > If you make an skb with fragments and queue it up to a netdev which > doesn't advertise %NETIF_F_SG, networking core will take care of > this. It will then form a set of regular skbs and queue it for > sending instead. Sure, there'll be some memcopies, but I can't say > this implementation is better until some stats provided. > > And BTW, as Gal indirectly mentioned, those perf problems belong to > the underlying device, e.g. qmi_wwan and so on, rmnet shouldn't do > anything here. Ok, so rmnet would only take care of qmap rx packets deaggregation and qmi_wwan of the tx aggregation. At a conceptual level, implementing tx aggregation in qmi_wwan for passthrough mode could make sense, since the tx aggregation parameters belong to the physical device and are shared among the virtual rmnet netdevices, which can't have different aggr configurations if they belong to the same physical device. Bjørn, would this approach be ok for you? Thanks, Daniele > So you could try implement aggregation there or > whatever you'd like to pick. I'd try to read some specs first and > see whether qmi_wwan HW is capable of S/G or whether some driver > improvements for Tx could be done there. > > > > > Thanks, > > Daniele > > Thanks, > Olek
Daniele Palmas <dnlplm@gmail.com> writes: > Ok, so rmnet would only take care of qmap rx packets deaggregation and > qmi_wwan of the tx aggregation. > > At a conceptual level, implementing tx aggregation in qmi_wwan for > passthrough mode could make sense, since the tx aggregation parameters > belong to the physical device and are shared among the virtual rmnet > netdevices, which can't have different aggr configurations if they > belong to the same physical device. > > Bjørn, would this approach be ok for you? Sounds good to me, if this can be done within the userspace API restrictions we've been through. I assume it's possible to make this Just Work(tm) in qmi_wwan passthrough mode? I do not think we want any solution where the user will have to configure both qmi_wwan and rmnet to make things work properly. Bjørn
On Sun, 20 Nov 2022 at 10:39, Bjørn Mork <bjorn@mork.no> wrote:
>
> Daniele Palmas <dnlplm@gmail.com> writes:
>
> > Ok, so rmnet would only take care of qmap rx packets deaggregation and qmi_wwan of the tx aggregation.
> >
> > At a conceptual level, implementing tx aggregation in qmi_wwan for passthrough mode could make sense, since the tx aggregation parameters belong to the physical device and are shared among the virtual rmnet netdevices, which can't have different aggr configurations if they belong to the same physical device.
> >
> > Bjørn, would this approach be ok for you?
>
> Sounds good to me, if this can be done within the userspace API restrictions we've been through.
>
> I assume it's possible to make this Just Work(tm) in qmi_wwan passthrough mode? I do not think we want any solution where the user will have to configure both qmi_wwan and rmnet to make things work properly.
>

Yes, I think so: the ethtool configurations would apply to the qmi_wwan netdevice so that nothing should be done on the rmnet side.

Regards,
Daniele
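As a rough sketch of what applying the configuration on the qmi_wwan side could look like, a driver .set_coalesce hook might just validate and forward the three aggregation knobs to its own aggregation state. The tx_aggr_* coalesce fields are an assumption based on this series (they come from the ethtool patch, not from the hunks shown in this message), and EXAMPLE_TX_AGGR_MAX_SIZE / example_update_tx_aggr() are hypothetical driver-side names:

#include <linux/ethtool.h>
#include <linux/netdevice.h>

#define EXAMPLE_TX_AGGR_MAX_SIZE	32768	/* assumed single-URB limit */

/* Hypothetical helper owned by whichever driver does the aggregation. */
void example_update_tx_aggr(struct net_device *dev, u32 bytes, u32 frames,
			    u32 usecs);

static int example_set_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec,
				struct kernel_ethtool_coalesce *kec,
				struct netlink_ext_ack *extack)
{
	/* Refuse an aggregate the modem could never accept in one transfer. */
	if (kec->tx_aggr_max_bytes > EXAMPLE_TX_AGGR_MAX_SIZE)
		return -EINVAL;

	example_update_tx_aggr(dev, kec->tx_aggr_max_bytes,
			       kec->tx_aggr_max_frames,
			       kec->tx_aggr_time_usecs);
	return 0;
}

Userspace would then tune this with the usual coalescing interface (ethtool -C) on the qmi_wwan netdevice, leaving the rmnet links untouched, which is what Bjørn asks for above.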
On 11/20/2022 2:52 AM, Daniele Palmas wrote:
> On Sun, 20 Nov 2022 at 10:39, Bjørn Mork <bjorn@mork.no> wrote:
>>
>> Daniele Palmas <dnlplm@gmail.com> writes:
>>
>>> Ok, so rmnet would only take care of qmap rx packets deaggregation and qmi_wwan of the tx aggregation.
>>>
>>> At a conceptual level, implementing tx aggregation in qmi_wwan for passthrough mode could make sense, since the tx aggregation parameters belong to the physical device and are shared among the virtual rmnet netdevices, which can't have different aggr configurations if they belong to the same physical device.
>>>
>>> Bjørn, would this approach be ok for you?
>>
>> Sounds good to me, if this can be done within the userspace API restrictions we've been through.
>>
>> I assume it's possible to make this Just Work(tm) in qmi_wwan passthrough mode? I do not think we want any solution where the user will have to configure both qmi_wwan and rmnet to make things work properly.
>>
>
> Yes, I think so: the ethtool configurations would apply to the qmi_wwan netdevice so that nothing should be done on the rmnet side.
>
> Regards,
> Daniele

My only concern against this option is that we would now need to end up implementing the same tx aggregation logic in the other physical device drivers - mhi_netdev & ipa. Keeping this tx aggregation logic in rmnet allows you to leverage it across all these various physical devices.
On Sun, 20 Nov 2022 at 18:48, Subash Abhinov Kasiviswanathan (KS) <quic_subashab@quicinc.com> wrote:
>
> On 11/20/2022 2:52 AM, Daniele Palmas wrote:
> > On Sun, 20 Nov 2022 at 10:39, Bjørn Mork <bjorn@mork.no> wrote:
> >>
> >> Daniele Palmas <dnlplm@gmail.com> writes:
> >>
> >>> Ok, so rmnet would only take care of qmap rx packets deaggregation and qmi_wwan of the tx aggregation.
> >>>
> >>> At a conceptual level, implementing tx aggregation in qmi_wwan for passthrough mode could make sense, since the tx aggregation parameters belong to the physical device and are shared among the virtual rmnet netdevices, which can't have different aggr configurations if they belong to the same physical device.
> >>>
> >>> Bjørn, would this approach be ok for you?
> >>
> >> Sounds good to me, if this can be done within the userspace API restrictions we've been through.
> >>
> >> I assume it's possible to make this Just Work(tm) in qmi_wwan passthrough mode? I do not think we want any solution where the user will have to configure both qmi_wwan and rmnet to make things work properly.
> >>
> >
> > Yes, I think so: the ethtool configurations would apply to the qmi_wwan netdevice so that nothing should be done on the rmnet side.
> >
> > Regards,
> > Daniele
>
> My only concern against this option is that we would now need to end up implementing the same tx aggregation logic in the other physical device drivers - mhi_netdev & ipa. Keeping this tx aggregation logic in rmnet allows you to leverage it across all these various physical devices.

Yes, that's true.

But according to this discussion, the need for tx aggregation seems relevant only to USB devices (and perhaps only to a minority of them, since so far no one has complained about it missing), doesn't it?

Regards,
Daniele
On 11/21/2022 12:00 AM, Daniele Palmas wrote:
> On Sun, 20 Nov 2022 at 18:48, Subash Abhinov Kasiviswanathan (KS) <quic_subashab@quicinc.com> wrote:
>>
>> On 11/20/2022 2:52 AM, Daniele Palmas wrote:
>>> On Sun, 20 Nov 2022 at 10:39, Bjørn Mork <bjorn@mork.no> wrote:
>>>>
>>>> Daniele Palmas <dnlplm@gmail.com> writes:
>>>>
>>>>> Ok, so rmnet would only take care of qmap rx packets deaggregation and qmi_wwan of the tx aggregation.
>>>>>
>>>>> At a conceptual level, implementing tx aggregation in qmi_wwan for passthrough mode could make sense, since the tx aggregation parameters belong to the physical device and are shared among the virtual rmnet netdevices, which can't have different aggr configurations if they belong to the same physical device.
>>>>>
>>>>> Bjørn, would this approach be ok for you?
>>>>
>>>> Sounds good to me, if this can be done within the userspace API restrictions we've been through.
>>>>
>>>> I assume it's possible to make this Just Work(tm) in qmi_wwan passthrough mode? I do not think we want any solution where the user will have to configure both qmi_wwan and rmnet to make things work properly.
>>>>
>>>
>>> Yes, I think so: the ethtool configurations would apply to the qmi_wwan netdevice so that nothing should be done on the rmnet side.
>>>
>>> Regards,
>>> Daniele
>>
>> My only concern against this option is that we would now need to end up implementing the same tx aggregation logic in the other physical device drivers - mhi_netdev & ipa. Keeping this tx aggregation logic in rmnet allows you to leverage it across all these various physical devices.
>
> Yes, that's true.
>
> But according to this discussion, the need for tx aggregation seems relevant only to USB devices (and perhaps only to a minority of them, since so far no one has complained about it missing), doesn't it?
>
> Regards,
> Daniele

Ah, it's more to do with it being future-proof (in case someone does run into a similar throughput issue on the other platforms). It also consolidates the functionality within the rmnet driver for both the tx and rx paths.

As you already know, the deaggregation, checksum offload and demuxing logic are already present in the rmnet rx path. The complementary muxing and checksum offload functionality is also present in the rmnet tx path, so I wanted to see if the aggregation functionality could be added in rmnet as well.
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 27b1663c476e..39d24e07f306 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -12,6 +12,7 @@
 #include "rmnet_handlers.h"
 #include "rmnet_vnd.h"
 #include "rmnet_private.h"
+#include "rmnet_map.h"
 
 /* Local Definitions and Declarations */
 
@@ -39,6 +40,8 @@ static int rmnet_unregister_real_device(struct net_device *real_dev)
 	if (port->nr_rmnet_devs)
 		return -EINVAL;
 
+	rmnet_map_tx_aggregate_exit(port);
+
 	netdev_rx_handler_unregister(real_dev);
 
 	kfree(port);
@@ -79,6 +82,8 @@ static int rmnet_register_real_device(struct net_device *real_dev,
 	for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++)
 		INIT_HLIST_HEAD(&port->muxed_ep[entry]);
 
+	rmnet_map_tx_aggregate_init(port);
+
 	netdev_dbg(real_dev, "registered with rmnet\n");
 	return 0;
 }
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 3d3cba56c516..d341df78e411 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -6,6 +6,7 @@
  */
 
 #include <linux/skbuff.h>
+#include <linux/time.h>
 #include <net/gro_cells.h>
 
 #ifndef _RMNET_CONFIG_H_
@@ -19,6 +20,12 @@ struct rmnet_endpoint {
 	struct hlist_node hlnode;
 };
 
+struct rmnet_egress_agg_params {
+	u16 agg_size;
+	u16 agg_count;
+	u64 agg_time_nsec;
+};
+
 /* One instance of this structure is instantiated for each real_dev associated
  * with rmnet.
  */
@@ -30,6 +37,18 @@ struct rmnet_port {
 	struct hlist_head muxed_ep[RMNET_MAX_LOGICAL_EP];
 	struct net_device *bridge_ep;
 	struct net_device *rmnet_dev;
+
+	/* Egress aggregation information */
+	struct rmnet_egress_agg_params egress_agg_params;
+	/* Protect aggregation related elements */
+	spinlock_t agg_lock;
+	struct sk_buff *agg_skb;
+	int agg_state;
+	u8 agg_count;
+	struct timespec64 agg_time;
+	struct timespec64 agg_last;
+	struct hrtimer hrtimer;
+	struct work_struct agg_wq;
 };
 
 extern struct rtnl_link_ops rmnet_link_ops;
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index a313242a762e..82e2669e3590 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -136,10 +136,15 @@ static int rmnet_map_egress_handler(struct sk_buff *skb,
 {
 	int required_headroom, additional_header_len, csum_type = 0;
 	struct rmnet_map_header *map_header;
+	bool is_icmp = false;
 
 	additional_header_len = 0;
 	required_headroom = sizeof(struct rmnet_map_header);
 
+	if (port->data_format & RMNET_FLAGS_EGRESS_AGGREGATION &&
+	    rmnet_map_tx_agg_skip(skb))
+		is_icmp = true;
+
 	if (port->data_format & RMNET_FLAGS_EGRESS_MAP_CKSUMV4) {
 		additional_header_len = sizeof(struct rmnet_map_ul_csum_header);
 		csum_type = RMNET_FLAGS_EGRESS_MAP_CKSUMV4;
@@ -164,8 +169,18 @@ static int rmnet_map_egress_handler(struct sk_buff *skb,
 
 	map_header->mux_id = mux_id;
 
-	skb->protocol = htons(ETH_P_MAP);
+	if (port->data_format & RMNET_FLAGS_EGRESS_AGGREGATION && !is_icmp) {
+		if (skb_is_nonlinear(skb)) {
+			if (unlikely(__skb_linearize(skb)))
+				goto done;
+		}
+
+		rmnet_map_tx_aggregate(skb, port, orig_dev);
+		return -EINPROGRESS;
+	}
 
+done:
+	skb->protocol = htons(ETH_P_MAP);
 	return 0;
 }
 
@@ -235,6 +250,7 @@ void rmnet_egress_handler(struct sk_buff *skb)
 	struct rmnet_port *port;
 	struct rmnet_priv *priv;
 	u8 mux_id;
+	int err;
 
 	sk_pacing_shift_update(skb->sk, 8);
 
@@ -247,8 +263,13 @@ void rmnet_egress_handler(struct sk_buff *skb)
 	if (!port)
 		goto drop;
 
-	if (rmnet_map_egress_handler(skb, port, mux_id, orig_dev))
+	err = rmnet_map_egress_handler(skb, port, mux_id, orig_dev);
+	if (err == -ENOMEM) {
 		goto drop;
+	} else if (err == -EINPROGRESS) {
+		rmnet_vnd_tx_fixup(skb, orig_dev);
+		return;
+	}
 
 	rmnet_vnd_tx_fixup(skb, orig_dev);
 
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 2b033060fc20..6aefc4e1bf47 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -53,5 +53,12 @@ void rmnet_map_checksum_uplink_packet(struct sk_buff *skb,
 				      struct net_device *orig_dev,
 				      int csum_type);
 int rmnet_map_process_next_hdr_packet(struct sk_buff *skb, u16 len);
+bool rmnet_map_tx_agg_skip(struct sk_buff *skb);
+void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port,
+			    struct net_device *orig_dev);
+void rmnet_map_tx_aggregate_init(struct rmnet_port *port);
+void rmnet_map_tx_aggregate_exit(struct rmnet_port *port);
+void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size,
+				    u16 count, u32 time);
 
 #endif /* _RMNET_MAP_H_ */
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index ba194698cc14..49eeed4a126b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -12,6 +12,7 @@
 #include "rmnet_config.h"
 #include "rmnet_map.h"
 #include "rmnet_private.h"
+#include "rmnet_vnd.h"
 
 #define RMNET_MAP_DEAGGR_SPACING 64
 #define RMNET_MAP_DEAGGR_HEADROOM (RMNET_MAP_DEAGGR_SPACING / 2)
@@ -518,3 +519,198 @@ int rmnet_map_process_next_hdr_packet(struct sk_buff *skb,
 
 	return 0;
 }
+
+long rmnet_agg_bypass_time __read_mostly = 10000L * NSEC_PER_USEC;
+
+bool rmnet_map_tx_agg_skip(struct sk_buff *skb)
+{
+	bool is_icmp = 0;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *ip4h = ip_hdr(skb);
+
+		if (ip4h->protocol == IPPROTO_ICMP)
+			is_icmp = true;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		unsigned int icmp_offset = 0;
+
+		if (ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, NULL, NULL) == IPPROTO_ICMPV6)
+			is_icmp = true;
+	}
+
+	return is_icmp;
+}
+
+static void reset_aggr_params(struct rmnet_port *port)
+{
+	port->agg_skb = NULL;
+	port->agg_count = 0;
+	port->agg_state = 0;
+	memset(&port->agg_time, 0, sizeof(struct timespec64));
+}
+
+static void rmnet_map_flush_tx_packet_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	struct rmnet_port *port;
+	unsigned long flags;
+
+	port = container_of(work, struct rmnet_port, agg_wq);
+
+	spin_lock_irqsave(&port->agg_lock, flags);
+	if (likely(port->agg_state == -EINPROGRESS)) {
+		/* Buffer may have already been shipped out */
+		if (likely(port->agg_skb)) {
+			skb = port->agg_skb;
+			reset_aggr_params(port);
+		}
+		port->agg_state = 0;
+	}
+
+	spin_unlock_irqrestore(&port->agg_lock, flags);
+	if (skb)
+		dev_queue_xmit(skb);
+}
+
+enum hrtimer_restart rmnet_map_flush_tx_packet_queue(struct hrtimer *t)
+{
+	struct rmnet_port *port;
+
+	port = container_of(t, struct rmnet_port, hrtimer);
+
+	schedule_work(&port->agg_wq);
+
+	return HRTIMER_NORESTART;
+}
+
+void rmnet_map_tx_aggregate(struct sk_buff *skb, struct rmnet_port *port,
+			    struct net_device *orig_dev)
+{
+	struct timespec64 diff, last;
+	int size = 0;
+	struct sk_buff *agg_skb;
+	unsigned long flags;
+
+new_packet:
+	spin_lock_irqsave(&port->agg_lock, flags);
+	memcpy(&last, &port->agg_last, sizeof(struct timespec64));
+	ktime_get_real_ts64(&port->agg_last);
+
+	if (!port->agg_skb) {
+		/* Check to see if we should agg first. If the traffic is very
+		 * sparse, don't aggregate.
+		 */
+		diff = timespec64_sub(port->agg_last, last);
+		size = port->egress_agg_params.agg_size - skb->len;
+
+		if (size < 0) {
+			struct rmnet_priv *priv;
+
+			/* dropped */
+			dev_kfree_skb_any(skb);
+			spin_unlock_irqrestore(&port->agg_lock, flags);
+			priv = netdev_priv(orig_dev);
+			this_cpu_inc(priv->pcpu_stats->stats.tx_drops);
+
+			return;
+		}
+
+		if (diff.tv_sec > 0 || diff.tv_nsec > rmnet_agg_bypass_time ||
+		    size == 0) {
+			spin_unlock_irqrestore(&port->agg_lock, flags);
+			skb->protocol = htons(ETH_P_MAP);
+			dev_queue_xmit(skb);
+			return;
+		}
+
+		port->agg_skb = skb_copy_expand(skb, 0, size, GFP_ATOMIC);
+		if (!port->agg_skb) {
+			reset_aggr_params(port);
+			spin_unlock_irqrestore(&port->agg_lock, flags);
+			skb->protocol = htons(ETH_P_MAP);
+			dev_queue_xmit(skb);
+			return;
+		}
+		port->agg_skb->protocol = htons(ETH_P_MAP);
+		port->agg_count = 1;
+		ktime_get_real_ts64(&port->agg_time);
+		dev_kfree_skb_any(skb);
+		goto schedule;
+	}
+	diff = timespec64_sub(port->agg_last, port->agg_time);
+	size = port->egress_agg_params.agg_size - port->agg_skb->len;
+
+	if (skb->len > size ||
+	    diff.tv_sec > 0 || diff.tv_nsec > port->egress_agg_params.agg_time_nsec) {
+		agg_skb = port->agg_skb;
+		reset_aggr_params(port);
+		spin_unlock_irqrestore(&port->agg_lock, flags);
+		hrtimer_cancel(&port->hrtimer);
+		dev_queue_xmit(agg_skb);
+		goto new_packet;
+	}
+
+	skb_put_data(port->agg_skb, skb->data, skb->len);
+	port->agg_count++;
+	dev_kfree_skb_any(skb);
+
+	if (port->agg_count == port->egress_agg_params.agg_count ||
+	    port->agg_skb->len == port->egress_agg_params.agg_size) {
+		agg_skb = port->agg_skb;
+		reset_aggr_params(port);
+		spin_unlock_irqrestore(&port->agg_lock, flags);
+		hrtimer_cancel(&port->hrtimer);
+		dev_queue_xmit(agg_skb);
+		return;
+	}
+
+schedule:
+	if (!hrtimer_active(&port->hrtimer) && port->agg_state != -EINPROGRESS) {
+		port->agg_state = -EINPROGRESS;
+		hrtimer_start(&port->hrtimer,
+			      ns_to_ktime(port->egress_agg_params.agg_time_nsec),
+			      HRTIMER_MODE_REL);
+	}
+	spin_unlock_irqrestore(&port->agg_lock, flags);
+}
+
+void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u16 size,
+				    u16 count, u32 time)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&port->agg_lock, irq_flags);
+	port->egress_agg_params.agg_size = size;
+	port->egress_agg_params.agg_count = count;
+	port->egress_agg_params.agg_time_nsec = time * NSEC_PER_USEC;
+	spin_unlock_irqrestore(&port->agg_lock, irq_flags);
+}
+
+void rmnet_map_tx_aggregate_init(struct rmnet_port *port)
+{
+	hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	port->hrtimer.function = rmnet_map_flush_tx_packet_queue;
+	spin_lock_init(&port->agg_lock);
+	rmnet_map_update_ul_agg_config(port, 4096, 16, 800);
+	INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work);
+}
+
+void rmnet_map_tx_aggregate_exit(struct rmnet_port *port)
+{
+	unsigned long flags;
+
+	hrtimer_cancel(&port->hrtimer);
+	cancel_work_sync(&port->agg_wq);
+
+	spin_lock_irqsave(&port->agg_lock, flags);
+	if (port->agg_state == -EINPROGRESS) {
+		if (port->agg_skb) {
+			kfree_skb(port->agg_skb);
+			reset_aggr_params(port);
+		}
+
+		port->agg_state = 0;
+	}
+
+	spin_unlock_irqrestore(&port->agg_lock, flags);
+}
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5e7a1041df3a..09a30e2b29b1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1351,6 +1351,7 @@ enum {
 #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4		(1U << 3)
 #define RMNET_FLAGS_INGRESS_MAP_CKSUMV5		(1U << 4)
 #define RMNET_FLAGS_EGRESS_MAP_CKSUMV5		(1U << 5)
+#define RMNET_FLAGS_EGRESS_AGGREGATION		(1U << 6)
 
 enum {
 	IFLA_RMNET_UNSPEC,
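The new RMNET_FLAGS_EGRESS_AGGREGATION bit is toggled the same way as the existing rmnet flags, i.e. through the IFLA_RMNET_FLAGS attribute when the link is created or changed. A minimal sketch of the attribute payload a netlink-based configurator could send is below; struct ifla_rmnet_flags is the existing uapi structure, and the surrounding RTM_NEWLINK plumbing is omitted.

#include <linux/if_link.h>

/* Enable rx deaggregation plus the new tx (egress) aggregation on an rmnet
 * link; only the bits set in .mask are acted on by the kernel.
 */
static const struct ifla_rmnet_flags rmnet_aggr_flags = {
	.flags = RMNET_FLAGS_INGRESS_DEAGGREGATION |
		 RMNET_FLAGS_EGRESS_AGGREGATION,
	.mask  = RMNET_FLAGS_INGRESS_DEAGGREGATION |
		 RMNET_FLAGS_EGRESS_AGGREGATION,
};

This payload would go into IFLA_RMNET_FLAGS of the link request, which is what the commit message refers to when it says the flag can be set e.g. through the ip command.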
Bidirectional TCP throughput tests through iperf with low-cat
Thread-x based modems showed performance issues both in tx
and rx.

The Windows driver does not show this issue: inspecting USB
packets revealed that the only notable change is the driver
enabling tx packets aggregation.

Tx packets aggregation, by default disabled, requires flag
RMNET_FLAGS_EGRESS_AGGREGATION to be set (e.g. through ip command).

The maximum number of aggregated packets and the maximum aggregated
size are by default set to reasonably low values in order to support
the majority of modems.

This implementation is based on patches available in Code Aurora
repositories (msm kernel) whose main authors are

Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Sean Tranchetti <stranche@codeaurora.org>

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
---
 .../ethernet/qualcomm/rmnet/rmnet_config.c    |   5 +
 .../ethernet/qualcomm/rmnet/rmnet_config.h    |  19 ++
 .../ethernet/qualcomm/rmnet/rmnet_handlers.c  |  25 ++-
 .../net/ethernet/qualcomm/rmnet/rmnet_map.h   |   7 +
 .../ethernet/qualcomm/rmnet/rmnet_map_data.c  | 196 ++++++++++++++++++
 include/uapi/linux/if_link.h                  |   1 +
 6 files changed, 251 insertions(+), 2 deletions(-)