Message ID | 20230529080840.1156458-6-yoshihiro.shimoda.uh@renesas.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: renesas: rswitch: Improve performance of TX | expand |
Hi Yoshihiro,
kernel test robot noticed the following build errors:
[auto build test ERROR on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Yoshihiro-Shimoda/dt-bindings-net-r8a779f0-ether-switch-Add-ACLK/20230529-161010
base: net-next/main
patch link: https://lore.kernel.org/r/20230529080840.1156458-6-yoshihiro.shimoda.uh%40renesas.com
patch subject: [PATCH net-next 5/5] net: renesas: rswitch: Use per-queue rate limiter
config: mips-allyesconfig (https://download.01.org/0day-ci/archive/20230530/202305300454.Kyjag1oy-lkp@intel.com/config)
compiler: mips-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
mkdir -p ~/bin
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/64502944471d5b6fac76f49c06c29edfbbe43935
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yoshihiro-Shimoda/dt-bindings-net-r8a779f0-ether-switch-Add-ACLK/20230529-161010
git checkout 64502944471d5b6fac76f49c06c29edfbbe43935
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 ~/bin/make.cross W=1 O=build_dir ARCH=mips olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 ~/bin/make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305300454.Kyjag1oy-lkp@intel.com/
All errors (new ones prefixed by >>):
arch/mips/kernel/head.o: in function `kernel_entry':
(.ref.text+0xac): relocation truncated to fit: R_MIPS_26 against `start_kernel'
init/main.o: in function `set_reset_devices':
main.c:(.init.text+0x20): relocation truncated to fit: R_MIPS_26 against `_mcount'
main.c:(.init.text+0x30): relocation truncated to fit: R_MIPS_26 against `__sanitizer_cov_trace_pc'
init/main.o: in function `debug_kernel':
main.c:(.init.text+0xa4): relocation truncated to fit: R_MIPS_26 against `_mcount'
main.c:(.init.text+0xb4): relocation truncated to fit: R_MIPS_26 against `__sanitizer_cov_trace_pc'
init/main.o: in function `quiet_kernel':
main.c:(.init.text+0x128): relocation truncated to fit: R_MIPS_26 against `_mcount'
main.c:(.init.text+0x138): relocation truncated to fit: R_MIPS_26 against `__sanitizer_cov_trace_pc'
init/main.o: in function `warn_bootconfig':
main.c:(.init.text+0x1ac): relocation truncated to fit: R_MIPS_26 against `_mcount'
main.c:(.init.text+0x1bc): relocation truncated to fit: R_MIPS_26 against `__sanitizer_cov_trace_pc'
init/main.o: in function `init_setup':
main.c:(.init.text+0x234): relocation truncated to fit: R_MIPS_26 against `_mcount'
main.c:(.init.text+0x254): additional relocation overflows omitted from the output
mips-linux-ld: drivers/net/ethernet/renesas/rswitch.o: in function `rswitch_open':
>> rswitch.c:(.text.rswitch_open+0x1a4): undefined reference to `__udivdi3'
Hi Yoshihiro,
kernel test robot noticed the following build errors:
[auto build test ERROR on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Yoshihiro-Shimoda/dt-bindings-net-r8a779f0-ether-switch-Add-ACLK/20230529-161010
base: net-next/main
patch link: https://lore.kernel.org/r/20230529080840.1156458-6-yoshihiro.shimoda.uh%40renesas.com
patch subject: [PATCH net-next 5/5] net: renesas: rswitch: Use per-queue rate limiter
config: powerpc-allmodconfig (https://download.01.org/0day-ci/archive/20230530/202305300718.pdF0iaxU-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
mkdir -p ~/bin
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/64502944471d5b6fac76f49c06c29edfbbe43935
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yoshihiro-Shimoda/dt-bindings-net-r8a779f0-ether-switch-Add-ACLK/20230529-161010
git checkout 64502944471d5b6fac76f49c06c29edfbbe43935
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 ~/bin/make.cross W=1 O=build_dir ARCH=powerpc olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 ~/bin/make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305300718.pdF0iaxU-lkp@intel.com/
All errors (new ones prefixed by >>, old ones prefixed by <<):
>> ERROR: modpost: "__udivdi3" [drivers/net/ethernet/renesas/rswitch_drv.ko] undefined!
Hi Shimoda-san, On Mon, May 29, 2023 at 10:08 AM Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com> wrote: > Use per-queue rate limiter instead of global rate limiter. Otherwise > TX performance will be low when we use multiple ports at the same time. > > Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com> Thanks for your patch! > --- a/drivers/net/ethernet/renesas/rswitch.c > +++ b/drivers/net/ethernet/renesas/rswitch.c > @@ -156,22 +156,31 @@ static int rswitch_gwca_axi_ram_reset(struct rswitch_private *priv) > return rswitch_reg_wait(priv->addr, GWARIRM, GWARIRM_ARR, GWARIRM_ARR); > } > > -static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, int rate) > +static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, > + struct rswitch_gwca_queue *txq) > { > - u32 gwgrlulc, gwgrlc; > + u64 period_ps; > + unsigned long rate; > + u32 gwrlc; > > - switch (rate) { > - case 1000: > - gwgrlulc = 0x0000005f; > - gwgrlc = 0x00010260; > - break; > - default: > - dev_err(&priv->pdev->dev, "%s: This rate is not supported (%d)\n", __func__, rate); > - return; > - } > + rate = clk_get_rate(priv->aclk); > + if (!rate) > + rate = RSWITCH_ACLK_DEFAULT; > + > + period_ps = div64_u64(1000000000000ULL, rate); div64_ul, as rate is unsigned long. > + > + /* GWRLC value = 256 * ACLK_period[ns] * maxBandwidth[Gbps] */ > + gwrlc = 256 * period_ps * txq->speed / 1000000; This contains an open-coded 64-by-32 division, causing link failures on 32-bit platforms, so you should use div_u64() instead. However, because of the premultiplication by speed, which is 32-bit, you can use the mul_u64_u32_div() helper. Combining this with the calculation of period_ps above, you can simplify this to: gwrlc = div64_ul(256000000ULL * txq->speed, rate); > + > + /* To avoid overflow internally, the value should be 97% */ > + gwrlc = gwrlc * 97 / 100; > > - iowrite32(gwgrlulc, priv->addr + GWGRLULC); > - iowrite32(gwgrlc, priv->addr + GWGRLC); > + dev_dbg(&priv->pdev->dev, > + "%s: index = %d, speed = %d, rate = %ld, gwrlc = %08x\n", > + __func__, txq->index_trim, txq->speed, rate, gwrlc); > + > + iowrite32(GWRLULC_NOT_REQUIRED, priv->addr + GWRLULC(txq->index_trim)); > + iowrite32(gwrlc | GWRLC_RLE, priv->addr + GWRLC(txq->index_trim)); > } > Gr{oetje,eeting}s, Geert
Hi Geert-san, > From: Geert Uytterhoeven, Sent: Tuesday, May 30, 2023 4:33 PM > > Hi Shimoda-san, > > On Mon, May 29, 2023 at 10:08 AM Yoshihiro Shimoda > <yoshihiro.shimoda.uh@renesas.com> wrote: > > Use per-queue rate limiter instead of global rate limiter. Otherwise > > TX performance will be low when we use multiple ports at the same time. > > > > Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com> > > Thanks for your patch! Thank you for your review! > > --- a/drivers/net/ethernet/renesas/rswitch.c > > +++ b/drivers/net/ethernet/renesas/rswitch.c > > @@ -156,22 +156,31 @@ static int rswitch_gwca_axi_ram_reset(struct rswitch_private *priv) > > return rswitch_reg_wait(priv->addr, GWARIRM, GWARIRM_ARR, GWARIRM_ARR); > > } > > > > -static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, int rate) > > +static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, > > + struct rswitch_gwca_queue *txq) > > { > > - u32 gwgrlulc, gwgrlc; > > + u64 period_ps; > > + unsigned long rate; > > + u32 gwrlc; > > > > - switch (rate) { > > - case 1000: > > - gwgrlulc = 0x0000005f; > > - gwgrlc = 0x00010260; > > - break; > > - default: > > - dev_err(&priv->pdev->dev, "%s: This rate is not supported (%d)\n", __func__, rate); > > - return; > > - } > > + rate = clk_get_rate(priv->aclk); > > + if (!rate) > > + rate = RSWITCH_ACLK_DEFAULT; > > + > > + period_ps = div64_u64(1000000000000ULL, rate); > > div64_ul, as rate is unsigned long. I see. > > + > > + /* GWRLC value = 256 * ACLK_period[ns] * maxBandwidth[Gbps] */ > > + gwrlc = 256 * period_ps * txq->speed / 1000000; > > This contains an open-coded 64-by-32 division, causing link failures > on 32-bit platforms, so you should use div_u64() instead. However, > because of the premultiplication by speed, which is 32-bit, you can > use the mul_u64_u32_div() helper. Thank you for your comment! After I got emails from kernel test robot, I realized that I should use such a macro here. > Combining this with the calculation of period_ps above, you can simplify > this to: > > gwrlc = div64_ul(256000000ULL * txq->speed, rate); Thank you for your suggestion! I'll fix it on v2. Best regards, Yoshihiro Shimoda > > + > > + /* To avoid overflow internally, the value should be 97% */ > > + gwrlc = gwrlc * 97 / 100; > > > > - iowrite32(gwgrlulc, priv->addr + GWGRLULC); > > - iowrite32(gwgrlc, priv->addr + GWGRLC); > > + dev_dbg(&priv->pdev->dev, > > + "%s: index = %d, speed = %d, rate = %ld, gwrlc = %08x\n", > > + __func__, txq->index_trim, txq->speed, rate, gwrlc); > > + > > + iowrite32(GWRLULC_NOT_REQUIRED, priv->addr + GWRLULC(txq->index_trim)); > > + iowrite32(gwrlc | GWRLC_RLE, priv->addr + GWRLC(txq->index_trim)); > > } > > > > Gr{oetje,eeting}s, > > Geert > > -- > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org > > In personal conversations with technical people, I call myself a hacker. But > when I'm talking to journalists I just say "programmer" or something like that. > -- Linus Torvalds
On Mon, May 29, 2023 at 05:08:40PM +0900, Yoshihiro Shimoda wrote: > Use per-queue rate limiter instead of global rate limiter. Otherwise > TX performance will be low when we use multiple ports at the same time. > > Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com> > --- > drivers/net/ethernet/renesas/rswitch.c | 51 +++++++++++++++++--------- > drivers/net/ethernet/renesas/rswitch.h | 15 +++++++- > 2 files changed, 47 insertions(+), 19 deletions(-) > > diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c > index 4ae34b0206cd..a7195625a2c7 100644 > --- a/drivers/net/ethernet/renesas/rswitch.c > +++ b/drivers/net/ethernet/renesas/rswitch.c > @@ -156,22 +156,31 @@ static int rswitch_gwca_axi_ram_reset(struct rswitch_private *priv) > return rswitch_reg_wait(priv->addr, GWARIRM, GWARIRM_ARR, GWARIRM_ARR); > } > > -static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, int rate) > +static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, > + struct rswitch_gwca_queue *txq) > { > - u32 gwgrlulc, gwgrlc; > + u64 period_ps; > + unsigned long rate; > + u32 gwrlc; Hi Shimoda-san, a minor not from my side: please use reverse xmas tree order - longest line to shortest - for local variable declarations in networking code. unsigned long rate; u64 period_ps; u32 gwrlc;
Hi Simon-san, > From: Simon Horman, Sent: Wednesday, May 31, 2023 4:29 AM > > On Mon, May 29, 2023 at 05:08:40PM +0900, Yoshihiro Shimoda wrote: > > Use per-queue rate limiter instead of global rate limiter. Otherwise > > TX performance will be low when we use multiple ports at the same time. > > > > Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com> > > --- > > drivers/net/ethernet/renesas/rswitch.c | 51 +++++++++++++++++--------- > > drivers/net/ethernet/renesas/rswitch.h | 15 +++++++- > > 2 files changed, 47 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c > > index 4ae34b0206cd..a7195625a2c7 100644 > > --- a/drivers/net/ethernet/renesas/rswitch.c > > +++ b/drivers/net/ethernet/renesas/rswitch.c > > @@ -156,22 +156,31 @@ static int rswitch_gwca_axi_ram_reset(struct rswitch_private *priv) > > return rswitch_reg_wait(priv->addr, GWARIRM, GWARIRM_ARR, GWARIRM_ARR); > > } > > > > -static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, int rate) > > +static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, > > + struct rswitch_gwca_queue *txq) > > { > > - u32 gwgrlulc, gwgrlc; > > + u64 period_ps; > > + unsigned long rate; > > + u32 gwrlc; > > Hi Shimoda-san, > > a minor not from my side: please use reverse xmas tree order - longest line > to shortest - for local variable declarations in networking code. > > unsigned long rate; > u64 period_ps; > u32 gwrlc; Thank you for your comment! I completely forgot that network driver should use reverse xmas tree order... JFYI, I found another way to improve the performance. So, I dropped this patch on v2 [1]. [1] https://lore.kernel.org/all/20230606085558.1708766-1-yoshihiro.shimoda.uh@renesas.com/ Best regards, Yoshihiro Shimoda > -- > pw-bot: cr
diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 4ae34b0206cd..a7195625a2c7 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -156,22 +156,31 @@ static int rswitch_gwca_axi_ram_reset(struct rswitch_private *priv) return rswitch_reg_wait(priv->addr, GWARIRM, GWARIRM_ARR, GWARIRM_ARR); } -static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, int rate) +static void rswitch_gwca_set_rate_limit(struct rswitch_private *priv, + struct rswitch_gwca_queue *txq) { - u32 gwgrlulc, gwgrlc; + u64 period_ps; + unsigned long rate; + u32 gwrlc; - switch (rate) { - case 1000: - gwgrlulc = 0x0000005f; - gwgrlc = 0x00010260; - break; - default: - dev_err(&priv->pdev->dev, "%s: This rate is not supported (%d)\n", __func__, rate); - return; - } + rate = clk_get_rate(priv->aclk); + if (!rate) + rate = RSWITCH_ACLK_DEFAULT; + + period_ps = div64_u64(1000000000000ULL, rate); + + /* GWRLC value = 256 * ACLK_period[ns] * maxBandwidth[Gbps] */ + gwrlc = 256 * period_ps * txq->speed / 1000000; + + /* To avoid overflow internally, the value should be 97% */ + gwrlc = gwrlc * 97 / 100; - iowrite32(gwgrlulc, priv->addr + GWGRLULC); - iowrite32(gwgrlc, priv->addr + GWGRLC); + dev_dbg(&priv->pdev->dev, + "%s: index = %d, speed = %d, rate = %ld, gwrlc = %08x\n", + __func__, txq->index_trim, txq->speed, rate, gwrlc); + + iowrite32(GWRLULC_NOT_REQUIRED, priv->addr + GWRLULC(txq->index_trim)); + iowrite32(gwrlc | GWRLC_RLE, priv->addr + GWRLC(txq->index_trim)); } static bool rswitch_is_any_data_irq(struct rswitch_private *priv, u32 *dis, bool tx) @@ -548,6 +557,10 @@ static struct rswitch_gwca_queue *rswitch_gwca_get(struct rswitch_private *priv, memset(gq, 0, sizeof(*gq)); gq->index = index; + /* The first "Rate limiter" queue is located at GWCA_AXI_CHAIN_N - 1 */ + if (dir_tx) + gq->index_trim = GWCA_AXI_CHAIN_N - index - 1; + return gq; } @@ -651,7 +664,6 @@ static int rswitch_gwca_hw_init(struct rswitch_private *priv) iowrite32(lower_32_bits(priv->gwca.ts_queue.ring_dma), priv->addr + GWTDCAC10); iowrite32(upper_32_bits(priv->gwca.ts_queue.ring_dma), priv->addr + GWTDCAC00); iowrite32(GWCA_TS_IRQ_BIT, priv->addr + GWTSDCC0); - rswitch_gwca_set_rate_limit(priv, priv->gwca.speed); for (i = 0; i < RSWITCH_NUM_PORTS; i++) { err = rswitch_rxdmac_init(priv, i); @@ -1441,6 +1453,8 @@ static int rswitch_open(struct net_device *ndev) napi_enable(&rdev->napi); netif_start_queue(ndev); + rswitch_gwca_set_rate_limit(rdev->priv, rdev->tx_queue); + rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, true); rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, true); @@ -1723,9 +1737,6 @@ static int rswitch_device_alloc(struct rswitch_private *priv, int index) if (err < 0) goto out_get_params; - if (rdev->priv->gwca.speed < rdev->etha->speed) - rdev->priv->gwca.speed = rdev->etha->speed; - err = rswitch_rxdmac_alloc(ndev); if (err < 0) goto out_rxdmac; @@ -1734,6 +1745,8 @@ static int rswitch_device_alloc(struct rswitch_private *priv, int index) if (err < 0) goto out_txdmac; + rdev->tx_queue->speed = rdev->etha->speed; + return 0; out_txdmac: @@ -1903,6 +1916,10 @@ static int renesas_eth_sw_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); pm_runtime_get_sync(&pdev->dev); + priv->aclk = devm_clk_get_optional(&pdev->dev, "aclk"); + if (IS_ERR(priv->aclk)) + return PTR_ERR(priv->aclk); + ret = rswitch_init(priv); if (ret < 0) { pm_runtime_put(&pdev->dev); diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 7ba45ddab42a..741f45266c29 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -7,9 +7,11 @@ #ifndef __RSWITCH_H__ #define __RSWITCH_H__ +#include <linux/clk.h> #include <linux/platform_device.h> #include "rcar_gen4_ptp.h" +#define RSWITCH_ACLK_DEFAULT 400000000UL #define RSWITCH_NUM_PORTS 3 #define rswitch_for_each_enabled_port(priv, i) \ for (i = 0; i < RSWITCH_NUM_PORTS; i++) \ @@ -676,7 +678,7 @@ enum rswitch_reg { GWIDACAM10 = GWRO + 0x0984, GWGRLC = GWRO + 0x0a00, GWGRLULC = GWRO + 0x0a04, - GWRLIVC0 = GWRO + 0x0a80, + GWRLC0 = GWRO + 0x0a80, GWRLULC0 = GWRO + 0x0a84, GWIDPC = GWRO + 0x0b00, GWIDC0 = GWRO + 0x0c00, @@ -778,6 +780,11 @@ enum rswitch_gwca_mode { #define GWTRC(queue) (GWTRC0 + (queue) / 32 * 4) #define GWDCC_OFFS(queue) (GWDCC0 + (queue) * 4) +#define GWRLC(trim) (GWRLC0 + (trim) * 8) +#define GWRLC_RLE BIT(16) +#define GWRLULC(trim) (GWRLULC0 + (trim) * 8) +#define GWRLULC_NOT_REQUIRED 0x00800000 + #define GWDIS(i) (GWDIS0 + (i) * 0x10) #define GWDIE(i) (GWDIE0 + (i) * 0x10) #define GWDID(i) (GWDID0 + (i) * 0x10) @@ -942,6 +949,10 @@ struct rswitch_gwca_queue { bool dir_tx; struct sk_buff **skbs; struct net_device *ndev; /* queue to ndev for irq */ + + /* For tx_ring */ + int index_trim; + int speed; }; struct rswitch_gwca_ts_info { @@ -963,7 +974,6 @@ struct rswitch_gwca { DECLARE_BITMAP(used, GWCA_AXI_CHAIN_N); u32 tx_irq_bits[GWCA_NUM_IRQ_REGS]; u32 rx_irq_bits[GWCA_NUM_IRQ_REGS]; - int speed; }; #define NUM_QUEUES_PER_NDEV 2 @@ -997,6 +1007,7 @@ struct rswitch_private { struct platform_device *pdev; void __iomem *addr; struct rcar_gen4_ptp_private *ptp_priv; + struct clk *aclk; struct rswitch_device *rdev[RSWITCH_NUM_PORTS]; DECLARE_BITMAP(opened_ports, RSWITCH_NUM_PORTS);
Use per-queue rate limiter instead of global rate limiter. Otherwise TX performance will be low when we use multiple ports at the same time. Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com> --- drivers/net/ethernet/renesas/rswitch.c | 51 +++++++++++++++++--------- drivers/net/ethernet/renesas/rswitch.h | 15 +++++++- 2 files changed, 47 insertions(+), 19 deletions(-)