Message ID | 20250105011525.1718380-1-kuba@kernel.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] selftests: drv-net: test drivers sleeping in ndo_get_stats64 | expand |
Jakub Kicinski wrote: > Most of our tests use rtnetlink to read device stats, so they > don't expose the drivers much to paths in which device stats > are read under RCU. Add tests which hammer profcs reads to > make sure drivers: > - don't sleep while reporting stats, > - can handle parallel reads, > - can handle device going down while reading. > > Set ifname on the env class in NetDrvEnv, we already do that > in NetDrvEpEnv. > > KTAP version 1 > 1..7 > ok 1 stats.check_pause > ok 2 stats.check_fec > ok 3 stats.pkt_byte_sum > ok 4 stats.qstat_by_ifindex > ok 5 stats.check_down > ok 6 stats.procfs_hammer > # completed up/down cycles: 6 > ok 7 stats.procfs_downup_hammer > # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 > > Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Willem de Bruijn <willemb@google.com> Two tiny comments, neither cause for respin. > +@ksft_disruptive > +def procfs_downup_hammer(cfg) -> None: > + """ > + Reading stats via procfs only holds the RCU lock, drivers often try > + to sleep when reading the stats, or don't protect against races. > + """ > + # Max out the queues, we'll flip between max an 1 s/an/and/ > + channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) > + if channels['combined-count'] == 0: > + rx_type = 'rx' > + else: > + rx_type = 'combined' > + cur_queue_cnt = channels[f'{rx_type}-count'] > + max_queue_cnt = channels[f'{rx_type}-max'] > + > + cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") > + defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") > + > + # Real test stats > + stats = __run_inf_loop("cat /proc/net/dev") > + defer(stats.kill) > + > + ipset = f"ip link set dev {cfg.ifname}" > + defer(ip, f"link set dev {cfg.ifname} up") unimportant: could reference ipset here too, as in below. > + # The "echo -n 1" lets us count iterations below > + updown = f"{ipset} down; sleep 0.05; {ipset} up; sleep 0.05; " + \ > + f"ethtool -L {cfg.ifname} {rx_type} 1; " + \ > + f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}; " + \ > + "echo -n 1"
On Sun, 05 Jan 2025 10:55:50 -0500 Willem de Bruijn wrote: > Two tiny comments, neither cause for respin. Let me respin, not much work since comment changes shouldn't need a re-test.. > > +@ksft_disruptive > > +def procfs_downup_hammer(cfg) -> None: > > + """ > > + Reading stats via procfs only holds the RCU lock, drivers often try > > + to sleep when reading the stats, or don't protect against races. > > + """ > > + # Max out the queues, we'll flip between max an 1 > > s/an/and/ > > > + channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) > > + if channels['combined-count'] == 0: > > + rx_type = 'rx' > > + else: > > + rx_type = 'combined' > > + cur_queue_cnt = channels[f'{rx_type}-count'] > > + max_queue_cnt = channels[f'{rx_type}-max'] > > + > > + cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") > > + defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") > > + > > + # Real test stats > > + stats = __run_inf_loop("cat /proc/net/dev") > > + defer(stats.kill) > > + > > + ipset = f"ip link set dev {cfg.ifname}" > > + defer(ip, f"link set dev {cfg.ifname} up") > > unimportant: could reference ipset here too, as in below. Ha, that's what I did initially, but then running it I discovered that ip() adds the initial "ip", so we end up executing: ip ip link set...
Jakub Kicinski <kuba@kernel.org> writes: > Most of our tests use rtnetlink to read device stats, so they > don't expose the drivers much to paths in which device stats > are read under RCU. Add tests which hammer profcs reads to > make sure drivers: > - don't sleep while reporting stats, > - can handle parallel reads, > - can handle device going down while reading. > > Set ifname on the env class in NetDrvEnv, we already do that > in NetDrvEpEnv. > > KTAP version 1 > 1..7 > ok 1 stats.check_pause > ok 2 stats.check_fec > ok 3 stats.pkt_byte_sum > ok 4 stats.qstat_by_ifindex > ok 5 stats.check_down > ok 6 stats.procfs_hammer > # completed up/down cycles: 6 > ok 7 stats.procfs_downup_hammer > # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 > > Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Petr Machata <petrm@nvidia.com>
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index fea343f209ea..987e452d3a45 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -48,6 +48,7 @@ from .remote import Remote else: self._ns = NetdevSimDev(**kwargs) self.dev = self._ns.nsims[0].dev + self.ifname = self.dev['ifname'] self.ifindex = self.dev['ifindex'] def __enter__(self): diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 031ac9def6c0..55d647c006ed 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -2,12 +2,15 @@ # SPDX-License-Identifier: GPL-2.0 import errno +import subprocess +import time from lib.py import ksft_run, ksft_exit, ksft_pr -from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import ksft_ge, ksft_eq, ksft_is, ksft_in, ksft_lt, ksft_true, ksft_raises +from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_disruptive from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv -from lib.py import ip, defer +from lib.py import cmd, ip, defer ethnl = EthtoolFamily() netfam = NetdevFamily() @@ -174,10 +177,95 @@ rtnl = RtnlFamily() netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True) +def __run_inf_loop(body): + body = body.strip() + if body[-1] != ';': + body += ';' + + return subprocess.Popen(f"while true; do {body} done", shell=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + +def __stats_increase_sanely(old, new) -> None: + for k in old.keys(): + ksft_ge(new[k], old[k]) + ksft_lt(new[k] - old[k], 1 << 31, comment="likely wrapping error") + + +def procfs_hammer(cfg) -> None: + """ + Reading stats via procfs only holds the RCU lock, which is not an exclusive + lock, make sure drivers can handle parallel reads of stats. + """ + one = __run_inf_loop("cat /proc/net/dev") + defer(one.kill) + two = __run_inf_loop("cat /proc/net/dev") + defer(two.kill) + + time.sleep(1) + # Make sure the processes are running + ksft_is(one.poll(), None) + ksft_is(two.poll(), None) + + rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + time.sleep(2) + rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + __stats_increase_sanely(rtstat1, rtstat2) + # defers will kill the loops + + +@ksft_disruptive +def procfs_downup_hammer(cfg) -> None: + """ + Reading stats via procfs only holds the RCU lock, drivers often try + to sleep when reading the stats, or don't protect against races. + """ + # Max out the queues, we'll flip between max an 1 + channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + if channels['combined-count'] == 0: + rx_type = 'rx' + else: + rx_type = 'combined' + cur_queue_cnt = channels[f'{rx_type}-count'] + max_queue_cnt = channels[f'{rx_type}-max'] + + cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") + defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") + + # Real test stats + stats = __run_inf_loop("cat /proc/net/dev") + defer(stats.kill) + + ipset = f"ip link set dev {cfg.ifname}" + defer(ip, f"link set dev {cfg.ifname} up") + # The "echo -n 1" lets us count iterations below + updown = f"{ipset} down; sleep 0.05; {ipset} up; sleep 0.05; " + \ + f"ethtool -L {cfg.ifname} {rx_type} 1; " + \ + f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}; " + \ + "echo -n 1" + updown = __run_inf_loop(updown) + kill_updown = defer(updown.kill) + + time.sleep(1) + # Make sure the processes are running + ksft_is(stats.poll(), None) + ksft_is(updown.poll(), None) + + rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + # We're looking for crashes, give it extra time + time.sleep(9) + rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + __stats_increase_sanely(rtstat1, rtstat2) + + kill_updown.exec() + stdout, _ = updown.communicate(timeout=5) + ksft_pr("completed up/down cycles:", len(stdout.decode('utf-8'))) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, - check_down], + check_down, procfs_hammer, procfs_downup_hammer], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 477ae76de93d..3efe005436cd 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -71,6 +71,11 @@ KSFT_DISRUPTIVE = True _fail("Check failed", a, "not in", b, comment) +def ksft_is(a, b, comment=""): + if a is not b: + _fail("Check failed", a, "is not", b, comment) + + def ksft_ge(a, b, comment=""): if a < b: _fail("Check failed", a, "<", b, comment)
Most of our tests use rtnetlink to read device stats, so they don't expose the drivers much to paths in which device stats are read under RCU. Add tests which hammer profcs reads to make sure drivers: - don't sleep while reporting stats, - can handle parallel reads, - can handle device going down while reading. Set ifname on the env class in NetDrvEnv, we already do that in NetDrvEpEnv. KTAP version 1 1..7 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down ok 6 stats.procfs_hammer # completed up/down cycles: 6 ok 7 stats.procfs_downup_hammer # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- CC: shuah@kernel.org CC: willemb@google.com CC: petrm@nvidia.com CC: linux-kselftest@vger.kernel.org --- .../selftests/drivers/net/lib/py/env.py | 1 + tools/testing/selftests/drivers/net/stats.py | 94 ++++++++++++++++++- tools/testing/selftests/net/lib/py/ksft.py | 5 + 3 files changed, 97 insertions(+), 3 deletions(-)