diff mbox series

[blktests,v4,5/5] nvme/055: add test for nvme-tcp zero-copy offload

Message ID 20241126203857.27210-6-aaptel@nvidia.com (mailing list archive)
State New
Headers show
Series Add support to run against arbitrary targets | expand

Commit Message

Aurelien Aptel Nov. 26, 2024, 8:38 p.m. UTC
This commit adds a new test for the kernel ULP DDP (Direct Data
Placement) feature with NVMe-TCP.

Configuration of DDP is per NIC and is done through a script in the
kernel source. For this reason we add 2 new config vars:
- KERNELSRC: path to the running kernel sources
- NVME_IFACE: name of the network interface to configure the offload on

Signed-off-by: Aurelien Aptel <aaptel@nvidia.com>
Signed-off-by: Shai Malin <smalin@nvidia.com>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
---
 Documentation/running-tests.md |   9 ++
 README.md                      |   1 +
 common/rc                      |   8 +
 tests/nvme/055                 | 285 +++++++++++++++++++++++++++++++++
 tests/nvme/055.out             |  44 +++++
 tests/nvme/rc                  |   8 +
 6 files changed, 355 insertions(+)
 create mode 100755 tests/nvme/055
 create mode 100644 tests/nvme/055.out
diff mbox series

Patch

diff --git a/Documentation/running-tests.md b/Documentation/running-tests.md
index fe4f729..a42fc91 100644
--- a/Documentation/running-tests.md
+++ b/Documentation/running-tests.md
@@ -124,6 +124,15 @@  The NVMe tests can be additionally parameterized via environment variables.
   be skipped and this script gets called. This makes it possible to run
   the fabric nvme tests against a real target.
 
+#### NVMe-TCP zero-copy offload
+
+The NVMe-TCP ZC offload tests use a couple more variables.
+
+- KERNELSRC: Path to the sources of the running kernel.
+  Needed to run the script that configures the offload.
+- NVME_IFACE: Name of the interface the offload should be enabled on.
+  This should be the same interface the NVMe connection is made with.
+
 ### Running nvme-rdma and SRP tests
 
 These tests will use the siw (soft-iWARP) driver by default. The rdma_rxe
diff --git a/README.md b/README.md
index 55227d9..5073510 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@  Some tests require the following:
 - nbd-client and nbd-server (Debian) or nbd (Fedora, openSUSE, Arch Linux)
 - dmsetup (Debian) or device-mapper (Fedora, openSUSE, Arch Linux)
 - rublk (`cargo install --version=^0.1 rublk`) for ublk test
+- python3, ethtool, iproute2 for nvme-tcp zero-copy offload test
 
 Build blktests with `make`. Optionally, install it to a known location with
 `make install` (`/usr/local/blktests` by default, but this can be changed by
diff --git a/common/rc b/common/rc
index b2e68b2..0c8b51f 100644
--- a/common/rc
+++ b/common/rc
@@ -148,6 +148,14 @@  _have_loop() {
 	_have_driver loop && _have_program losetup
 }
 
+_have_kernel_source() {
+	if [ -z "${KERNELSRC}" ]; then
+		SKIP_REASONS+=("KERNELSRC not set")
+		return 1
+	fi
+	return 0
+}
+
 _have_blktrace() {
 	# CONFIG_BLK_DEV_IO_TRACE might still be disabled, but this is easier
 	# to check. We can fix it if someone complains.
diff --git a/tests/nvme/055 b/tests/nvme/055
new file mode 100755
index 0000000..7e76126
--- /dev/null
+++ b/tests/nvme/055
@@ -0,0 +1,285 @@ 
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2024 Aurelien Aptel <aaptel@nvidia.com>
+#
+# zero-copy offload
+
+. tests/nvme/rc
+
+DESCRIPTION="enable zero copy offload and run rw traffic"
+TIMED=1
+
+iface_idx="" # kernel interface index of NVME_IFACE, assigned in test()
+
+# these vars get updated after each call to connect_run_disconnect()
+nb_packets=0 # RX packets counted by ethtool during the last run
+nb_bytes=0 # RX bytes counted by ethtool during the last run
+nb_offload_packets=0 # rx-nvme-tcp-packets delta during the last run
+nb_offload_bytes=0 # rx-nvme-tcp-bytes delta during the last run
+offload_bytes_ratio=0 # nb_offload_bytes as a percentage of nb_bytes
+offload_packets_ratio=0 # nb_offload_packets as a percentage of nb_packets
+
+# Collect skip reasons for everything the zero-copy offload test depends on:
+# NVMe-TCP against a remote target, ULP DDP support in the kernel, the
+# nvme_tcp ddp_offload module parameter, the userspace tools, the kernel
+# sources and the NVME_IFACE setting.
+requires() {
+	_nvme_requires
+	_require_remote_nvme_target
+	_require_nvme_trtype tcp
+	_have_kernel_option ULP_DDP
+	# require nvme-tcp as a module to be able to change the ddp_offload param
+	_have_module nvme_tcp && _have_module_param nvme_tcp ddp_offload
+	_have_fio
+	_have_program ip
+	_have_program ethtool
+	# have_netlink_cli executes cli.py, so look for python3 first: a missing
+	# interpreter is then reported as such instead of as "Cannot run ..."
+	_have_kernel_source && _have_program python3 && have_netlink_cli
+	have_iface
+}
+
+# Check that the netlink CLI and the ULP DDP spec found under KERNELSRC are
+# usable. Records a skip reason and returns 1 on the first failed check.
+have_netlink_cli() {
+	local ynl_cli="${KERNELSRC}/tools/net/ynl/cli.py"
+	local ddp_spec="${KERNELSRC}/Documentation/netlink/specs/ulp_ddp.yaml"
+
+	if [[ ! -f "$ynl_cli" ]]; then
+		SKIP_REASONS+=("Kernel sources do not have tools/net/ynl/cli.py")
+		return 1
+	elif ! "$ynl_cli" -h &> /dev/null; then
+		# the file exists but cannot be executed successfully
+		SKIP_REASONS+=("Cannot run the kernel tools/net/ynl/cli.py")
+		return 1
+	elif [[ ! -f "$ddp_spec" ]]; then
+		SKIP_REASONS+=("Kernel sources do not have the ULP DDP netlink specs")
+		return 1
+	fi
+
+	return 0
+}
+
+# Check that NVME_IFACE is set and names an existing network interface.
+# Records a skip reason and returns 1 otherwise.
+have_iface() {
+	if [ -z "${NVME_IFACE}" ]; then
+		SKIP_REASONS+=("NVME_IFACE not set")
+		return 1
+	fi
+	# a mistyped name would otherwise only surface later, as an empty
+	# interface index in the netlink requests
+	if ! [ -e "/sys/class/net/${NVME_IFACE}" ]; then
+		SKIP_REASONS+=("NVME_IFACE ${NVME_IFACE} does not exist")
+		return 1
+	fi
+	return 0
+}
+
+set_conditions() { # presumably a blktests hook (confirm); only delegates to _set_nvme_trtype
+	_set_nvme_trtype "$@" # forward every argument unchanged
+}
+
+# Run the kernel netlink CLI with the ULP DDP spec, forwarding all arguments.
+netlink_cli() {
+	local ynl_cli="${KERNELSRC}/tools/net/ynl/cli.py"
+	local ddp_spec="${KERNELSRC}/Documentation/netlink/specs/ulp_ddp.yaml"
+
+	"$ynl_cli" --spec "$ddp_spec" "$@"
+}
+
+eth_stat() { # print the value of counter $1 from "ethtool -S" on NVME_IFACE
+	ethtool -S "${NVME_IFACE}" | awk "/ $1:/ { print \$2 }" # $1 is spliced into the awk regex by the shell; \$2 stays an awk field
+}
+
+ddp_stat() { # print the ULP DDP statistic named $1 for interface index $iface_idx
+	netlink_cli --do stats-get --json "{\"ifindex\": $iface_idx}" \
+		| awk -F: "/'$1'/{print \$2;}" | tr -d '{},' # match the single-quoted key, keep the text after ':', drop braces and commas
+}
+
+ddp_caps() { # print the capability value whose key matches $1 (callers pass "hw" or "active")
+	local out # raw reply of the caps-get request
+	out="$(netlink_cli --do caps-get --json "{\"ifindex\": $iface_idx}")"
+	echo "$out" | tr '{},' '\n' | tr -d ' '| awk -F: "/$1/ { print \$2 }" # one key:value per line, spaces removed, then print the value of matching lines
+}
+
+configure_ddp() {
+	local mod_param
+	local cap
+
+	mod_param=$1
+	cap=$2
+
+	echo "=== configured with ddp_offload=$mod_param and caps=$cap ==="
+
+	# set ddp_offload module param
+	modprobe -q -r nvme-tcp
+	modprobe -q nvme-tcp ddp_offload=$mod_param
+
+	# set capabilities
+	netlink_cli --do caps-set --json "{\"ifindex\": $iface_idx, \"wanted\": $cap, \"wanted_mask\": 3}" >> "$FULL" 2>&1
+}
+
+# Connect to the target, run fio with the given block size range, disconnect,
+# then report how the ULP DDP and ethtool RX counters moved during the run.
+#   $1: fio --blocksize_range value (e.g. 32k-1M)
+# Updates the globals nb_packets, nb_bytes, nb_offload_packets,
+# nb_offload_bytes, offload_packets_ratio and offload_bytes_ratio.
+connect_run_disconnect() {
+	local io_size
+	local nvme_dev
+	local nb_drop
+	local drop_ratio
+	local nb_resync
+	local resync_ratio
+
+	# offload stat counters
+	local start_sk_add
+	local start_sk_add_fail
+	local start_sk_del
+	local start_setup
+	local start_setup_fail
+	local start_teardown
+	local start_drop
+	local start_resync
+	local start_off_bytes
+	local start_eth_bytes
+	local start_off_packets
+	local start_eth_packets
+	local end_sk_add
+	local end_sk_add_fail
+	local end_sk_del
+	local end_setup
+	local end_setup_fail
+	local end_teardown
+	local end_drop
+	local end_resync
+	local end_off_bytes
+	local end_eth_bytes
+	local end_off_packets
+	local end_eth_packets
+
+	io_size=$1
+
+	# snapshot every counter before connecting so that only this run is
+	# accounted for
+	start_sk_add=$(ddp_stat rx-nvme-tcp-sk-add)
+	start_sk_add_fail=$(ddp_stat rx-nvme-tcp-sk-add-fail)
+	start_sk_del=$(ddp_stat rx-nvme-tcp-sk-del)
+	start_setup=$(ddp_stat rx-nvme-tcp-setup)
+	start_setup_fail=$(ddp_stat rx-nvme-tcp-setup-fail)
+	start_teardown=$(ddp_stat rx-nvme-tcp-teardown)
+	start_drop=$(ddp_stat rx-nvme-tcp-drop)
+	start_resync=$(ddp_stat rx-nvme-tcp-resync)
+	start_off_packets=$(ddp_stat rx-nvme-tcp-packets)
+	start_off_bytes=$(ddp_stat rx-nvme-tcp-bytes)
+	start_eth_packets=$(eth_stat rx_packets)
+	start_eth_bytes=$(eth_stat rx_bytes)
+	_nvme_connect_subsys --hdr-digest --data-digest --nr-io-queues 8
+
+	nvme_dev="/dev/$(_find_nvme_ns "${def_subsys_uuid}")"
+
+	local common_args=(
+		--blocksize_range="$io_size"
+		--rw=randrw
+		--numjobs=8
+		--iodepth=128
+		--name=randrw
+		--ioengine=libaio
+		--time_based
+		--runtime="$TIMEOUT"
+		--direct=1
+		--invalidate=1
+		--randrepeat=1
+		--norandommap
+		--filename="$nvme_dev"
+	)
+
+	echo "IO size: $io_size"
+
+	_run_fio "${common_args[@]}"
+	_nvme_disconnect_subsys >> "$FULL" 2>&1
+
+	end_sk_add=$(ddp_stat rx-nvme-tcp-sk-add)
+	end_sk_add_fail=$(ddp_stat rx-nvme-tcp-sk-add-fail)
+	end_sk_del=$(ddp_stat rx-nvme-tcp-sk-del)
+	end_setup=$(ddp_stat rx-nvme-tcp-setup)
+	end_setup_fail=$(ddp_stat rx-nvme-tcp-setup-fail)
+	end_teardown=$(ddp_stat rx-nvme-tcp-teardown)
+	end_drop=$(ddp_stat rx-nvme-tcp-drop)
+	end_resync=$(ddp_stat rx-nvme-tcp-resync)
+	end_off_packets=$(ddp_stat rx-nvme-tcp-packets)
+	end_eth_packets=$(eth_stat rx_packets)
+	end_off_bytes=$(ddp_stat rx-nvme-tcp-bytes)
+	end_eth_bytes=$(eth_stat rx_bytes)
+
+	echo "Offloaded sockets: $((end_sk_add - start_sk_add))"
+	echo "Failed sockets:    $((end_sk_add_fail - start_sk_add_fail))"
+	echo "Unoffloaded sockets:   $((end_sk_del - start_sk_del))"
+	# a setup without a matching teardown during this run is a leak; use
+	# per-run deltas so an imbalance from before the run is not reported
+	echo "Offload packet leaked: $(( (end_setup - start_setup) - (end_teardown - start_teardown) ))"
+	echo "Failed packet setup:   $((end_setup_fail - start_setup_fail))"
+
+	# per-run results: drop/resync are local, the others are global vars
+	nb_drop=$(( end_drop - start_drop ))
+	nb_resync=$(( end_resync - start_resync ))
+	nb_packets=$(( end_eth_packets - start_eth_packets ))
+	nb_offload_packets=$(( end_off_packets - start_off_packets ))
+	nb_bytes=$(( end_eth_bytes - start_eth_bytes ))
+	nb_offload_bytes=$(( end_off_bytes - start_off_bytes ))
+
+	offload_packets_ratio=0
+	offload_bytes_ratio=0
+
+	# sanity check and avoid div by zero in ratio calculation
+	if [[ nb_bytes -eq 0 || nb_packets -eq 0 ]]; then
+		echo "No traffic: $nb_bytes bytes, $nb_packets packets"
+		return
+	fi
+
+	offload_packets_ratio=$(( nb_offload_packets*100/nb_packets ))
+	offload_bytes_ratio=$(( nb_offload_bytes*100/nb_bytes ))
+
+	drop_ratio=$(( nb_drop*100/nb_packets ))
+	resync_ratio=$(( nb_resync*100/nb_packets ))
+	# plain "if" so that a healthy run does not make the function return
+	# the non-zero status of a false test
+	if [[ drop_ratio -gt 5 ]]; then
+		echo "High drop ratio: $drop_ratio %"
+	fi
+	if [[ resync_ratio -gt 5 ]]; then
+		echo "High resync ratio: $resync_ratio %"
+	fi
+}
+
+# Run fio over nvme-tcp with every combination of the ddp_offload module
+# parameter and the interface DDP capabilities, print the offload statistics
+# of each run, then restore the configuration found at start.
+# The test is skipped if the interface index cannot be resolved or if the
+# hardware does not report both capability bits.
+test() {
+	local starting_ddp_config
+	local hw_caps
+
+	: "${TIMEOUT:=30}"
+
+	echo "Running ${TEST_NAME}"
+
+	# get iface index: query the device by name rather than pattern-matching
+	# the whole "ip address" output, which could match another interface or
+	# an unrelated line that merely contains the name
+	iface_idx=$(ip -o link show dev "${NVME_IFACE}" 2>> "$FULL" \
+		| awk -F: '{ print $1; exit }')
+	if [[ -z "$iface_idx" ]]; then
+		SKIP_REASONS+=("cannot find the interface index of ${NVME_IFACE}")
+		return
+	fi
+
+	# check hw supports ddp; an empty reply counts as "no capability"
+	hw_caps=$(ddp_caps hw)
+	if [[ $(( ${hw_caps:-0} & 3 )) -ne 3 ]]; then
+		SKIP_REASONS+=("${NVME_IFACE} does not support nvme-tcp ddp offload")
+		return
+	fi
+
+	_setup_nvmet
+	_nvmet_target_setup
+
+	# remember the current module param and active caps as the two
+	# arguments of configure_ddp, to restore them at the end
+	if [ "$(cat "/sys/module/nvme_tcp/parameters/ddp_offload")" = Y ]; then
+		starting_ddp_config="1 $(ddp_caps active)"
+	else
+		starting_ddp_config="0 $(ddp_caps active)"
+	fi
+
+	# if any of the offload knobs are disabled, no offload should occur
+	# and offloaded packets & bytes should be zero
+
+	configure_ddp 0 0
+	connect_run_disconnect 32k-1M
+	echo "Offloaded packets: $nb_offload_packets"
+	echo "Offloaded bytes: $nb_offload_bytes"
+
+	configure_ddp 0 3
+	connect_run_disconnect 32k-1M
+	echo "Offloaded packets: $nb_offload_packets"
+	echo "Offloaded bytes: $nb_offload_bytes"
+
+	configure_ddp 1 0
+	connect_run_disconnect 32k-1M
+	echo "Offloaded packets: $nb_offload_packets"
+	echo "Offloaded bytes: $nb_offload_bytes"
+
+	# if everything is enabled, the offload should happen for large IOs only
+	configure_ddp 1 3
+
+	# nothing is printed when the offload works as expected; each line
+	# below only shows up (and breaks the expected output) on a shortfall
+	connect_run_disconnect 32k-1M
+	[[ nb_offload_packets -lt 100 ]] && echo "Low offloaded packets: $nb_offload_packets"
+	[[ nb_offload_bytes -lt 32768 ]] && echo "Low offloaded bytes: $nb_offload_bytes"
+	[[ offload_bytes_ratio -lt 90 ]] && echo "Low offloaded bytes ratio: $offload_bytes_ratio %"
+	[[ offload_packets_ratio -lt 95 ]] && echo "Low offloaded packets ratio: $offload_packets_ratio %"
+
+	# small IO should be under the offload threshold, ratio should be zero
+	connect_run_disconnect 4k-16k
+	echo "Offload bytes ratio: $offload_bytes_ratio %"
+	echo "Offload packets ratio: $offload_packets_ratio %"
+
+	_nvmet_target_cleanup
+
+	# restore starting config
+	# shellcheck disable=SC2086 # split on purpose into "<param> <caps>"
+	configure_ddp $starting_ddp_config > /dev/null
+
+	echo "Test complete"
+}
diff --git a/tests/nvme/055.out b/tests/nvme/055.out
new file mode 100644
index 0000000..06706a6
--- /dev/null
+++ b/tests/nvme/055.out
@@ -0,0 +1,44 @@ 
+Running nvme/055
+=== configured with ddp_offload=0 and caps=0 ===
+IO size: 32k-1M
+Offloaded sockets: 0
+Failed sockets:    0
+Unoffloaded sockets:   0
+Offload packet leaked: 0
+Failed packet setup:   0
+Offloaded packets: 0
+Offloaded bytes: 0
+=== configured with ddp_offload=0 and caps=3 ===
+IO size: 32k-1M
+Offloaded sockets: 0
+Failed sockets:    0
+Unoffloaded sockets:   0
+Offload packet leaked: 0
+Failed packet setup:   0
+Offloaded packets: 0
+Offloaded bytes: 0
+=== configured with ddp_offload=1 and caps=0 ===
+IO size: 32k-1M
+Offloaded sockets: 0
+Failed sockets:    0
+Unoffloaded sockets:   0
+Offload packet leaked: 0
+Failed packet setup:   0
+Offloaded packets: 0
+Offloaded bytes: 0
+=== configured with ddp_offload=1 and caps=3 ===
+IO size: 32k-1M
+Offloaded sockets: 8
+Failed sockets:    0
+Unoffloaded sockets:   8
+Offload packet leaked: 0
+Failed packet setup:   0
+IO size: 4k-16k
+Offloaded sockets: 8
+Failed sockets:    0
+Unoffloaded sockets:   8
+Offload packet leaked: 0
+Failed packet setup:   0
+Offload bytes ratio: 0 %
+Offload packets ratio: 0 %
+Test complete
diff --git a/tests/nvme/rc b/tests/nvme/rc
index d1a4c01..4a43e43 100644
--- a/tests/nvme/rc
+++ b/tests/nvme/rc
@@ -199,6 +199,14 @@  _require_kernel_nvme_target() {
 	return 0
 }
 
+_require_remote_nvme_target() {
+	if [ -z "${nvme_target_control}" ]; then
+		SKIP_REASONS+=("Remote target required but NVME_TARGET_CONTROL is not set")
+		return 1
+	fi
+	return 0
+}
+
 _test_dev_nvme_ctrl() {
 	echo "/dev/char/$(cat "${TEST_DEV_SYSFS}/device/dev")"
 }