diff mbox series

[net-next,v15,14/14] selftests: add ncdevmem, netcat for devmem TCP

Message ID 20240628003253.1694510-15-almasrymina@google.com (mailing list archive)
State Handled Elsewhere
Headers show
Series Device Memory TCP | expand

Commit Message

Mina Almasry June 28, 2024, 12:32 a.m. UTC
ncdevmem is a devmem TCP netcat. It works similarly to netcat, but it
sends and receives data using the devmem TCP APIs. It uses udmabuf as
the dmabuf provider. It is compatible with a regular netcat running on
a peer, or a ncdevmem running on a peer.

In addition to normal netcat support, ncdevmem has a validation mode,
where it sends a specific pattern and validates this pattern on the
receiver side to ensure data integrity.

Suggested-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>

---
v15:
- Fix linking against libynl. (Jakub)

v9: https://lore.kernel.org/netdev/20240403002053.2376017-15-almasrymina@google.com/
- Remove unused nic_pci_addr entry (Cong).

v6:
- Updated to bind 8 queues.
- Added RSS configuration.
- Added some more tests for the netlink API.

Changes in v1:
- Many more general cleanups (Willem).
- Removed driver reset (Jakub).
- Removed hardcoded if index (Paolo).

RFC v2:
- General cleanups (Willem).

---
 tools/testing/selftests/net/.gitignore |   1 +
 tools/testing/selftests/net/Makefile   |   9 +
 tools/testing/selftests/net/ncdevmem.c | 542 +++++++++++++++++++++++++
 3 files changed, 552 insertions(+)
 create mode 100644 tools/testing/selftests/net/ncdevmem.c

Comments

Taehee Yoo July 4, 2024, 7:59 a.m. UTC | #1
On Fri, Jun 28, 2024 at 9:38 AM Mina Almasry <almasrymina@google.com> wrote:
>

Hi Mina,
Thank you so much for this work!

> ncdevmem is a devmem TCP netcat. It works similarly to netcat, but it
> sends and receives data using the devmem TCP APIs. It uses udmabuf as
> the dmabuf provider. It is compatible with a regular netcat running on
> a peer, or a ncdevmem running on a peer.
>
> In addition to normal netcat support, ncdevmem has a validation mode,
> where it sends a specific pattern and validates this pattern on the
> receiver side to ensure data integrity.
>
> Suggested-by: Stanislav Fomichev <sdf@google.com>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
>
> ---
> v15:
> - Fix linking against libynl. (Jakub)
>
> v9: https://lore.kernel.org/netdev/20240403002053.2376017-15-almasrymina@google.com/
> - Remove unused nic_pci_addr entry (Cong).
>
> v6:
> - Updated to bind 8 queues.
> - Added RSS configuration.
> - Added some more tests for the netlink API.
>
> Changes in v1:
> - Many more general cleanups (Willem).
> - Removed driver reset (Jakub).
> - Removed hardcoded if index (Paolo).
>
> RFC v2:
> - General cleanups (Willem).
>
> ---
> tools/testing/selftests/net/.gitignore | 1 +
> tools/testing/selftests/net/Makefile | 9 +
> tools/testing/selftests/net/ncdevmem.c | 542 +++++++++++++++++++++++++
> 3 files changed, 552 insertions(+)
> create mode 100644 tools/testing/selftests/net/ncdevmem.c
>
> diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
> index 666ab7d9390b1..fe770903118c5 100644
> --- a/tools/testing/selftests/net/.gitignore
> +++ b/tools/testing/selftests/net/.gitignore
> @@ -17,6 +17,7 @@ ipv6_flowlabel
> ipv6_flowlabel_mgr
> log.txt
> msg_zerocopy
> +ncdevmem
> nettest
> psock_fanout
> psock_snd
> diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
> index bc3925200637c..39420a6e86b7f 100644
> --- a/tools/testing/selftests/net/Makefile
> +++ b/tools/testing/selftests/net/Makefile
> @@ -95,6 +95,11 @@ TEST_PROGS += fq_band_pktlimit.sh
> TEST_PROGS += vlan_hw_filter.sh
> TEST_PROGS += bpf_offload.py
>
> +# YNL files, must be before "include ..lib.mk"
> +EXTRA_CLEAN += $(OUTPUT)/libynl.a
> +YNL_GEN_FILES := ncdevmem
> +TEST_GEN_FILES += $(YNL_GEN_FILES)
> +
> TEST_FILES := settings
> TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
>
> @@ -104,6 +109,10 @@ TEST_INCLUDES := forwarding/lib.sh
>
> include ../lib.mk
>
> +# YNL build
> +YNL_GENS := netdev
> +include ynl.mk
> +
> $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
> $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
> $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
> diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
> new file mode 100644
> index 0000000000000..e00255e54f77b
> --- /dev/null
> +++ b/tools/testing/selftests/net/ncdevmem.c
> @@ -0,0 +1,542 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#define __EXPORTED_HEADERS__
> +
> +#include <linux/uio.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <errno.h>
> +#define __iovec_defined
> +#include <fcntl.h>
> +#include <malloc.h>
> +#include <error.h>
> +
> +#include <arpa/inet.h>
> +#include <sys/socket.h>
> +#include <sys/mman.h>
> +#include <sys/ioctl.h>
> +#include <sys/syscall.h>
> +
> +#include <linux/memfd.h>
> +#include <linux/if.h>
> +#include <linux/dma-buf.h>
> +#include <linux/udmabuf.h>
> +#include <libmnl/libmnl.h>
> +#include <linux/types.h>
> +#include <linux/netlink.h>
> +#include <linux/genetlink.h>
> +#include <linux/netdev.h>
> +#include <time.h>
> +
> +#include "netdev-user.h"
> +#include <ynl.h>
> +
> +#define PAGE_SHIFT 12
> +#define TEST_PREFIX "ncdevmem"
> +#define NUM_PAGES 16000
> +
> +#ifndef MSG_SOCK_DEVMEM
> +#define MSG_SOCK_DEVMEM 0x2000000
> +#endif
> +
> +/*
> + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
> + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
> + *
> + * Usage:
> + *
> + * On server:
> + * ncdevmem -s <server IP> -c <client IP> -f eth1 -d 3 -n 0000:06:00.0 -l \
> + * -p 5201 -v 7

The 'n' option disappeared, please remove it.


> + *
> + * On client:
> + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
> + * tr \\n \\0 | \
> + * head -c 5G | \
> + * nc <server IP> 5201 -p 5201
> + *
> + * Note this is compatible with regular netcat. i.e. the sender or receiver can
> + * be replaced with regular netcat to test the RX or TX path in isolation.
> + */
> +
> +static char *server_ip = "192.168.1.4";
> +static char *client_ip = "192.168.1.2";
> +static char *port = "5201";
> +static size_t do_validation;
> +static int start_queue = 8;
> +static int num_queues = 8;
> +static char *ifname = "eth1";
> +static unsigned int ifindex = 3;
> +static unsigned int iterations;
> +static unsigned int dmabuf_id;
> +
> +void print_bytes(void *ptr, size_t size)
> +{
> + unsigned char *p = ptr;
> + int i;
> +
> + for (i = 0; i < size; i++)
> + printf("%02hhX ", p[i]);
> + printf("\n");
> +}
> +
> +void print_nonzero_bytes(void *ptr, size_t size)
> +{
> + unsigned char *p = ptr;
> + unsigned int i;
> +
> + for (i = 0; i < size; i++)
> + putchar(p[i]);
> + printf("\n");
> +}
> +
> +void validate_buffer(void *line, size_t size)
> +{
> + static unsigned char seed = 1;
> + unsigned char *ptr = line;
> + int errors = 0;
> + size_t i;
> +
> + for (i = 0; i < size; i++) {
> + if (ptr[i] != seed) {
> + fprintf(stderr,
> + "Failed validation: expected=%u, actual=%u, index=%lu\n",
> + seed, ptr[i], i);
> + errors++;
> + if (errors > 20)
> + error(1, 0, "validation failed.");
> + }
> + seed++;
> + if (seed == do_validation)
> + seed = 0;
> + }
> +
> + fprintf(stdout, "Validated buffer\n");
> +}
> +
> +static void reset_flow_steering(void)
> +{
> + char command[256];
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple off",
> + "eth1");

I think we use ifname instead of "eth1".

> + system(command);
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple on",
> + "eth1");

Please use ifname instead of "eth1" too.

> + system(command);
> +}
> +
> +static void configure_rss(void)
> +{
> + char command[256];
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command), "sudo ethtool -X %s equal %d",
> + ifname, start_queue);
> + system(command);
> +}
> +
> +static void configure_flow_steering(void)
> +{
> + char command[256];
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command),
> + "sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d",
> + ifname, client_ip, server_ip, port, port, start_queue);
> + system(command);
> +}
> +
> +static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
> + struct netdev_queue_dmabuf *queues,
> + unsigned int n_queue_index, struct ynl_sock **ys)
> +{
> + struct netdev_bind_rx_req *req = NULL;
> + struct netdev_bind_rx_rsp *rsp = NULL;
> + struct ynl_error yerr;
> +
> + *ys = ynl_sock_create(&ynl_netdev_family, &yerr);
> + if (!*ys) {
> + fprintf(stderr, "YNL: %s\n", yerr.msg);
> + return -1;
> + }
> +
> + req = netdev_bind_rx_req_alloc();
> + netdev_bind_rx_req_set_ifindex(req, ifindex);
> + netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd);
> + __netdev_bind_rx_req_set_queues(req, queues, n_queue_index);
> +
> + rsp = netdev_bind_rx(*ys, req);
> + if (!rsp) {
> + perror("netdev_bind_rx");
> + goto err_close;
> + }
> +
> + if (!rsp->_present.dmabuf_id) {
> + perror("dmabuf_id not present");
> + goto err_close;
> + }
> +
> + printf("got dmabuf id=%d\n", rsp->dmabuf_id);
> + dmabuf_id = rsp->dmabuf_id;
> +
> + netdev_bind_rx_req_free(req);
> + netdev_bind_rx_rsp_free(rsp);
> +
> + return 0;
> +
> +err_close:
> + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
> + netdev_bind_rx_req_free(req);
> + ynl_sock_destroy(*ys);
> + return -1;
> +}
> +
> +static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size)
> +{
> + struct udmabuf_create create;
> + int ret;
> +
> + *devfd = open("/dev/udmabuf", O_RDWR);
> + if (*devfd < 0) {
> + error(70, 0,
> + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
> + TEST_PREFIX);
> + }
> +
> + *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
> + if (*memfd < 0)
> + error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX);
> +
> + /* Required for udmabuf */
> + ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK);
> + if (ret < 0)
> + error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
> +
> + ret = ftruncate(*memfd, dmabuf_size);
> + if (ret == -1)
> + error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
> +
> + memset(&create, 0, sizeof(create));
> +
> + create.memfd = *memfd;
> + create.offset = 0;
> + create.size = dmabuf_size;
> + *buf = ioctl(*devfd, UDMABUF_CREATE, &create);
> + if (*buf < 0)
> + error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
> +}
> +
> +int do_server(void)
> +{
> + char ctrl_data[sizeof(int) * 20000];
> + struct netdev_queue_dmabuf *queues;
> + size_t non_page_aligned_frags = 0;
> + struct sockaddr_in client_addr;
> + struct sockaddr_in server_sin;
> + size_t page_aligned_frags = 0;
> + int devfd, memfd, buf, ret;
> + size_t total_received = 0;
> + socklen_t client_addr_len;
> + bool is_devmem = false;
> + char *buf_mem = NULL;
> + struct ynl_sock *ys;
> + size_t dmabuf_size;
> + char iobuf[819200];
> + char buffer[256];
> + int socket_fd;
> + int client_fd;
> + size_t i = 0;
> + int opt = 1;
> +
> + dmabuf_size = getpagesize() * NUM_PAGES;
> +
> + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
> +
> + reset_flow_steering();
> +
> + /* Configure RSS to divert all traffic from our devmem queues */
> + configure_rss();
> +
> + /* Flow steer our devmem flows to start_queue */
> + configure_flow_steering();
> +
> + sleep(1);
> +
> + queues = malloc(sizeof(*queues) * num_queues);
> +
> + for (i = 0; i < num_queues; i++) {
> + queues[i]._present.type = 1;
> + queues[i]._present.idx = 1;
> + queues[i].type = NETDEV_QUEUE_TYPE_RX;
> + queues[i].idx = start_queue + i;
> + }
> +
> + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
> + error(1, 0, "Failed to bind\n");
> +
> + buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> + buf, 0);
> + if (buf_mem == MAP_FAILED)
> + error(1, 0, "mmap()");
> +
> + server_sin.sin_family = AF_INET;
> + server_sin.sin_port = htons(atoi(port));
> +
> + ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr);
> + if (socket < 0)
> + error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX);
> +
> + socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0);
> + if (socket < 0)
> + error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
> +
> + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt,
> + sizeof(opt));
> + if (ret)
> + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
> +
> + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt,
> + sizeof(opt));
> + if (ret)
> + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
> +
> + printf("binding to address %s:%d\n", server_ip,
> + ntohs(server_sin.sin_port));
> +
> + ret = bind(socket_fd, &server_sin, sizeof(server_sin));
> + if (ret)
> + error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
> +
> + ret = listen(socket_fd, 1);
> + if (ret)
> + error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
> +
> + client_addr_len = sizeof(client_addr);
> +
> + inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer,
> + sizeof(buffer));
> + printf("Waiting or connection on %s:%d\n", buffer,
> + ntohs(server_sin.sin_port));
> + client_fd = accept(socket_fd, &client_addr, &client_addr_len);
> +
> + inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer,
> + sizeof(buffer));
> + printf("Got connection from %s:%d\n", buffer,
> + ntohs(client_addr.sin_port));
> +
> + while (1) {
> + struct iovec iov = { .iov_base = iobuf,
> + .iov_len = sizeof(iobuf) };
> + struct dmabuf_cmsg *dmabuf_cmsg = NULL;
> + struct dma_buf_sync sync = { 0 };
> + struct cmsghdr *cm = NULL;
> + struct msghdr msg = { 0 };
> + struct dmabuf_token token;
> + ssize_t ret;
> +
> + is_devmem = false;
> + printf("\n\n");
> +
> + msg.msg_iov = &iov;
> + msg.msg_iovlen = 1;
> + msg.msg_control = ctrl_data;
> + msg.msg_controllen = sizeof(ctrl_data);
> + ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
> + printf("recvmsg ret=%ld\n", ret);
> + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
> + continue;
> + if (ret < 0) {
> + perror("recvmsg");
> + continue;
> + }
> + if (ret == 0) {
> + printf("client exited\n");
> + goto cleanup;
> + }
> +
> + i++;
> + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
> + if (cm->cmsg_level != SOL_SOCKET ||
> + (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
> + cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
> + fprintf(stdout, "skipping non-devmem cmsg\n");
> + continue;
> + }
> +
> + dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
> + is_devmem = true;
> +
> + if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
> + /* TODO: process data copied from skb's linear
> + * buffer.
> + */
> + fprintf(stdout,
> + "SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
> + dmabuf_cmsg->frag_size);
> +
> + continue;
> + }
> +
> + token.token_start = dmabuf_cmsg->frag_token;
> + token.token_count = 1;
> +
> + total_received += dmabuf_cmsg->frag_size;
> + printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
> + dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
> + dmabuf_cmsg->frag_offset % getpagesize(),
> + dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size,
> + dmabuf_cmsg->frag_token, total_received,
> + dmabuf_cmsg->dmabuf_id);
> +
> + if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
> + error(1, 0,
> + "received on wrong dmabuf_id: flow steering error\n");
> +
> + if (dmabuf_cmsg->frag_size % getpagesize())
> + non_page_aligned_frags++;
> + else
> + page_aligned_frags++;
> +
> + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START;
> + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
> +
> + if (do_validation)
> + validate_buffer(
> + ((unsigned char *)buf_mem) +
> + dmabuf_cmsg->frag_offset,
> + dmabuf_cmsg->frag_size);
> + else
> + print_nonzero_bytes(
> + ((unsigned char *)buf_mem) +
> + dmabuf_cmsg->frag_offset,
> + dmabuf_cmsg->frag_size);
> +
> + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END;
> + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
> +
> + ret = setsockopt(client_fd, SOL_SOCKET,
> + SO_DEVMEM_DONTNEED, &token,
> + sizeof(token));
> + if (ret != 1)
> + error(1, 0,
> + "SO_DEVMEM_DONTNEED not enough tokens");
> + }
> + if (!is_devmem)
> + error(1, 0, "flow steering error\n");
> +
> + printf("total_received=%lu\n", total_received);
> + }
> +
> + fprintf(stdout, "%s: ok\n", TEST_PREFIX);
> +
> + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
> + page_aligned_frags, non_page_aligned_frags);
> +
> + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
> + page_aligned_frags, non_page_aligned_frags);
> +
> +cleanup:
> +
> + munmap(buf_mem, dmabuf_size);
> + close(client_fd);
> + close(socket_fd);
> + close(buf);
> + close(memfd);
> + close(devfd);
> + ynl_sock_destroy(ys);
> +
> + return 0;
> +}
> +
> +void run_devmem_tests(void)
> +{
> + struct netdev_queue_dmabuf *queues;
> + int devfd, memfd, buf;
> + struct ynl_sock *ys;
> + size_t dmabuf_size;
> + size_t i = 0;
> +
> + dmabuf_size = getpagesize() * NUM_PAGES;
> +
> + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
> +
> + /* Configure RSS to divert all traffic from our devmem queues */
> + configure_rss();
> +
> + sleep(1);
> +
> + queues = malloc(sizeof(*queues) * num_queues);
> +
> + for (i = 0; i < num_queues; i++) {
> + queues[i]._present.type = 1;
> + queues[i]._present.idx = 1;
> + queues[i].type = NETDEV_QUEUE_TYPE_RX;
> + queues[i].idx = start_queue + i;
> + }
> +
> + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
> + error(1, 0, "Failed to bind\n");
> +
> + /* Closing the netlink socket does an implicit unbind */
> + ynl_sock_destroy(ys);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + int is_server = 0, opt;
> +
> + while ((opt = getopt(argc, argv, "ls:c:p:v:q:f:n:i:d:")) != -1) {

I think 't' option should be added here.

> + switch (opt) {
> + case 'l':
> + is_server = 1;
> + break;
> + case 's':
> + server_ip = optarg;
> + break;
> + case 'c':
> + client_ip = optarg;
> + break;
> + case 'p':
> + port = optarg;
> + break;
> + case 'v':
> + do_validation = atoll(optarg);
> + break;
> + case 'q':
> + num_queues = atoi(optarg);
> + break;
> + case 't':
> + start_queue = atoi(optarg);
> + break;
> + case 'f':
> + ifname = optarg;
> + break;
> + case 'd':
> + ifindex = atoi(optarg);

How about using if_nametoindex() instead of 'd' option?

> + break;
> + case 'i':
> + iterations = atoll(optarg);

I couldn't find a use of this variable.

> + break;
> + case '?':
> + printf("unknown option: %c\n", optopt);
> + break;
> + }
> + }
> +
> + for (; optind < argc; optind++)
> + printf("extra arguments: %s\n", argv[optind]);
> +
> + run_devmem_tests();
> +
> + if (is_server)
> + return do_server();
> +
> + return 0;
> +}
> --
> 2.45.2.803.g4e1b14247a-goog
>
>

Thanks a lot!
Taehee Yoo
diff mbox series

Patch

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 666ab7d9390b1..fe770903118c5 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -17,6 +17,7 @@  ipv6_flowlabel
 ipv6_flowlabel_mgr
 log.txt
 msg_zerocopy
+ncdevmem
 nettest
 psock_fanout
 psock_snd
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index bc3925200637c..39420a6e86b7f 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -95,6 +95,11 @@  TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
 
+# YNL files, must be before "include ..lib.mk"
+EXTRA_CLEAN += $(OUTPUT)/libynl.a
+YNL_GEN_FILES := ncdevmem
+TEST_GEN_FILES += $(YNL_GEN_FILES)
+
 TEST_FILES := settings
 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
 
@@ -104,6 +109,10 @@  TEST_INCLUDES := forwarding/lib.sh
 
 include ../lib.mk
 
+# YNL build
+YNL_GENS := netdev
+include ynl.mk
+
 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c
new file mode 100644
index 0000000000000..e00255e54f77b
--- /dev/null
+++ b/tools/testing/selftests/net/ncdevmem.c
@@ -0,0 +1,542 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <linux/uio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#define __iovec_defined
+#include <fcntl.h>
+#include <malloc.h>
+#include <error.h>
+
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include <linux/memfd.h>
+#include <linux/if.h>
+#include <linux/dma-buf.h>
+#include <linux/udmabuf.h>
+#include <libmnl/libmnl.h>
+#include <linux/types.h>
+#include <linux/netlink.h>
+#include <linux/genetlink.h>
+#include <linux/netdev.h>
+#include <time.h>
+
+#include "netdev-user.h"
+#include <ynl.h>
+
+#define PAGE_SHIFT 12
+#define TEST_PREFIX "ncdevmem"
+#define NUM_PAGES 16000
+
+#ifndef MSG_SOCK_DEVMEM
+#define MSG_SOCK_DEVMEM 0x2000000
+#endif
+
+/*
+ * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
+ * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
+ *
+ * Usage:
+ *
+ *	On server:
+ *	ncdevmem -s <server IP> -c <client IP> -f eth1 -d 3 -n 0000:06:00.0 -l \
+ *		-p 5201 -v 7
+ *
+ *	On client:
+ *	yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
+ *		tr \\n \\0 | \
+ *		head -c 5G | \
+ *		nc <server IP> 5201 -p 5201
+ *
+ * Note this is compatible with regular netcat. i.e. the sender or receiver can
+ * be replaced with regular netcat to test the RX or TX path in isolation.
+ */
+
+static char *server_ip = "192.168.1.4";
+static char *client_ip = "192.168.1.2";
+static char *port = "5201";
+static size_t do_validation;
+static int start_queue = 8;
+static int num_queues = 8;
+static char *ifname = "eth1";
+static unsigned int ifindex = 3;
+static unsigned int iterations;
+static unsigned int dmabuf_id;
+
+void print_bytes(void *ptr, size_t size)
+{
+	unsigned char *p = ptr;
+	int i;
+
+	for (i = 0; i < size; i++)
+		printf("%02hhX ", p[i]);
+	printf("\n");
+}
+
+void print_nonzero_bytes(void *ptr, size_t size)
+{
+	unsigned char *p = ptr;
+	unsigned int i;
+
+	for (i = 0; i < size; i++)
+		putchar(p[i]);
+	printf("\n");
+}
+
+void validate_buffer(void *line, size_t size)
+{
+	static unsigned char seed = 1;
+	unsigned char *ptr = line;
+	int errors = 0;
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		if (ptr[i] != seed) {
+			fprintf(stderr,
+				"Failed validation: expected=%u, actual=%u, index=%lu\n",
+				seed, ptr[i], i);
+			errors++;
+			if (errors > 20)
+				error(1, 0, "validation failed.");
+		}
+		seed++;
+		if (seed == do_validation)
+			seed = 0;
+	}
+
+	fprintf(stdout, "Validated buffer\n");
+}
+
+static void reset_flow_steering(void)
+{
+	char command[256];
+
+	memset(command, 0, sizeof(command));
+	snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple off",
+		 "eth1");
+	system(command);
+
+	memset(command, 0, sizeof(command));
+	snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple on",
+		 "eth1");
+	system(command);
+}
+
+static void configure_rss(void)
+{
+	char command[256];
+
+	memset(command, 0, sizeof(command));
+	snprintf(command, sizeof(command), "sudo ethtool -X %s equal %d",
+		 ifname, start_queue);
+	system(command);
+}
+
+static void configure_flow_steering(void)
+{
+	char command[256];
+
+	memset(command, 0, sizeof(command));
+	snprintf(command, sizeof(command),
+		 "sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d",
+		 ifname, client_ip, server_ip, port, port, start_queue);
+	system(command);
+}
+
+static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
+			 struct netdev_queue_dmabuf *queues,
+			 unsigned int n_queue_index, struct ynl_sock **ys)
+{
+	struct netdev_bind_rx_req *req = NULL;
+	struct netdev_bind_rx_rsp *rsp = NULL;
+	struct ynl_error yerr;
+
+	*ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+	if (!*ys) {
+		fprintf(stderr, "YNL: %s\n", yerr.msg);
+		return -1;
+	}
+
+	req = netdev_bind_rx_req_alloc();
+	netdev_bind_rx_req_set_ifindex(req, ifindex);
+	netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd);
+	__netdev_bind_rx_req_set_queues(req, queues, n_queue_index);
+
+	rsp = netdev_bind_rx(*ys, req);
+	if (!rsp) {
+		perror("netdev_bind_rx");
+		goto err_close;
+	}
+
+	if (!rsp->_present.dmabuf_id) {
+		perror("dmabuf_id not present");
+		goto err_close;
+	}
+
+	printf("got dmabuf id=%d\n", rsp->dmabuf_id);
+	dmabuf_id = rsp->dmabuf_id;
+
+	netdev_bind_rx_req_free(req);
+	netdev_bind_rx_rsp_free(rsp);
+
+	return 0;
+
+err_close:
+	fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
+	netdev_bind_rx_req_free(req);
+	ynl_sock_destroy(*ys);
+	return -1;
+}
+
+static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size)
+{
+	struct udmabuf_create create;
+	int ret;
+
+	*devfd = open("/dev/udmabuf", O_RDWR);
+	if (*devfd < 0) {
+		error(70, 0,
+		      "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
+		      TEST_PREFIX);
+	}
+
+	*memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
+	if (*memfd < 0)
+		error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX);
+
+	/* Required for udmabuf */
+	ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK);
+	if (ret < 0)
+		error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+
+	ret = ftruncate(*memfd, dmabuf_size);
+	if (ret == -1)
+		error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+
+	memset(&create, 0, sizeof(create));
+
+	create.memfd = *memfd;
+	create.offset = 0;
+	create.size = dmabuf_size;
+	*buf = ioctl(*devfd, UDMABUF_CREATE, &create);
+	if (*buf < 0)
+		error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
+}
+
+int do_server(void)
+{
+	char ctrl_data[sizeof(int) * 20000];
+	struct netdev_queue_dmabuf *queues;
+	size_t non_page_aligned_frags = 0;
+	struct sockaddr_in client_addr;
+	struct sockaddr_in server_sin;
+	size_t page_aligned_frags = 0;
+	int devfd, memfd, buf, ret;
+	size_t total_received = 0;
+	socklen_t client_addr_len;
+	bool is_devmem = false;
+	char *buf_mem = NULL;
+	struct ynl_sock *ys;
+	size_t dmabuf_size;
+	char iobuf[819200];
+	char buffer[256];
+	int socket_fd;
+	int client_fd;
+	size_t i = 0;
+	int opt = 1;
+
+	dmabuf_size = getpagesize() * NUM_PAGES;
+
+	create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
+
+	reset_flow_steering();
+
+	/* Configure RSS to divert all traffic from our devmem queues */
+	configure_rss();
+
+	/* Flow steer our devmem flows to start_queue */
+	configure_flow_steering();
+
+	sleep(1);
+
+	queues = malloc(sizeof(*queues) * num_queues);
+
+	for (i = 0; i < num_queues; i++) {
+		queues[i]._present.type = 1;
+		queues[i]._present.idx = 1;
+		queues[i].type = NETDEV_QUEUE_TYPE_RX;
+		queues[i].idx = start_queue + i;
+	}
+
+	if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
+		error(1, 0, "Failed to bind\n");
+
+	buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+		       buf, 0);
+	if (buf_mem == MAP_FAILED)
+		error(1, 0, "mmap()");
+
+	server_sin.sin_family = AF_INET;
+	server_sin.sin_port = htons(atoi(port));
+
+	ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr);
+	if (socket < 0)
+		error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX);
+
+	socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0);
+	if (socket < 0)
+		error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
+
+	ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt,
+			 sizeof(opt));
+	if (ret)
+		error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
+
+	ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt,
+			 sizeof(opt));
+	if (ret)
+		error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
+
+	printf("binding to address %s:%d\n", server_ip,
+	       ntohs(server_sin.sin_port));
+
+	ret = bind(socket_fd, &server_sin, sizeof(server_sin));
+	if (ret)
+		error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
+
+	ret = listen(socket_fd, 1);
+	if (ret)
+		error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
+
+	client_addr_len = sizeof(client_addr);
+
+	inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer,
+		  sizeof(buffer));
+	printf("Waiting or connection on %s:%d\n", buffer,
+	       ntohs(server_sin.sin_port));
+	client_fd = accept(socket_fd, &client_addr, &client_addr_len);
+
+	inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer,
+		  sizeof(buffer));
+	printf("Got connection from %s:%d\n", buffer,
+	       ntohs(client_addr.sin_port));
+
+	while (1) {
+		struct iovec iov = { .iov_base = iobuf,
+				     .iov_len = sizeof(iobuf) };
+		struct dmabuf_cmsg *dmabuf_cmsg = NULL;
+		struct dma_buf_sync sync = { 0 };
+		struct cmsghdr *cm = NULL;
+		struct msghdr msg = { 0 };
+		struct dmabuf_token token;
+		ssize_t ret;
+
+		is_devmem = false;
+		printf("\n\n");
+
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = ctrl_data;
+		msg.msg_controllen = sizeof(ctrl_data);
+		ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
+		printf("recvmsg ret=%ld\n", ret);
+		if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+			continue;
+		if (ret < 0) {
+			perror("recvmsg");
+			continue;
+		}
+		if (ret == 0) {
+			printf("client exited\n");
+			goto cleanup;
+		}
+
+		i++;
+		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+			if (cm->cmsg_level != SOL_SOCKET ||
+			    (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
+			     cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
+				fprintf(stdout, "skipping non-devmem cmsg\n");
+				continue;
+			}
+
+			dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
+			is_devmem = true;
+
+			if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
+				/* TODO: process data copied from skb's linear
+				 * buffer.
+				 */
+				fprintf(stdout,
+					"SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
+					dmabuf_cmsg->frag_size);
+
+				continue;
+			}
+
+			token.token_start = dmabuf_cmsg->frag_token;
+			token.token_count = 1;
+
+			total_received += dmabuf_cmsg->frag_size;
+			printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
+			       dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
+			       dmabuf_cmsg->frag_offset % getpagesize(),
+			       dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size,
+			       dmabuf_cmsg->frag_token, total_received,
+			       dmabuf_cmsg->dmabuf_id);
+
+			if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
+				error(1, 0,
+				      "received on wrong dmabuf_id: flow steering error\n");
+
+			if (dmabuf_cmsg->frag_size % getpagesize())
+				non_page_aligned_frags++;
+			else
+				page_aligned_frags++;
+
+			sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START;
+			ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
+
+			if (do_validation)
+				validate_buffer(
+					((unsigned char *)buf_mem) +
+						dmabuf_cmsg->frag_offset,
+					dmabuf_cmsg->frag_size);
+			else
+				print_nonzero_bytes(
+					((unsigned char *)buf_mem) +
+						dmabuf_cmsg->frag_offset,
+					dmabuf_cmsg->frag_size);
+
+			sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END;
+			ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
+
+			ret = setsockopt(client_fd, SOL_SOCKET,
+					 SO_DEVMEM_DONTNEED, &token,
+					 sizeof(token));
+			if (ret != 1)
+				error(1, 0,
+				      "SO_DEVMEM_DONTNEED not enough tokens");
+		}
+		if (!is_devmem)
+			error(1, 0, "flow steering error\n");
+
+		printf("total_received=%lu\n", total_received);
+	}
+
+	fprintf(stdout, "%s: ok\n", TEST_PREFIX);
+
+	fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
+		page_aligned_frags, non_page_aligned_frags);
+
+	fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
+		page_aligned_frags, non_page_aligned_frags);
+
+cleanup:
+
+	munmap(buf_mem, dmabuf_size);
+	close(client_fd);
+	close(socket_fd);
+	close(buf);
+	close(memfd);
+	close(devfd);
+	ynl_sock_destroy(ys);
+
+	return 0;
+}
+
+void run_devmem_tests(void)
+{
+	struct netdev_queue_dmabuf *queues;
+	int devfd, memfd, buf;
+	struct ynl_sock *ys;
+	size_t dmabuf_size;
+	size_t i = 0;
+
+	dmabuf_size = getpagesize() * NUM_PAGES;
+
+	create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
+
+	/* Configure RSS to divert all traffic from our devmem queues */
+	configure_rss();
+
+	sleep(1);
+
+	queues = malloc(sizeof(*queues) * num_queues);
+
+	for (i = 0; i < num_queues; i++) {
+		queues[i]._present.type = 1;
+		queues[i]._present.idx = 1;
+		queues[i].type = NETDEV_QUEUE_TYPE_RX;
+		queues[i].idx = start_queue + i;
+	}
+
+	if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
+		error(1, 0, "Failed to bind\n");
+
+	/* Closing the netlink socket does an implicit unbind */
+	ynl_sock_destroy(ys);
+}
+
+int main(int argc, char *argv[])
+{
+	int is_server = 0, opt;
+
+	while ((opt = getopt(argc, argv, "ls:c:p:v:q:f:n:i:d:")) != -1) {
+		switch (opt) {
+		case 'l':
+			is_server = 1;
+			break;
+		case 's':
+			server_ip = optarg;
+			break;
+		case 'c':
+			client_ip = optarg;
+			break;
+		case 'p':
+			port = optarg;
+			break;
+		case 'v':
+			do_validation = atoll(optarg);
+			break;
+		case 'q':
+			num_queues = atoi(optarg);
+			break;
+		case 't':
+			start_queue = atoi(optarg);
+			break;
+		case 'f':
+			ifname = optarg;
+			break;
+		case 'd':
+			ifindex = atoi(optarg);
+			break;
+		case 'i':
+			iterations = atoll(optarg);
+			break;
+		case '?':
+			printf("unknown option: %c\n", optopt);
+			break;
+		}
+	}
+
+	for (; optind < argc; optind++)
+		printf("extra arguments: %s\n", argv[optind]);
+
+	run_devmem_tests();
+
+	if (is_server)
+		return do_server();
+
+	return 0;
+}