diff mbox series

[net-next,v3,13/24] ovpn: implement TCP transport

Message ID 20240506011637.27272-14-antonio@openvpn.net (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series Introducing OpenVPN Data Channel Offload | expand

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; GEN HAS DIFF 2 files changed, 2613 insertions(+);
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 932 this patch: 932
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: openvpn-devel@lists.sourceforge.net
netdev/build_clang success Errors and warnings before: 938 this patch: 938
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 944 this patch: 944
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 1 this patch: 4
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-05-07--03-00 (tests: 1000)

Commit Message

Antonio Quartulli May 6, 2024, 1:16 a.m. UTC
With this changem ovpn is allowed to communicate to peers also via TCP.

Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
---
 drivers/net/ovpn/Makefile |   1 +
 drivers/net/ovpn/io.c     |   5 +
 drivers/net/ovpn/main.c   |   9 +-
 drivers/net/ovpn/peer.h   |  29 +++
 drivers/net/ovpn/skb.h    |  51 ++++
 drivers/net/ovpn/socket.c |  20 ++
 drivers/net/ovpn/socket.h |  15 +-
 drivers/net/ovpn/tcp.c    | 511 ++++++++++++++++++++++++++++++++++++++
 drivers/net/ovpn/tcp.h    |  42 ++++
 9 files changed, 681 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ovpn/skb.h
 create mode 100644 drivers/net/ovpn/tcp.c
 create mode 100644 drivers/net/ovpn/tcp.h

Comments

Antonio Quartulli May 13, 2024, 1:37 p.m. UTC | #1
Hi Simon,

On 06/05/2024 03:16, Antonio Quartulli wrote:
[...]
> diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
> index b5ff59a4b40f..ac4907705d98 100644
> --- a/drivers/net/ovpn/peer.h
> +++ b/drivers/net/ovpn/peer.h
> @@ -33,6 +33,16 @@
>    * @netif_rx_ring: queue of packets to be sent to the netdevice via NAPI
>    * @napi: NAPI object
>    * @sock: the socket being used to talk to this peer
> + * @tcp.tx_ring: queue for packets to be forwarded to userspace (TCP only)
> + * @tcp.tx_work: work for processing outgoing socket data (TCP only)
> + * @tcp.rx_work: wok for processing incoming socket data (TCP only)
> + * @tcp.raw_len: next packet length as read from the stream (TCP only)

can you please help me with the following warning from kerneldoc?
As you can see below, raw_len is an array.

May that be the reason why the script isn't picking it up correctly?

drivers/net/ovpn/peer.h:101: warning: Function parameter or struct 
member 'raw_len' not described in 'ovpn_peer'
drivers/net/ovpn/peer.h:101: warning: Excess struct member 'tcp.raw_len' 
description in 'ovpn_peer'

(line number may differ as I am in the middle of a rebase)

Regards,


> + * @tcp.skb: next packet being filled with data from the stream (TCP only)
> + * @tcp.offset: position of the next byte to write in the skb (TCP only)
> + * @tcp.data_len: next packet length converted to host order (TCP only)
> + * @tcp.sk_cb.sk_data_ready: pointer to original cb
> + * @tcp.sk_cb.sk_write_space: pointer to original cb
> + * @tcp.sk_cb.prot: pointer to original prot object
>    * @crypto: the crypto configuration (ciphers, keys, etc..)
>    * @dst_cache: cache for dst_entry used to send to peer
>    * @bind: remote peer binding
> @@ -59,6 +69,25 @@ struct ovpn_peer {
>   	struct ptr_ring netif_rx_ring;
>   	struct napi_struct napi;
>   	struct ovpn_socket *sock;
> +	/* state of the TCP reading. Needed to keep track of how much of a
> +	 * single packet has already been read from the stream and how much is
> +	 * missing
> +	 */
> +	struct {
> +		struct ptr_ring tx_ring;
> +		struct work_struct tx_work;
> +		struct work_struct rx_work;
> +
> +		u8 raw_len[sizeof(u16)];
> +		struct sk_buff *skb;
> +		u16 offset;
> +		u16 data_len;
> +		struct {
> +			void (*sk_data_ready)(struct sock *sk);
> +			void (*sk_write_space)(struct sock *sk);
> +			struct proto *prot;
> +		} sk_cb;
> +	} tcp;
>   	struct ovpn_crypto_state crypto;
>   	struct dst_cache dst_cache;
>   	struct ovpn_bind __rcu *bind;
> diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h
> new file mode 100644
> index 000000000000..ba92811e12ff
> --- /dev/null
> +++ b/drivers/net/ovpn/skb.h
> @@ -0,0 +1,51 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*  OpenVPN data channel offload
> + *
> + *  Copyright (C) 2020-2024 OpenVPN, Inc.
> + *
> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
> + *		James Yonan <james@openvpn.net>
> + */
> +
> +#ifndef _NET_OVPN_SKB_H_
> +#define _NET_OVPN_SKB_H_
> +
> +#include <linux/in.h>
> +#include <linux/in6.h>
> +#include <linux/ip.h>
> +#include <linux/skbuff.h>
> +#include <linux/socket.h>
> +#include <linux/types.h>
> +
> +#define OVPN_SKB_CB(skb) ((struct ovpn_skb_cb *)&((skb)->cb))
> +
> +struct ovpn_skb_cb {
> +	union {
> +		struct in_addr ipv4;
> +		struct in6_addr ipv6;
> +	} local;
> +	sa_family_t sa_fam;
> +};
> +
> +/* Return IP protocol version from skb header.
> + * Return 0 if protocol is not IPv4/IPv6 or cannot be read.
> + */
> +static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb)
> +{
> +	__be16 proto = 0;
> +
> +	/* skb could be non-linear,
> +	 * make sure IP header is in non-fragmented part
> +	 */
> +	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
> +		return 0;
> +
> +	if (ip_hdr(skb)->version == 4)
> +		proto = htons(ETH_P_IP);
> +	else if (ip_hdr(skb)->version == 6)
> +		proto = htons(ETH_P_IPV6);
> +
> +	return proto;
> +}
> +
> +#endif /* _NET_OVPN_SKB_H_ */
> diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
> index e099a61b03fa..004db5b13663 100644
> --- a/drivers/net/ovpn/socket.c
> +++ b/drivers/net/ovpn/socket.c
> @@ -16,6 +16,7 @@
>   #include "packet.h"
>   #include "peer.h"
>   #include "socket.h"
> +#include "tcp.h"
>   #include "udp.h"
>   
>   /* Finalize release of socket, called after RCU grace period */
> @@ -26,6 +27,8 @@ static void ovpn_socket_detach(struct socket *sock)
>   
>   	if (sock->sk->sk_protocol == IPPROTO_UDP)
>   		ovpn_udp_socket_detach(sock);
> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
> +		ovpn_tcp_socket_detach(sock);
>   
>   	sockfd_put(sock);
>   }
> @@ -69,6 +72,8 @@ static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer)
>   
>   	if (sock->sk->sk_protocol == IPPROTO_UDP)
>   		ret = ovpn_udp_socket_attach(sock, peer->ovpn);
> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
> +		ret = ovpn_tcp_socket_attach(sock, peer);
>   
>   	return ret;
>   }
> @@ -124,6 +129,21 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
>   	ovpn_sock->sock = sock;
>   	kref_init(&ovpn_sock->refcount);
>   
> +	/* TCP sockets are per-peer, therefore they are linked to their unique
> +	 * peer
> +	 */
> +	if (sock->sk->sk_protocol == IPPROTO_TCP) {
> +		ovpn_sock->peer = peer;
> +		ret = ptr_ring_init(&ovpn_sock->recv_ring, OVPN_QUEUE_LEN,
> +				    GFP_KERNEL);
> +		if (ret < 0) {
> +			netdev_err(peer->ovpn->dev, "%s: cannot allocate TCP recv ring\n",
> +				   __func__);
> +			kfree(ovpn_sock);
> +			return ERR_PTR(ret);
> +		}
> +	}
> +
>   	rcu_assign_sk_user_data(sock->sk, ovpn_sock);
>   
>   	return ovpn_sock;
> diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h
> index 0d23de5a9344..88c6271ba5c7 100644
> --- a/drivers/net/ovpn/socket.h
> +++ b/drivers/net/ovpn/socket.h
> @@ -21,12 +21,25 @@ struct ovpn_peer;
>   /**
>    * struct ovpn_socket - a kernel socket referenced in the ovpn code
>    * @ovpn: ovpn instance owning this socket (UDP only)
> + * @peer: unique peer transmitting over this socket (TCP only)
> + * @recv_ring: queue where non-data packets directed to userspace are stored
>    * @sock: the low level sock object
>    * @refcount: amount of contexts currently referencing this object
>    * @rcu: member used to schedule RCU destructor callback
>    */
>   struct ovpn_socket {
> -	struct ovpn_struct *ovpn;
> +	union {
> +		/* the VPN session object owning this socket (UDP only) */
> +		struct ovpn_struct *ovpn;
> +
> +		/* TCP only */
> +		struct {
> +			/** @peer: unique peer transmitting over this socket */
> +			struct ovpn_peer *peer;
> +			struct ptr_ring recv_ring;
> +		};
> +	};
> +
>   	struct socket *sock;
>   	struct kref refcount;
>   	struct rcu_head rcu;
> diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
> new file mode 100644
> index 000000000000..84ad7cd4fc4f
> --- /dev/null
> +++ b/drivers/net/ovpn/tcp.c
> @@ -0,0 +1,511 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*  OpenVPN data channel offload
> + *
> + *  Copyright (C) 2019-2024 OpenVPN, Inc.
> + *
> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
> + */
> +
> +#include <linux/ptr_ring.h>
> +#include <linux/skbuff.h>
> +#include <net/tcp.h>
> +#include <net/route.h>
> +
> +#include "ovpnstruct.h"
> +#include "main.h"
> +#include "io.h"
> +#include "packet.h"
> +#include "peer.h"
> +#include "proto.h"
> +#include "skb.h"
> +#include "socket.h"
> +#include "tcp.h"
> +
> +static struct proto ovpn_tcp_prot;
> +
> +static int ovpn_tcp_read_sock(read_descriptor_t *desc, struct sk_buff *in_skb,
> +			      unsigned int in_offset, size_t in_len)
> +{
> +	struct sock *sk = desc->arg.data;
> +	struct ovpn_socket *sock;
> +	struct ovpn_skb_cb *cb;
> +	struct ovpn_peer *peer;
> +	size_t chunk, copied = 0;
> +	void *data;
> +	u16 len;
> +	int st;
> +
> +	rcu_read_lock();
> +	sock = rcu_dereference_sk_user_data(sk);
> +	rcu_read_unlock();
> +
> +	if (unlikely(!sock || !sock->peer)) {
> +		pr_err("ovpn: read_sock triggered for socket with no metadata\n");
> +		desc->error = -EINVAL;
> +		return 0;
> +	}
> +
> +	peer = sock->peer;
> +
> +	while (in_len > 0) {
> +		/* no skb allocated means that we have to read (or finish
> +		 * reading) the 2 bytes prefix containing the actual packet
> +		 * size.
> +		 */
> +		if (!peer->tcp.skb) {
> +			chunk = min_t(size_t, in_len,
> +				      sizeof(u16) - peer->tcp.offset);
> +			WARN_ON(skb_copy_bits(in_skb, in_offset,
> +					      peer->tcp.raw_len +
> +					      peer->tcp.offset, chunk) < 0);
> +			peer->tcp.offset += chunk;
> +
> +			/* keep on reading until we got the whole packet size */
> +			if (peer->tcp.offset != sizeof(u16))
> +				goto next_read;
> +
> +			len = ntohs(*(__be16 *)peer->tcp.raw_len);
> +			/* invalid packet length: this is a fatal TCP error */
> +			if (!len) {
> +				netdev_err(peer->ovpn->dev,
> +					   "%s: received invalid packet length: %d\n",
> +					   __func__, len);
> +				desc->error = -EINVAL;
> +				goto err;
> +			}
> +
> +			/* add 2 bytes to allocated space (and immediately
> +			 * reserve them) for packet length prepending, in case
> +			 * the skb has to be forwarded to userspace
> +			 */
> +			peer->tcp.skb =
> +				netdev_alloc_skb_ip_align(peer->ovpn->dev,
> +							  len + sizeof(u16));
> +			if (!peer->tcp.skb) {
> +				desc->error = -ENOMEM;
> +				goto err;
> +			}
> +			skb_reserve(peer->tcp.skb, sizeof(u16));
> +
> +			peer->tcp.offset = 0;
> +			peer->tcp.data_len = len;
> +		} else {
> +			chunk = min_t(size_t, in_len,
> +				      peer->tcp.data_len - peer->tcp.offset);
> +
> +			/* extend skb to accommodate the new chunk and copy it
> +			 * from the input skb
> +			 */
> +			data = skb_put(peer->tcp.skb, chunk);
> +			WARN_ON(skb_copy_bits(in_skb, in_offset, data,
> +					      chunk) < 0);
> +			peer->tcp.offset += chunk;
> +
> +			/* keep on reading until we get the full packet */
> +			if (peer->tcp.offset != peer->tcp.data_len)
> +				goto next_read;
> +
> +			/* do not perform IP caching for TCP connections */
> +			cb = OVPN_SKB_CB(peer->tcp.skb);
> +			cb->sa_fam = AF_UNSPEC;
> +
> +			/* At this point we know the packet is from a configured
> +			 * peer.
> +			 * DATA_V2 packets are handled in kernel space, the rest
> +			 * goes to user space.
> +			 *
> +			 * Queue skb for sending to userspace via recvmsg on the
> +			 * socket
> +			 */
> +			if (likely(ovpn_opcode_from_skb(peer->tcp.skb, 0) ==
> +				   OVPN_DATA_V2)) {
> +				/* hold reference to peer as required by
> +				 * ovpn_recv().
> +				 *
> +				 * NOTE: in this context we should already be
> +				 * holding a reference to this peer, therefore
> +				 * ovpn_peer_hold() is not expected to fail
> +				 */
> +				WARN_ON(!ovpn_peer_hold(peer));
> +				st = ovpn_recv(peer->ovpn, peer, peer->tcp.skb);
> +				if (unlikely(st < 0))
> +					ovpn_peer_put(peer);
> +
> +			} else {
> +				/* prepend skb with packet len. this way
> +				 * userspace can parse the packet as if it just
> +				 * arrived from the remote endpoint
> +				 */
> +				void *raw_len = __skb_push(peer->tcp.skb,
> +							   sizeof(u16));
> +
> +				memcpy(raw_len, peer->tcp.raw_len, sizeof(u16));
> +
> +				st = ptr_ring_produce_bh(&peer->sock->recv_ring,
> +							 peer->tcp.skb);
> +				if (likely(!st))
> +					peer->tcp.sk_cb.sk_data_ready(sk);
> +			}
> +
> +			/* skb not consumed - free it now */
> +			if (unlikely(st < 0))
> +				kfree_skb(peer->tcp.skb);
> +
> +			peer->tcp.skb = NULL;
> +			peer->tcp.offset = 0;
> +			peer->tcp.data_len = 0;
> +		}
> +next_read:
> +		in_len -= chunk;
> +		in_offset += chunk;
> +		copied += chunk;
> +	}
> +
> +	return copied;
> +err:
> +	netdev_err(peer->ovpn->dev, "cannot process incoming TCP data: %d\n",
> +		   desc->error);
> +	ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
> +	return 0;
> +}
> +
> +static void ovpn_tcp_data_ready(struct sock *sk)
> +{
> +	struct socket *sock = sk->sk_socket;
> +	read_descriptor_t desc;
> +
> +	if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
> +		return;
> +
> +	desc.arg.data = sk;
> +	desc.error = 0;
> +	desc.count = 1;
> +
> +	sock->ops->read_sock(sk, &desc, ovpn_tcp_read_sock);
> +}
> +
> +static void ovpn_tcp_write_space(struct sock *sk)
> +{
> +	struct ovpn_socket *sock;
> +
> +	rcu_read_lock();
> +	sock = rcu_dereference_sk_user_data(sk);
> +	rcu_read_unlock();
> +
> +	if (!sock || !sock->peer)
> +		return;
> +
> +	queue_work(sock->peer->ovpn->events_wq, &sock->peer->tcp.tx_work);
> +}
> +
> +static bool ovpn_tcp_sock_is_readable(struct sock *sk)
> +
> +{
> +	struct ovpn_socket *sock;
> +
> +	rcu_read_lock();
> +	sock = rcu_dereference_sk_user_data(sk);
> +	rcu_read_unlock();
> +
> +	if (!sock || !sock->peer)
> +		return false;
> +
> +	return !ptr_ring_empty_bh(&sock->recv_ring);
> +}
> +
> +static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> +			    int flags, int *addr_len)
> +{
> +	bool tmp = flags & MSG_DONTWAIT;
> +	DEFINE_WAIT_FUNC(wait, woken_wake_function);
> +	int ret, chunk, copied = 0;
> +	struct ovpn_socket *sock;
> +	struct sk_buff *skb;
> +	long timeo;
> +
> +	if (unlikely(flags & MSG_ERRQUEUE))
> +		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
> +
> +	timeo = sock_rcvtimeo(sk, tmp);
> +
> +	rcu_read_lock();
> +	sock = rcu_dereference_sk_user_data(sk);
> +	rcu_read_unlock();
> +
> +	if (!sock || !sock->peer) {
> +		ret = -EBADF;
> +		goto unlock;
> +	}
> +
> +	while (ptr_ring_empty_bh(&sock->recv_ring)) {
> +		if (sk->sk_shutdown & RCV_SHUTDOWN)
> +			return 0;
> +
> +		if (sock_flag(sk, SOCK_DONE))
> +			return 0;
> +
> +		if (!timeo) {
> +			ret = -EAGAIN;
> +			goto unlock;
> +		}
> +
> +		add_wait_queue(sk_sleep(sk), &wait);
> +		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
> +		sk_wait_event(sk, &timeo, !ptr_ring_empty_bh(&sock->recv_ring),
> +			      &wait);
> +		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
> +		remove_wait_queue(sk_sleep(sk), &wait);
> +
> +		/* take care of signals */
> +		if (signal_pending(current)) {
> +			ret = sock_intr_errno(timeo);
> +			goto unlock;
> +		}
> +	}
> +
> +	while (len && (skb = __ptr_ring_peek(&sock->recv_ring))) {
> +		chunk = min_t(size_t, len, skb->len);
> +		ret = skb_copy_datagram_msg(skb, 0, msg, chunk);
> +		if (ret < 0) {
> +			pr_err("ovpn: cannot copy TCP data to userspace: %d\n",
> +			       ret);
> +			kfree_skb(skb);
> +			goto unlock;
> +		}
> +
> +		__skb_pull(skb, chunk);
> +
> +		if (!skb->len) {
> +			/* skb was entirely consumed and can now be removed from
> +			 * the ring
> +			 */
> +			__ptr_ring_discard_one(&sock->recv_ring);
> +			consume_skb(skb);
> +		}
> +
> +		len -= chunk;
> +		copied += chunk;
> +	}
> +	ret = copied;
> +
> +unlock:
> +	return ret ? : -EAGAIN;
> +}
> +
> +static void ovpn_destroy_skb(void *skb)
> +{
> +	consume_skb(skb);
> +}
> +
> +void ovpn_tcp_socket_detach(struct socket *sock)
> +{
> +	struct ovpn_socket *ovpn_sock;
> +	struct ovpn_peer *peer;
> +
> +	if (!sock)
> +		return;
> +
> +	rcu_read_lock();
> +	ovpn_sock = rcu_dereference_sk_user_data(sock->sk);
> +	rcu_read_unlock();
> +
> +	if (!ovpn_sock->peer)
> +		return;
> +
> +	peer = ovpn_sock->peer;
> +
> +	/* restore CBs that were saved in ovpn_sock_set_tcp_cb() */
> +	write_lock_bh(&sock->sk->sk_callback_lock);
> +	sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
> +	sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
> +	sock->sk->sk_prot = peer->tcp.sk_cb.prot;
> +	rcu_assign_sk_user_data(sock->sk, NULL);
> +	write_unlock_bh(&sock->sk->sk_callback_lock);
> +
> +	/* cancel any ongoing work. Done after removing the CBs so that these
> +	 * workers cannot be re-armed
> +	 */
> +	cancel_work_sync(&peer->tcp.tx_work);
> +
> +	ptr_ring_cleanup(&ovpn_sock->recv_ring, ovpn_destroy_skb);
> +	ptr_ring_cleanup(&peer->tcp.tx_ring, ovpn_destroy_skb);
> +}
> +
> +/* Try to send one skb (or part of it) over the TCP stream.
> + *
> + * Return 0 on success or a negative error code otherwise.
> + *
> + * Note that the skb is modified by putting away the data being sent, therefore
> + * the caller should check if skb->len is zero to understand if the full skb was
> + * sent or not.
> + */
> +static int ovpn_tcp_send_one(struct ovpn_peer *peer, struct sk_buff *skb)
> +{
> +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
> +	struct kvec iv = { 0 };
> +	int ret;
> +
> +	if (skb_linearize(skb) < 0) {
> +		net_err_ratelimited("%s: can't linearize packet\n", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	/* initialize iv structure now as skb_linearize() may have changed
> +	 * skb->data
> +	 */
> +	iv.iov_base = skb->data;
> +	iv.iov_len = skb->len;
> +
> +	ret = kernel_sendmsg(peer->sock->sock, &msg, &iv, 1, iv.iov_len);
> +	if (ret > 0) {
> +		__skb_pull(skb, ret);
> +
> +		/* since we update per-cpu stats in process context,
> +		 * we need to disable softirqs
> +		 */
> +		local_bh_disable();
> +		dev_sw_netstats_tx_add(peer->ovpn->dev, 1, ret);
> +		local_bh_enable();
> +
> +		return 0;
> +	}
> +
> +	return ret;
> +}
> +
> +/* Process packets in TCP TX queue */
> +static void ovpn_tcp_tx_work(struct work_struct *work)
> +{
> +	struct ovpn_peer *peer;
> +	struct sk_buff *skb;
> +	int ret;
> +
> +	peer = container_of(work, struct ovpn_peer, tcp.tx_work);
> +	while ((skb = __ptr_ring_peek(&peer->tcp.tx_ring))) {
> +		ret = ovpn_tcp_send_one(peer, skb);
> +		if (ret < 0 && ret != -EAGAIN) {
> +			net_warn_ratelimited("%s: cannot send TCP packet to peer %u: %d\n",
> +					     __func__, peer->id, ret);
> +			/* in case of TCP error stop sending loop and delete
> +			 * peer
> +			 */
> +			ovpn_peer_del(peer,
> +				      OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
> +			break;
> +		} else if (!skb->len) {
> +			/* skb was entirely consumed and can now be removed from
> +			 * the ring
> +			 */
> +			__ptr_ring_discard_one(&peer->tcp.tx_ring);
> +			consume_skb(skb);
> +		}
> +
> +		/* give a chance to be rescheduled if needed */
> +		cond_resched();
> +	}
> +}
> +
> +/* Put packet into TCP TX queue and schedule a consumer */
> +void ovpn_queue_tcp_skb(struct ovpn_peer *peer, struct sk_buff *skb)
> +{
> +	int ret;
> +
> +	ret = ptr_ring_produce_bh(&peer->tcp.tx_ring, skb);
> +	if (ret < 0) {
> +		kfree_skb_list(skb);
> +		return;
> +	}
> +
> +	queue_work(peer->ovpn->events_wq, &peer->tcp.tx_work);
> +}
> +
> +/* Set TCP encapsulation callbacks */
> +int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
> +{
> +	void *old_data;
> +	int ret;
> +
> +	INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
> +
> +	ret = ptr_ring_init(&peer->tcp.tx_ring, OVPN_QUEUE_LEN, GFP_KERNEL);
> +	if (ret < 0) {
> +		netdev_err(peer->ovpn->dev, "cannot allocate TCP TX ring\n");
> +		return ret;
> +	}
> +
> +	peer->tcp.skb = NULL;
> +	peer->tcp.offset = 0;
> +	peer->tcp.data_len = 0;
> +
> +	write_lock_bh(&sock->sk->sk_callback_lock);
> +
> +	/* make sure no pre-existing encapsulation handler exists */
> +	rcu_read_lock();
> +	old_data = rcu_dereference_sk_user_data(sock->sk);
> +	rcu_read_unlock();
> +	if (old_data) {
> +		netdev_err(peer->ovpn->dev,
> +			   "provided socket already taken by other user\n");
> +		ret = -EBUSY;
> +		goto err;
> +	}
> +
> +	/* sanity check */
> +	if (sock->sk->sk_protocol != IPPROTO_TCP) {
> +		netdev_err(peer->ovpn->dev,
> +			   "provided socket is UDP but expected TCP\n");
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	/* only a fully connected socket are expected. Connection should be
> +	 * handled in userspace
> +	 */
> +	if (sock->sk->sk_state != TCP_ESTABLISHED) {
> +		netdev_err(peer->ovpn->dev,
> +			   "provided TCP socket is not in ESTABLISHED state: %d\n",
> +			   sock->sk->sk_state);
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	/* save current CBs so that they can be restored upon socket release */
> +	peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready;
> +	peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space;
> +	peer->tcp.sk_cb.prot = sock->sk->sk_prot;
> +
> +	/* assign our static CBs */
> +	sock->sk->sk_data_ready = ovpn_tcp_data_ready;
> +	sock->sk->sk_write_space = ovpn_tcp_write_space;
> +	sock->sk->sk_prot = &ovpn_tcp_prot;
> +
> +	write_unlock_bh(&sock->sk->sk_callback_lock);
> +
> +	return 0;
> +err:
> +	write_unlock_bh(&sock->sk->sk_callback_lock);
> +	ptr_ring_cleanup(&peer->tcp.tx_ring, NULL);
> +
> +	return ret;
> +}
> +
> +int __init ovpn_tcp_init(void)
> +{
> +	/* We need to substitute the recvmsg and the sock_is_readable
> +	 * callbacks in the sk_prot member of the sock object for TCP
> +	 * sockets.
> +	 *
> +	 * However sock->sk_prot is a pointer to a static variable and
> +	 * therefore we can't directly modify it, otherwise every socket
> +	 * pointing to it will be affected.
> +	 *
> +	 * For this reason we create our own static copy and modify what
> +	 * we need. Then we make sk_prot point to this copy
> +	 * (in ovpn_tcp_socket_attach())
> +	 */
> +	ovpn_tcp_prot = tcp_prot;
> +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
> +	ovpn_tcp_prot.sock_is_readable = ovpn_tcp_sock_is_readable;
> +
> +	return 0;
> +}
> diff --git a/drivers/net/ovpn/tcp.h b/drivers/net/ovpn/tcp.h
> new file mode 100644
> index 000000000000..7e73f6e76e6c
> --- /dev/null
> +++ b/drivers/net/ovpn/tcp.h
> @@ -0,0 +1,42 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*  OpenVPN data channel offload
> + *
> + *  Copyright (C) 2019-2024 OpenVPN, Inc.
> + *
> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
> + */
> +
> +#ifndef _NET_OVPN_TCP_H_
> +#define _NET_OVPN_TCP_H_
> +
> +#include <linux/net.h>
> +#include <linux/skbuff.h>
> +#include <linux/types.h>
> +#include <linux/workqueue.h>
> +
> +#include "peer.h"
> +
> +/* Initialize TCP static objects */
> +int __init ovpn_tcp_init(void);
> +
> +void ovpn_queue_tcp_skb(struct ovpn_peer *peer, struct sk_buff *skb);
> +
> +int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer);
> +void ovpn_tcp_socket_detach(struct socket *sock);
> +
> +/* Prepare skb and enqueue it for sending to peer.
> + *
> + * Preparation consist in prepending the skb payload with its size.
> + * Required by the OpenVPN protocol in order to extract packets from
> + * the TCP stream on the receiver side.
> + */
> +static inline void ovpn_tcp_send_skb(struct ovpn_peer *peer,
> +				     struct sk_buff *skb)
> +{
> +	u16 len = skb->len;
> +
> +	*(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len);
> +	ovpn_queue_tcp_skb(peer, skb);
> +}
> +
> +#endif /* _NET_OVPN_TCP_H_ */
Sabrina Dubroca May 13, 2024, 2:50 p.m. UTC | #2
2024-05-06, 03:16:26 +0200, Antonio Quartulli wrote:
> @@ -307,6 +308,7 @@ static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb)
>  /* Process packets in TX queue in a transport-specific way.
>   *
>   * UDP transport - encrypt and send across the tunnel.
> + * TCP transport - encrypt and put into TCP TX queue.
>   */
>  void ovpn_encrypt_work(struct work_struct *work)
>  {
> @@ -340,6 +342,9 @@ void ovpn_encrypt_work(struct work_struct *work)
>  					ovpn_udp_send_skb(peer->ovpn, peer,
>  							  curr);
>  					break;
> +				case IPPROTO_TCP:
> +					ovpn_tcp_send_skb(peer, curr);
> +					break;
>  				default:
>  					/* no transport configured yet */
>  					consume_skb(skb);
> diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
> index 9ae9844dd281..a04d6e55a473 100644
> --- a/drivers/net/ovpn/main.c
> +++ b/drivers/net/ovpn/main.c
> @@ -23,6 +23,7 @@
>  #include "io.h"
>  #include "packet.h"
>  #include "peer.h"
> +#include "tcp.h"
>  
>  /* Driver info */
>  #define DRV_DESCRIPTION	"OpenVPN data channel offload (ovpn)"
> @@ -247,8 +248,14 @@ static struct pernet_operations ovpn_pernet_ops = {
>  
>  static int __init ovpn_init(void)
>  {
> -	int err = register_netdevice_notifier(&ovpn_netdev_notifier);
> +	int err = ovpn_tcp_init();
>  
> +	if (err) {

ovpn_tcp_init cannot fail (and if it could, you'd need to clean up
when register_netdevice_notifier fails). I'd make ovpn_tcp_init void
and kill this check.

> +		pr_err("ovpn: cannot initialize TCP component: %d\n", err);
> +		return err;
> +	}
> +
> +	err = register_netdevice_notifier(&ovpn_netdev_notifier);
>  	if (err) {
>  		pr_err("ovpn: can't register netdevice notifier: %d\n", err);
>  		return err;
> diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
> index b5ff59a4b40f..ac4907705d98 100644
> --- a/drivers/net/ovpn/peer.h
> +++ b/drivers/net/ovpn/peer.h
> @@ -33,6 +33,16 @@
>   * @netif_rx_ring: queue of packets to be sent to the netdevice via NAPI
>   * @napi: NAPI object
>   * @sock: the socket being used to talk to this peer
> + * @tcp.tx_ring: queue for packets to be forwarded to userspace (TCP only)
> + * @tcp.tx_work: work for processing outgoing socket data (TCP only)
> + * @tcp.rx_work: wok for processing incoming socket data (TCP only)

Never actually used.
If you keep it: s/wok/work/

> + * @tcp.raw_len: next packet length as read from the stream (TCP only)
> + * @tcp.skb: next packet being filled with data from the stream (TCP only)
> + * @tcp.offset: position of the next byte to write in the skb (TCP only)
> + * @tcp.data_len: next packet length converted to host order (TCP only)

It would be nice to add information about whether they're used for TX or RX.

> + * @tcp.sk_cb.sk_data_ready: pointer to original cb
> + * @tcp.sk_cb.sk_write_space: pointer to original cb
> + * @tcp.sk_cb.prot: pointer to original prot object
>   * @crypto: the crypto configuration (ciphers, keys, etc..)
>   * @dst_cache: cache for dst_entry used to send to peer
>   * @bind: remote peer binding
> @@ -59,6 +69,25 @@ struct ovpn_peer {
>  	struct ptr_ring netif_rx_ring;
>  	struct napi_struct napi;
>  	struct ovpn_socket *sock;
> +	/* state of the TCP reading. Needed to keep track of how much of a
> +	 * single packet has already been read from the stream and how much is
> +	 * missing
> +	 */
> +	struct {
> +		struct ptr_ring tx_ring;
> +		struct work_struct tx_work;
> +		struct work_struct rx_work;
> +
> +		u8 raw_len[sizeof(u16)];

Why not u16 or __be16 for this one?

> +		struct sk_buff *skb;
> +		u16 offset;
> +		u16 data_len;
> +		struct {
> +			void (*sk_data_ready)(struct sock *sk);
> +			void (*sk_write_space)(struct sock *sk);
> +			struct proto *prot;
> +		} sk_cb;
> +	} tcp;
>  	struct ovpn_crypto_state crypto;
>  	struct dst_cache dst_cache;
>  	struct ovpn_bind __rcu *bind;
> diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h
> new file mode 100644
> index 000000000000..ba92811e12ff
> --- /dev/null
> +++ b/drivers/net/ovpn/skb.h
> @@ -0,0 +1,51 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*  OpenVPN data channel offload
> + *
> + *  Copyright (C) 2020-2024 OpenVPN, Inc.
> + *
> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
> + *		James Yonan <james@openvpn.net>
> + */
> +
> +#ifndef _NET_OVPN_SKB_H_
> +#define _NET_OVPN_SKB_H_
> +
> +#include <linux/in.h>
> +#include <linux/in6.h>
> +#include <linux/ip.h>
> +#include <linux/skbuff.h>
> +#include <linux/socket.h>
> +#include <linux/types.h>
> +
> +#define OVPN_SKB_CB(skb) ((struct ovpn_skb_cb *)&((skb)->cb))
> +
> +struct ovpn_skb_cb {
> +	union {
> +		struct in_addr ipv4;
> +		struct in6_addr ipv6;
> +	} local;
> +	sa_family_t sa_fam;
> +};
> +
> +/* Return IP protocol version from skb header.
> + * Return 0 if protocol is not IPv4/IPv6 or cannot be read.
> + */
> +static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb)

A dupe of this function exists in drivers/net/ovpn/io.c. I guess you
can just introduce skb.h from the start (with only
ovpn_ip_check_protocol at first).

> +{
> +	__be16 proto = 0;
> +
> +	/* skb could be non-linear,
> +	 * make sure IP header is in non-fragmented part
> +	 */
> +	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
> +		return 0;
> +
> +	if (ip_hdr(skb)->version == 4)
> +		proto = htons(ETH_P_IP);
> +	else if (ip_hdr(skb)->version == 6)
> +		proto = htons(ETH_P_IPV6);
> +
> +	return proto;
> +}
> +
> +#endif /* _NET_OVPN_SKB_H_ */
> diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
> index e099a61b03fa..004db5b13663 100644
> --- a/drivers/net/ovpn/socket.c
> +++ b/drivers/net/ovpn/socket.c
> @@ -16,6 +16,7 @@
>  #include "packet.h"
>  #include "peer.h"
>  #include "socket.h"
> +#include "tcp.h"
>  #include "udp.h"
>  
>  /* Finalize release of socket, called after RCU grace period */
> @@ -26,6 +27,8 @@ static void ovpn_socket_detach(struct socket *sock)
>  
>  	if (sock->sk->sk_protocol == IPPROTO_UDP)
>  		ovpn_udp_socket_detach(sock);
> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
> +		ovpn_tcp_socket_detach(sock);
>  
>  	sockfd_put(sock);
>  }
> @@ -69,6 +72,8 @@ static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer)
>  
>  	if (sock->sk->sk_protocol == IPPROTO_UDP)
>  		ret = ovpn_udp_socket_attach(sock, peer->ovpn);
> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
> +		ret = ovpn_tcp_socket_attach(sock, peer);
>  
>  	return ret;
>  }
> @@ -124,6 +129,21 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
>  	ovpn_sock->sock = sock;

The line above this is:

    ovpn_sock->ovpn = peer->ovpn;

It's technically fine since you then overwrite this with peer in case
we're on TCP, but ovpn_sock->ovpn only exists on UDP since you moved
it into a union in this patch.

>  	kref_init(&ovpn_sock->refcount);
>  
> +	/* TCP sockets are per-peer, therefore they are linked to their unique
> +	 * peer
> +	 */
> +	if (sock->sk->sk_protocol == IPPROTO_TCP) {
> +		ovpn_sock->peer = peer;
> +		ret = ptr_ring_init(&ovpn_sock->recv_ring, OVPN_QUEUE_LEN,
> +				    GFP_KERNEL);
> +		if (ret < 0) {
> +			netdev_err(peer->ovpn->dev, "%s: cannot allocate TCP recv ring\n",
> +				   __func__);

Should you also call ovpn_socket_detach here? (as well when the
kzalloc for ovpn_sock fails a bit earlier)

> +			kfree(ovpn_sock);
> +			return ERR_PTR(ret);
> +		}
> +	}
> +
>  	rcu_assign_sk_user_data(sock->sk, ovpn_sock);
>  
>  	return ovpn_sock;
> diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h
> index 0d23de5a9344..88c6271ba5c7 100644
> --- a/drivers/net/ovpn/socket.h
> +++ b/drivers/net/ovpn/socket.h
> @@ -21,12 +21,25 @@ struct ovpn_peer;
>  /**
>   * struct ovpn_socket - a kernel socket referenced in the ovpn code
>   * @ovpn: ovpn instance owning this socket (UDP only)
> + * @peer: unique peer transmitting over this socket (TCP only)
> + * @recv_ring: queue where non-data packets directed to userspace are stored
>   * @sock: the low level sock object
>   * @refcount: amount of contexts currently referencing this object
>   * @rcu: member used to schedule RCU destructor callback
>   */
>  struct ovpn_socket {
> -	struct ovpn_struct *ovpn;
> +	union {
> +		/* the VPN session object owning this socket (UDP only) */

nit: Probably not needed

> +		struct ovpn_struct *ovpn;
> +
> +		/* TCP only */
> +		struct {
> +			/** @peer: unique peer transmitting over this socket */

Is kdoc upset about peer but not recv_ring?

> +			struct ovpn_peer *peer;
> +			struct ptr_ring recv_ring;
> +		};
> +	};
> +
>  	struct socket *sock;
>  	struct kref refcount;
>  	struct rcu_head rcu;
> diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
> new file mode 100644
> index 000000000000..84ad7cd4fc4f
> --- /dev/null
> +++ b/drivers/net/ovpn/tcp.c
> @@ -0,0 +1,511 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*  OpenVPN data channel offload
> + *
> + *  Copyright (C) 2019-2024 OpenVPN, Inc.
> + *
> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
> + */
> +
> +#include <linux/ptr_ring.h>
> +#include <linux/skbuff.h>
> +#include <net/tcp.h>
> +#include <net/route.h>
> +
> +#include "ovpnstruct.h"
> +#include "main.h"
> +#include "io.h"
> +#include "packet.h"
> +#include "peer.h"
> +#include "proto.h"
> +#include "skb.h"
> +#include "socket.h"
> +#include "tcp.h"
> +
> +static struct proto ovpn_tcp_prot;
> +
> +static int ovpn_tcp_read_sock(read_descriptor_t *desc, struct sk_buff *in_skb,
> +			      unsigned int in_offset, size_t in_len)
> +{
> +	struct sock *sk = desc->arg.data;
> +	struct ovpn_socket *sock;
> +	struct ovpn_skb_cb *cb;
> +	struct ovpn_peer *peer;
> +	size_t chunk, copied = 0;
> +	void *data;
> +	u16 len;
> +	int st;
> +
> +	rcu_read_lock();
> +	sock = rcu_dereference_sk_user_data(sk);
> +	rcu_read_unlock();

You can't just release rcu_read_lock and keep using sock (here and in
the rest of this file). Either you keep rcu_read_lock, or you can take
a reference on the ovpn_socket.


Anyway, this looks like you're reinventing strparser. Overall this is
very similar to net/xfrm/espintcp.c, but the receive side of espintcp
uses strp and is much shorter (recv_ring looks equivalent to
ike_queue, both sending a few messages to userspace -- look for
strp_init, espintcp_rcv, espintcp_parse in that file).

> +/* Set TCP encapsulation callbacks */
> +int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
> +{
> +	void *old_data;
> +	int ret;
> +
> +	INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
> +
> +	ret = ptr_ring_init(&peer->tcp.tx_ring, OVPN_QUEUE_LEN, GFP_KERNEL);
> +	if (ret < 0) {
> +		netdev_err(peer->ovpn->dev, "cannot allocate TCP TX ring\n");
> +		return ret;
> +	}
> +
> +	peer->tcp.skb = NULL;
> +	peer->tcp.offset = 0;
> +	peer->tcp.data_len = 0;
> +
> +	write_lock_bh(&sock->sk->sk_callback_lock);
> +
> +	/* make sure no pre-existing encapsulation handler exists */
> +	rcu_read_lock();
> +	old_data = rcu_dereference_sk_user_data(sock->sk);
> +	rcu_read_unlock();
> +	if (old_data) {
> +		netdev_err(peer->ovpn->dev,
> +			   "provided socket already taken by other user\n");
> +		ret = -EBUSY;
> +		goto err;

The UDP code differentiates "socket already owned by this interface"
from "already taken by other user". That doesn't apply to TCP?



> +int __init ovpn_tcp_init(void)
> +{
> +	/* We need to substitute the recvmsg and the sock_is_readable
> +	 * callbacks in the sk_prot member of the sock object for TCP
> +	 * sockets.
> +	 *
> +	 * However sock->sk_prot is a pointer to a static variable and
> +	 * therefore we can't directly modify it, otherwise every socket
> +	 * pointing to it will be affected.
> +	 *
> +	 * For this reason we create our own static copy and modify what
> +	 * we need. Then we make sk_prot point to this copy
> +	 * (in ovpn_tcp_socket_attach())
> +	 */
> +	ovpn_tcp_prot = tcp_prot;

Don't you need a separate variant for IPv6, like TLS does?

> +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;

You don't need to replace ->sendmsg as well? The userspace client is
not expected to send messages?
Jakub Kicinski May 13, 2024, 3:34 p.m. UTC | #3
On Mon, 13 May 2024 15:37:54 +0200 Antonio Quartulli wrote:
> >    * @netif_rx_ring: queue of packets to be sent to the netdevice via NAPI
> >    * @napi: NAPI object
> >    * @sock: the socket being used to talk to this peer
> > + * @tcp.tx_ring: queue for packets to be forwarded to userspace (TCP only)
> > + * @tcp.tx_work: work for processing outgoing socket data (TCP only)
> > + * @tcp.rx_work: wok for processing incoming socket data (TCP only)
> > + * @tcp.raw_len: next packet length as read from the stream (TCP only)  
> 
> can you please help me with the following warning from kerneldoc?
> As you can see below, raw_len is an array.
> 
> May that be the reason why the script isn't picking it up correctly?
> 
> drivers/net/ovpn/peer.h:101: warning: Function parameter or struct 
> member 'raw_len' not described in 'ovpn_peer'
> drivers/net/ovpn/peer.h:101: warning: Excess struct member 'tcp.raw_len' 
> description in 'ovpn_peer'
> 
> (line number may differ as I am in the middle of a rebase)

Hm, the script itself is a fairly simple file of perl regexps
You can try to tweak it and send a fix to the list.
I presume using sizeof() to declare an array is fairly uncommon.
Or forgo the sizeof() and use literal 2? :)
Antonio Quartulli May 13, 2024, 10:20 p.m. UTC | #4
On 13/05/2024 16:50, Sabrina Dubroca wrote:
> 2024-05-06, 03:16:26 +0200, Antonio Quartulli wrote:
>> @@ -307,6 +308,7 @@ static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb)
>>   /* Process packets in TX queue in a transport-specific way.
>>    *
>>    * UDP transport - encrypt and send across the tunnel.
>> + * TCP transport - encrypt and put into TCP TX queue.
>>    */
>>   void ovpn_encrypt_work(struct work_struct *work)
>>   {
>> @@ -340,6 +342,9 @@ void ovpn_encrypt_work(struct work_struct *work)
>>   					ovpn_udp_send_skb(peer->ovpn, peer,
>>   							  curr);
>>   					break;
>> +				case IPPROTO_TCP:
>> +					ovpn_tcp_send_skb(peer, curr);
>> +					break;
>>   				default:
>>   					/* no transport configured yet */
>>   					consume_skb(skb);
>> diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
>> index 9ae9844dd281..a04d6e55a473 100644
>> --- a/drivers/net/ovpn/main.c
>> +++ b/drivers/net/ovpn/main.c
>> @@ -23,6 +23,7 @@
>>   #include "io.h"
>>   #include "packet.h"
>>   #include "peer.h"
>> +#include "tcp.h"
>>   
>>   /* Driver info */
>>   #define DRV_DESCRIPTION	"OpenVPN data channel offload (ovpn)"
>> @@ -247,8 +248,14 @@ static struct pernet_operations ovpn_pernet_ops = {
>>   
>>   static int __init ovpn_init(void)
>>   {
>> -	int err = register_netdevice_notifier(&ovpn_netdev_notifier);
>> +	int err = ovpn_tcp_init();
>>   
>> +	if (err) {
> 
> ovpn_tcp_init cannot fail (and if it could, you'd need to clean up
> when register_netdevice_notifier fails). I'd make ovpn_tcp_init void
> and kill this check.

I like to have all init functions returning int by design, even though 
they may not fail.

But I can undersand this is not necessarily good practice (somebody will 
always ask "when does it fail?" and there will will be no answer, which 
is confusing)

> 
>> +		pr_err("ovpn: cannot initialize TCP component: %d\n", err);
>> +		return err;
>> +	}
>> +
>> +	err = register_netdevice_notifier(&ovpn_netdev_notifier);
>>   	if (err) {
>>   		pr_err("ovpn: can't register netdevice notifier: %d\n", err);
>>   		return err;
>> diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
>> index b5ff59a4b40f..ac4907705d98 100644
>> --- a/drivers/net/ovpn/peer.h
>> +++ b/drivers/net/ovpn/peer.h
>> @@ -33,6 +33,16 @@
>>    * @netif_rx_ring: queue of packets to be sent to the netdevice via NAPI
>>    * @napi: NAPI object
>>    * @sock: the socket being used to talk to this peer
>> + * @tcp.tx_ring: queue for packets to be forwarded to userspace (TCP only)
>> + * @tcp.tx_work: work for processing outgoing socket data (TCP only)
>> + * @tcp.rx_work: wok for processing incoming socket data (TCP only)
> 
> Never actually used.
> If you keep it: s/wok/work/

Indeed, I think this is another leftover.

> 
>> + * @tcp.raw_len: next packet length as read from the stream (TCP only)
>> + * @tcp.skb: next packet being filled with data from the stream (TCP only)
>> + * @tcp.offset: position of the next byte to write in the skb (TCP only)
>> + * @tcp.data_len: next packet length converted to host order (TCP only)
> 
> It would be nice to add information about whether they're used for TX or RX.

they are all about "from the stream" and "to the skb", meaning that we 
are doing RX.
Will make it more explicit.

> 
>> + * @tcp.sk_cb.sk_data_ready: pointer to original cb
>> + * @tcp.sk_cb.sk_write_space: pointer to original cb
>> + * @tcp.sk_cb.prot: pointer to original prot object
>>    * @crypto: the crypto configuration (ciphers, keys, etc..)
>>    * @dst_cache: cache for dst_entry used to send to peer
>>    * @bind: remote peer binding
>> @@ -59,6 +69,25 @@ struct ovpn_peer {
>>   	struct ptr_ring netif_rx_ring;
>>   	struct napi_struct napi;
>>   	struct ovpn_socket *sock;
>> +	/* state of the TCP reading. Needed to keep track of how much of a
>> +	 * single packet has already been read from the stream and how much is
>> +	 * missing
>> +	 */
>> +	struct {
>> +		struct ptr_ring tx_ring;
>> +		struct work_struct tx_work;
>> +		struct work_struct rx_work;
>> +
>> +		u8 raw_len[sizeof(u16)];
> 
> Why not u16 or __be16 for this one?

because in this array we are putting the bytes as we get them from the 
stream.
We may be at the point where one out of two bytes is available on the 
stream. For this reason I use an array to store this u16 byte by byte.

Once thw two bytes are ready, we convert the content in an actual int 
and store it in "data_len" (a few lines below).

> 
>> +		struct sk_buff *skb;
>> +		u16 offset;
>> +		u16 data_len;
>> +		struct {
>> +			void (*sk_data_ready)(struct sock *sk);
>> +			void (*sk_write_space)(struct sock *sk);
>> +			struct proto *prot;
>> +		} sk_cb;
>> +	} tcp;
>>   	struct ovpn_crypto_state crypto;
>>   	struct dst_cache dst_cache;
>>   	struct ovpn_bind __rcu *bind;
>> diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h
>> new file mode 100644
>> index 000000000000..ba92811e12ff
>> --- /dev/null
>> +++ b/drivers/net/ovpn/skb.h
>> @@ -0,0 +1,51 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*  OpenVPN data channel offload
>> + *
>> + *  Copyright (C) 2020-2024 OpenVPN, Inc.
>> + *
>> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
>> + *		James Yonan <james@openvpn.net>
>> + */
>> +
>> +#ifndef _NET_OVPN_SKB_H_
>> +#define _NET_OVPN_SKB_H_
>> +
>> +#include <linux/in.h>
>> +#include <linux/in6.h>
>> +#include <linux/ip.h>
>> +#include <linux/skbuff.h>
>> +#include <linux/socket.h>
>> +#include <linux/types.h>
>> +
>> +#define OVPN_SKB_CB(skb) ((struct ovpn_skb_cb *)&((skb)->cb))
>> +
>> +struct ovpn_skb_cb {
>> +	union {
>> +		struct in_addr ipv4;
>> +		struct in6_addr ipv6;
>> +	} local;
>> +	sa_family_t sa_fam;
>> +};
>> +
>> +/* Return IP protocol version from skb header.
>> + * Return 0 if protocol is not IPv4/IPv6 or cannot be read.
>> + */
>> +static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb)
> 
> A dupe of this function exists in drivers/net/ovpn/io.c. I guess you
> can just introduce skb.h from the start (with only
> ovpn_ip_check_protocol at first).

thanks. I think that was the idea, but something went horribly wrong.

> 
>> +{
>> +	__be16 proto = 0;
>> +
>> +	/* skb could be non-linear,
>> +	 * make sure IP header is in non-fragmented part
>> +	 */
>> +	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
>> +		return 0;
>> +
>> +	if (ip_hdr(skb)->version == 4)
>> +		proto = htons(ETH_P_IP);
>> +	else if (ip_hdr(skb)->version == 6)
>> +		proto = htons(ETH_P_IPV6);
>> +
>> +	return proto;
>> +}
>> +
>> +#endif /* _NET_OVPN_SKB_H_ */
>> diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
>> index e099a61b03fa..004db5b13663 100644
>> --- a/drivers/net/ovpn/socket.c
>> +++ b/drivers/net/ovpn/socket.c
>> @@ -16,6 +16,7 @@
>>   #include "packet.h"
>>   #include "peer.h"
>>   #include "socket.h"
>> +#include "tcp.h"
>>   #include "udp.h"
>>   
>>   /* Finalize release of socket, called after RCU grace period */
>> @@ -26,6 +27,8 @@ static void ovpn_socket_detach(struct socket *sock)
>>   
>>   	if (sock->sk->sk_protocol == IPPROTO_UDP)
>>   		ovpn_udp_socket_detach(sock);
>> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
>> +		ovpn_tcp_socket_detach(sock);
>>   
>>   	sockfd_put(sock);
>>   }
>> @@ -69,6 +72,8 @@ static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer)
>>   
>>   	if (sock->sk->sk_protocol == IPPROTO_UDP)
>>   		ret = ovpn_udp_socket_attach(sock, peer->ovpn);
>> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
>> +		ret = ovpn_tcp_socket_attach(sock, peer);
>>   
>>   	return ret;
>>   }
>> @@ -124,6 +129,21 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
>>   	ovpn_sock->sock = sock;
> 
> The line above this is:
> 
>      ovpn_sock->ovpn = peer->ovpn;
> 
> It's technically fine since you then overwrite this with peer in case
> we're on TCP, but ovpn_sock->ovpn only exists on UDP since you moved
> it into a union in this patch.

Yeah, I did not want to make another branch, but having a UDP specific 
case will make code easier to read.

> 
>>   	kref_init(&ovpn_sock->refcount);
>>   
>> +	/* TCP sockets are per-peer, therefore they are linked to their unique
>> +	 * peer
>> +	 */
>> +	if (sock->sk->sk_protocol == IPPROTO_TCP) {
>> +		ovpn_sock->peer = peer;
>> +		ret = ptr_ring_init(&ovpn_sock->recv_ring, OVPN_QUEUE_LEN,
>> +				    GFP_KERNEL);
>> +		if (ret < 0) {
>> +			netdev_err(peer->ovpn->dev, "%s: cannot allocate TCP recv ring\n",
>> +				   __func__);
> 
> Should you also call ovpn_socket_detach here? (as well when the
> kzalloc for ovpn_sock fails a bit earlier)

mh, the attach is performed as first thing when we enter this function 
therefore you are right. we must undo the attach in case of failure.

> 
>> +			kfree(ovpn_sock);
>> +			return ERR_PTR(ret);
>> +		}
>> +	}
>> +
>>   	rcu_assign_sk_user_data(sock->sk, ovpn_sock);
>>   
>>   	return ovpn_sock;
>> diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h
>> index 0d23de5a9344..88c6271ba5c7 100644
>> --- a/drivers/net/ovpn/socket.h
>> +++ b/drivers/net/ovpn/socket.h
>> @@ -21,12 +21,25 @@ struct ovpn_peer;
>>   /**
>>    * struct ovpn_socket - a kernel socket referenced in the ovpn code
>>    * @ovpn: ovpn instance owning this socket (UDP only)
>> + * @peer: unique peer transmitting over this socket (TCP only)
>> + * @recv_ring: queue where non-data packets directed to userspace are stored
>>    * @sock: the low level sock object
>>    * @refcount: amount of contexts currently referencing this object
>>    * @rcu: member used to schedule RCU destructor callback
>>    */
>>   struct ovpn_socket {
>> -	struct ovpn_struct *ovpn;
>> +	union {
>> +		/* the VPN session object owning this socket (UDP only) */
> 
> nit: Probably not needed
> 
>> +		struct ovpn_struct *ovpn;
>> +
>> +		/* TCP only */
>> +		struct {
>> +			/** @peer: unique peer transmitting over this socket */
> 
> Is kdoc upset about peer but not recv_ring?

leftovers from before having the kdoc. I am removing them.

> 
>> +			struct ovpn_peer *peer;
>> +			struct ptr_ring recv_ring;
>> +		};
>> +	};
>> +
>>   	struct socket *sock;
>>   	struct kref refcount;
>>   	struct rcu_head rcu;
>> diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
>> new file mode 100644
>> index 000000000000..84ad7cd4fc4f
>> --- /dev/null
>> +++ b/drivers/net/ovpn/tcp.c
>> @@ -0,0 +1,511 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*  OpenVPN data channel offload
>> + *
>> + *  Copyright (C) 2019-2024 OpenVPN, Inc.
>> + *
>> + *  Author:	Antonio Quartulli <antonio@openvpn.net>
>> + */
>> +
>> +#include <linux/ptr_ring.h>
>> +#include <linux/skbuff.h>
>> +#include <net/tcp.h>
>> +#include <net/route.h>
>> +
>> +#include "ovpnstruct.h"
>> +#include "main.h"
>> +#include "io.h"
>> +#include "packet.h"
>> +#include "peer.h"
>> +#include "proto.h"
>> +#include "skb.h"
>> +#include "socket.h"
>> +#include "tcp.h"
>> +
>> +static struct proto ovpn_tcp_prot;
>> +
>> +static int ovpn_tcp_read_sock(read_descriptor_t *desc, struct sk_buff *in_skb,
>> +			      unsigned int in_offset, size_t in_len)
>> +{
>> +	struct sock *sk = desc->arg.data;
>> +	struct ovpn_socket *sock;
>> +	struct ovpn_skb_cb *cb;
>> +	struct ovpn_peer *peer;
>> +	size_t chunk, copied = 0;
>> +	void *data;
>> +	u16 len;
>> +	int st;
>> +
>> +	rcu_read_lock();
>> +	sock = rcu_dereference_sk_user_data(sk);
>> +	rcu_read_unlock();
> 
> You can't just release rcu_read_lock and keep using sock (here and in
> the rest of this file). Either you keep rcu_read_lock, or you can take
> a reference on the ovpn_socket.

I was just staring at this today, after having worked on the 
rcu_read_lock/unlock for the peer get()s..

I thinkt the assumption was: if we are in this read_sock callback, it's 
impossible that the ovpn_socket was invalidated, because it gets 
invalidated upon detach, which also prevents any further calling of this 
callback. But this sounds racy, and I guess we should somewhat hold a 
reference..

> 
> 
> Anyway, this looks like you're reinventing strparser. Overall this is
> very similar to net/xfrm/espintcp.c, but the receive side of espintcp
> uses strp and is much shorter (recv_ring looks equivalent to
> ike_queue, both sending a few messages to userspace -- look for
> strp_init, espintcp_rcv, espintcp_parse in that file).

I think I did have a look at strparser once, but I wasn't sure to be 
grasping all details.

Will have another look and see what I can re-use.

> 
>> +/* Set TCP encapsulation callbacks */
>> +int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
>> +{
>> +	void *old_data;
>> +	int ret;
>> +
>> +	INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
>> +
>> +	ret = ptr_ring_init(&peer->tcp.tx_ring, OVPN_QUEUE_LEN, GFP_KERNEL);
>> +	if (ret < 0) {
>> +		netdev_err(peer->ovpn->dev, "cannot allocate TCP TX ring\n");
>> +		return ret;
>> +	}
>> +
>> +	peer->tcp.skb = NULL;
>> +	peer->tcp.offset = 0;
>> +	peer->tcp.data_len = 0;
>> +
>> +	write_lock_bh(&sock->sk->sk_callback_lock);
>> +
>> +	/* make sure no pre-existing encapsulation handler exists */
>> +	rcu_read_lock();
>> +	old_data = rcu_dereference_sk_user_data(sock->sk);
>> +	rcu_read_unlock();
>> +	if (old_data) {
>> +		netdev_err(peer->ovpn->dev,
>> +			   "provided socket already taken by other user\n");
>> +		ret = -EBUSY;
>> +		goto err;
> 
> The UDP code differentiates "socket already owned by this interface"
> from "already taken by other user". That doesn't apply to TCP?

This makes me wonder: how safe it is to interpret the user data as an 
object of type ovpn_socket?

When we find the user data already assigned, we don't know what was 
really stored in there, right?
Technically this socket could have gone through another module which 
assigned its own state.

Therefore I think that what UDP does [ dereferencing ((struct 
ovpn_socket *)user_data)->ovpn ] is probably not safe. Would you agree?

> 
> 
> 
>> +int __init ovpn_tcp_init(void)
>> +{
>> +	/* We need to substitute the recvmsg and the sock_is_readable
>> +	 * callbacks in the sk_prot member of the sock object for TCP
>> +	 * sockets.
>> +	 *
>> +	 * However sock->sk_prot is a pointer to a static variable and
>> +	 * therefore we can't directly modify it, otherwise every socket
>> +	 * pointing to it will be affected.
>> +	 *
>> +	 * For this reason we create our own static copy and modify what
>> +	 * we need. Then we make sk_prot point to this copy
>> +	 * (in ovpn_tcp_socket_attach())
>> +	 */
>> +	ovpn_tcp_prot = tcp_prot;
> 
> Don't you need a separate variant for IPv6, like TLS does?

Never did so far.

My wild wild wild guess: for the time this socket is owned by ovpn, we 
only use callbacks that are IPvX agnostic, hence v4 vs v6 doesn't make 
any difference.
When this socket is released, we reassigned the original prot.

> 
>> +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
> 
> You don't need to replace ->sendmsg as well? The userspace client is
> not expected to send messages?

It is, but my assumption is that those packets will just go through the 
socket as usual. No need to be handled by ovpn (those packets are not 
encrypted/decrypted, like data traffic is).
And this is how it has worked so far.

Makes sense?

Thanks a lot!
Sabrina Dubroca May 14, 2024, 8:58 a.m. UTC | #5
2024-05-14, 00:20:24 +0200, Antonio Quartulli wrote:
> On 13/05/2024 16:50, Sabrina Dubroca wrote:
> > 2024-05-06, 03:16:26 +0200, Antonio Quartulli wrote:
> > > diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
> > > index 9ae9844dd281..a04d6e55a473 100644
> > > --- a/drivers/net/ovpn/main.c
> > > +++ b/drivers/net/ovpn/main.c
> > > @@ -23,6 +23,7 @@
> > >   #include "io.h"
> > >   #include "packet.h"
> > >   #include "peer.h"
> > > +#include "tcp.h"
> > >   /* Driver info */
> > >   #define DRV_DESCRIPTION	"OpenVPN data channel offload (ovpn)"
> > > @@ -247,8 +248,14 @@ static struct pernet_operations ovpn_pernet_ops = {
> > >   static int __init ovpn_init(void)
> > >   {
> > > -	int err = register_netdevice_notifier(&ovpn_netdev_notifier);
> > > +	int err = ovpn_tcp_init();
> > > +	if (err) {
> > 
> > ovpn_tcp_init cannot fail (and if it could, you'd need to clean up
> > when register_netdevice_notifier fails). I'd make ovpn_tcp_init void
> > and kill this check.
> 
> I like to have all init functions returning int by design, even though they
> may not fail.
> 
> But I can undersand this is not necessarily good practice (somebody will
> always ask "when does it fail?" and there will will be no answer, which is
> confusing)

Yes, pretty much.


> > > diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
> > > index b5ff59a4b40f..ac4907705d98 100644
> > > --- a/drivers/net/ovpn/peer.h
> > > +++ b/drivers/net/ovpn/peer.h
> > > + * @tcp.raw_len: next packet length as read from the stream (TCP only)
> > > + * @tcp.skb: next packet being filled with data from the stream (TCP only)
> > > + * @tcp.offset: position of the next byte to write in the skb (TCP only)
> > > + * @tcp.data_len: next packet length converted to host order (TCP only)
> > 
> > It would be nice to add information about whether they're used for TX or RX.
> 
> they are all about "from the stream" and "to the skb", meaning that we are
> doing RX.
> Will make it more explicit.

Maybe group them in a struct rx?

> > > + * @tcp.sk_cb.sk_data_ready: pointer to original cb
> > > + * @tcp.sk_cb.sk_write_space: pointer to original cb
> > > + * @tcp.sk_cb.prot: pointer to original prot object
> > >    * @crypto: the crypto configuration (ciphers, keys, etc..)
> > >    * @dst_cache: cache for dst_entry used to send to peer
> > >    * @bind: remote peer binding
> > > @@ -59,6 +69,25 @@ struct ovpn_peer {
> > >   	struct ptr_ring netif_rx_ring;
> > >   	struct napi_struct napi;
> > >   	struct ovpn_socket *sock;
> > > +	/* state of the TCP reading. Needed to keep track of how much of a
> > > +	 * single packet has already been read from the stream and how much is
> > > +	 * missing
> > > +	 */
> > > +	struct {
> > > +		struct ptr_ring tx_ring;
> > > +		struct work_struct tx_work;
> > > +		struct work_struct rx_work;
> > > +
> > > +		u8 raw_len[sizeof(u16)];
> > 
> > Why not u16 or __be16 for this one?
> 
> because in this array we are putting the bytes as we get them from the
> stream.
> We may be at the point where one out of two bytes is available on the
> stream. For this reason I use an array to store this u16 byte by byte.
> 
> Once thw two bytes are ready, we convert the content in an actual int and
> store it in "data_len" (a few lines below).

Ok, I see. Hopefully you can switch to strparser and make this one go
away.


> > > diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
> > > index e099a61b03fa..004db5b13663 100644
> > > --- a/drivers/net/ovpn/socket.c
> > > +++ b/drivers/net/ovpn/socket.c
> > > @@ -16,6 +16,7 @@
> > >   #include "packet.h"
> > >   #include "peer.h"
> > >   #include "socket.h"
> > > +#include "tcp.h"
> > >   #include "udp.h"
> > >   /* Finalize release of socket, called after RCU grace period */
> > > @@ -26,6 +27,8 @@ static void ovpn_socket_detach(struct socket *sock)
> > >   	if (sock->sk->sk_protocol == IPPROTO_UDP)
> > >   		ovpn_udp_socket_detach(sock);
> > > +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
> > > +		ovpn_tcp_socket_detach(sock);
> > >   	sockfd_put(sock);
> > >   }
> > > @@ -69,6 +72,8 @@ static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer)
> > >   	if (sock->sk->sk_protocol == IPPROTO_UDP)
> > >   		ret = ovpn_udp_socket_attach(sock, peer->ovpn);
> > > +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
> > > +		ret = ovpn_tcp_socket_attach(sock, peer);
> > >   	return ret;
> > >   }
> > > @@ -124,6 +129,21 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
> > >   	ovpn_sock->sock = sock;
> > 
> > The line above this is:
> > 
> >      ovpn_sock->ovpn = peer->ovpn;
> > 
> > It's technically fine since you then overwrite this with peer in case
> > we're on TCP, but ovpn_sock->ovpn only exists on UDP since you moved
> > it into a union in this patch.
> 
> Yeah, I did not want to make another branch, but having a UDP specific case
> will make code easier to read.

Either that, or drop the union.


> > > diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
> > > new file mode 100644
> > > index 000000000000..84ad7cd4fc4f
> > > --- /dev/null
> > > +++ b/drivers/net/ovpn/tcp.c
> > > @@ -0,0 +1,511 @@
> > > +static int ovpn_tcp_read_sock(read_descriptor_t *desc, struct sk_buff *in_skb,
> > > +			      unsigned int in_offset, size_t in_len)
> > > +{
> > > +	struct sock *sk = desc->arg.data;
> > > +	struct ovpn_socket *sock;
> > > +	struct ovpn_skb_cb *cb;
> > > +	struct ovpn_peer *peer;
> > > +	size_t chunk, copied = 0;
> > > +	void *data;
> > > +	u16 len;
> > > +	int st;
> > > +
> > > +	rcu_read_lock();
> > > +	sock = rcu_dereference_sk_user_data(sk);
> > > +	rcu_read_unlock();
> > 
> > You can't just release rcu_read_lock and keep using sock (here and in
> > the rest of this file). Either you keep rcu_read_lock, or you can take
> > a reference on the ovpn_socket.
> 
> I was just staring at this today, after having worked on the
> rcu_read_lock/unlock for the peer get()s..
> 
> I thinkt the assumption was: if we are in this read_sock callback, it's
> impossible that the ovpn_socket was invalidated, because it gets invalidated
> upon detach, which also prevents any further calling of this callback. But
> this sounds racy, and I guess we should somewhat hold a reference..

ovpn_tcp_read_sock starts

detach
kfree_rcu(ovpn_socket)
...
ovpn_socket actually freed
...
ovpn_tcp_read_sock continues with freed ovpn_socket


I don't think anything in the current code prevents this.


> > > +/* Set TCP encapsulation callbacks */
> > > +int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
> > > +{
> > > +	void *old_data;
> > > +	int ret;
> > > +
> > > +	INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
> > > +
> > > +	ret = ptr_ring_init(&peer->tcp.tx_ring, OVPN_QUEUE_LEN, GFP_KERNEL);
> > > +	if (ret < 0) {
> > > +		netdev_err(peer->ovpn->dev, "cannot allocate TCP TX ring\n");
> > > +		return ret;
> > > +	}
> > > +
> > > +	peer->tcp.skb = NULL;
> > > +	peer->tcp.offset = 0;
> > > +	peer->tcp.data_len = 0;
> > > +
> > > +	write_lock_bh(&sock->sk->sk_callback_lock);
> > > +
> > > +	/* make sure no pre-existing encapsulation handler exists */
> > > +	rcu_read_lock();
> > > +	old_data = rcu_dereference_sk_user_data(sock->sk);
> > > +	rcu_read_unlock();
> > > +	if (old_data) {
> > > +		netdev_err(peer->ovpn->dev,
> > > +			   "provided socket already taken by other user\n");
> > > +		ret = -EBUSY;
> > > +		goto err;
> > 
> > The UDP code differentiates "socket already owned by this interface"
> > from "already taken by other user". That doesn't apply to TCP?
> 
> This makes me wonder: how safe it is to interpret the user data as an object
> of type ovpn_socket?
>
> When we find the user data already assigned, we don't know what was really
> stored in there, right?
> Technically this socket could have gone through another module which
> assigned its own state.
> 
> Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
> *)user_data)->ovpn ] is probably not safe. Would you agree?

Hmmm, yeah, I think you're right. If you checked encap_type ==
UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
really your data. Basically call ovpn_from_udp_sock during attach if
you want to check something beyond EBUSY.

Once you're in your own callbacks, it should be safe. If some other
code sends packet with a non-ovpn socket to ovpn's ->encap_rcv,
something is really broken.

> > > +int __init ovpn_tcp_init(void)
> > > +{
> > > +	/* We need to substitute the recvmsg and the sock_is_readable
> > > +	 * callbacks in the sk_prot member of the sock object for TCP
> > > +	 * sockets.
> > > +	 *
> > > +	 * However sock->sk_prot is a pointer to a static variable and
> > > +	 * therefore we can't directly modify it, otherwise every socket
> > > +	 * pointing to it will be affected.
> > > +	 *
> > > +	 * For this reason we create our own static copy and modify what
> > > +	 * we need. Then we make sk_prot point to this copy
> > > +	 * (in ovpn_tcp_socket_attach())
> > > +	 */
> > > +	ovpn_tcp_prot = tcp_prot;
> > 
> > Don't you need a separate variant for IPv6, like TLS does?
> 
> Never did so far.
> 
> My wild wild wild guess: for the time this socket is owned by ovpn, we only
> use callbacks that are IPvX agnostic, hence v4 vs v6 doesn't make any
> difference.
> When this socket is released, we reassigned the original prot.

That seems a bit suspicious to me. For example, tcpv6_prot has a
different backlog_rcv. And you don't control if the socket is detached
before being closed, or which callbacks are needed. Your userspace
client doesn't use them, but someone else's might.

> > > +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
> > 
> > You don't need to replace ->sendmsg as well? The userspace client is
> > not expected to send messages?
> 
> It is, but my assumption is that those packets will just go through the
> socket as usual. No need to be handled by ovpn (those packets are not
> encrypted/decrypted, like data traffic is).
> And this is how it has worked so far.
> 
> Makes sense?

Two things come to mind:

- userspace is expected to prefix the messages it inserts on the
  stream with the 2-byte length field? otherwise, the peer won't be
  able to parse them out of the stream

- I'm not convinced this would be safe wrt kernel writing partial
  messages. if ovpn_tcp_send_one doesn't send the full message, you
  could interleave two messages:

  +------+-------------------+------+--------+----------------+
  | len1 | (bytes from msg1) | len2 | (msg2) | (rest of msg1) |
  +------+-------------------+------+--------+----------------+

  and the RX side would parse that as:

  +------+-----------------------------------+------+---------
  | len1 | (bytes from msg1) | len2 | (msg2) | ???? | ...     
  +------+-------------------+---------------+------+---------

  and try to interpret some random bytes out of either msg1 or msg2 as
  a length prefix, resulting in a broken stream.


The stream format looks identical to ESP in TCP [1] (2B length prefix
followed by the actual message), so I think the espintcp code (both tx
and rx, except for actual protocol parsing) should look very
similar. The problems that need to be solved for both protocols are
pretty much the same.

[1] https://www.rfc-editor.org/rfc/rfc8229#section-3
Antonio Quartulli May 14, 2024, 10:11 p.m. UTC | #6
On 14/05/2024 10:58, Sabrina Dubroca wrote:
>>>> diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
>>>> index b5ff59a4b40f..ac4907705d98 100644
>>>> --- a/drivers/net/ovpn/peer.h
>>>> +++ b/drivers/net/ovpn/peer.h
>>>> + * @tcp.raw_len: next packet length as read from the stream (TCP only)
>>>> + * @tcp.skb: next packet being filled with data from the stream (TCP only)
>>>> + * @tcp.offset: position of the next byte to write in the skb (TCP only)
>>>> + * @tcp.data_len: next packet length converted to host order (TCP only)
>>>
>>> It would be nice to add information about whether they're used for TX or RX.
>>
>> they are all about "from the stream" and "to the skb", meaning that we are
>> doing RX.
>> Will make it more explicit.
> 
> Maybe group them in a struct rx?

yap, makes sense.

> 
>>>> + * @tcp.sk_cb.sk_data_ready: pointer to original cb
>>>> + * @tcp.sk_cb.sk_write_space: pointer to original cb
>>>> + * @tcp.sk_cb.prot: pointer to original prot object
>>>>     * @crypto: the crypto configuration (ciphers, keys, etc..)
>>>>     * @dst_cache: cache for dst_entry used to send to peer
>>>>     * @bind: remote peer binding
>>>> @@ -59,6 +69,25 @@ struct ovpn_peer {
>>>>    	struct ptr_ring netif_rx_ring;
>>>>    	struct napi_struct napi;
>>>>    	struct ovpn_socket *sock;
>>>> +	/* state of the TCP reading. Needed to keep track of how much of a
>>>> +	 * single packet has already been read from the stream and how much is
>>>> +	 * missing
>>>> +	 */
>>>> +	struct {
>>>> +		struct ptr_ring tx_ring;
>>>> +		struct work_struct tx_work;
>>>> +		struct work_struct rx_work;
>>>> +
>>>> +		u8 raw_len[sizeof(u16)];
>>>
>>> Why not u16 or __be16 for this one?
>>
>> because in this array we are putting the bytes as we get them from the
>> stream.
>> We may be at the point where one out of two bytes is available on the
>> stream. For this reason I use an array to store this u16 byte by byte.
>>
>> Once thw two bytes are ready, we convert the content in an actual int and
>> store it in "data_len" (a few lines below).
> 
> Ok, I see. Hopefully you can switch to strparser and make this one go
> away.
> 
> 
>>>> diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
>>>> index e099a61b03fa..004db5b13663 100644
>>>> --- a/drivers/net/ovpn/socket.c
>>>> +++ b/drivers/net/ovpn/socket.c
>>>> @@ -16,6 +16,7 @@
>>>>    #include "packet.h"
>>>>    #include "peer.h"
>>>>    #include "socket.h"
>>>> +#include "tcp.h"
>>>>    #include "udp.h"
>>>>    /* Finalize release of socket, called after RCU grace period */
>>>> @@ -26,6 +27,8 @@ static void ovpn_socket_detach(struct socket *sock)
>>>>    	if (sock->sk->sk_protocol == IPPROTO_UDP)
>>>>    		ovpn_udp_socket_detach(sock);
>>>> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
>>>> +		ovpn_tcp_socket_detach(sock);
>>>>    	sockfd_put(sock);
>>>>    }
>>>> @@ -69,6 +72,8 @@ static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer)
>>>>    	if (sock->sk->sk_protocol == IPPROTO_UDP)
>>>>    		ret = ovpn_udp_socket_attach(sock, peer->ovpn);
>>>> +	else if (sock->sk->sk_protocol == IPPROTO_TCP)
>>>> +		ret = ovpn_tcp_socket_attach(sock, peer);
>>>>    	return ret;
>>>>    }
>>>> @@ -124,6 +129,21 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
>>>>    	ovpn_sock->sock = sock;
>>>
>>> The line above this is:
>>>
>>>       ovpn_sock->ovpn = peer->ovpn;
>>>
>>> It's technically fine since you then overwrite this with peer in case
>>> we're on TCP, but ovpn_sock->ovpn only exists on UDP since you moved
>>> it into a union in this patch.
>>
>> Yeah, I did not want to make another branch, but having a UDP specific case
>> will make code easier to read.
> 
> Either that, or drop the union.

ACK

> 
> 
>>>> diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
>>>> new file mode 100644
>>>> index 000000000000..84ad7cd4fc4f
>>>> --- /dev/null
>>>> +++ b/drivers/net/ovpn/tcp.c
>>>> @@ -0,0 +1,511 @@
>>>> +static int ovpn_tcp_read_sock(read_descriptor_t *desc, struct sk_buff *in_skb,
>>>> +			      unsigned int in_offset, size_t in_len)
>>>> +{
>>>> +	struct sock *sk = desc->arg.data;
>>>> +	struct ovpn_socket *sock;
>>>> +	struct ovpn_skb_cb *cb;
>>>> +	struct ovpn_peer *peer;
>>>> +	size_t chunk, copied = 0;
>>>> +	void *data;
>>>> +	u16 len;
>>>> +	int st;
>>>> +
>>>> +	rcu_read_lock();
>>>> +	sock = rcu_dereference_sk_user_data(sk);
>>>> +	rcu_read_unlock();
>>>
>>> You can't just release rcu_read_lock and keep using sock (here and in
>>> the rest of this file). Either you keep rcu_read_lock, or you can take
>>> a reference on the ovpn_socket.
>>
>> I was just staring at this today, after having worked on the
>> rcu_read_lock/unlock for the peer get()s..
>>
>> I thinkt the assumption was: if we are in this read_sock callback, it's
>> impossible that the ovpn_socket was invalidated, because it gets invalidated
>> upon detach, which also prevents any further calling of this callback. But
>> this sounds racy, and I guess we should somewhat hold a reference..
> 
> ovpn_tcp_read_sock starts
> 
> detach
> kfree_rcu(ovpn_socket)
> ...
> ovpn_socket actually freed
> ...
> ovpn_tcp_read_sock continues with freed ovpn_socket
> 
> 
> I don't think anything in the current code prevents this.

mh yeah, if something like this happens right after having started the 
read_sock we are doomed.
Will fix this.


> 
> 
>>>> +/* Set TCP encapsulation callbacks */
>>>> +int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
>>>> +{
>>>> +	void *old_data;
>>>> +	int ret;
>>>> +
>>>> +	INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
>>>> +
>>>> +	ret = ptr_ring_init(&peer->tcp.tx_ring, OVPN_QUEUE_LEN, GFP_KERNEL);
>>>> +	if (ret < 0) {
>>>> +		netdev_err(peer->ovpn->dev, "cannot allocate TCP TX ring\n");
>>>> +		return ret;
>>>> +	}
>>>> +
>>>> +	peer->tcp.skb = NULL;
>>>> +	peer->tcp.offset = 0;
>>>> +	peer->tcp.data_len = 0;
>>>> +
>>>> +	write_lock_bh(&sock->sk->sk_callback_lock);
>>>> +
>>>> +	/* make sure no pre-existing encapsulation handler exists */
>>>> +	rcu_read_lock();
>>>> +	old_data = rcu_dereference_sk_user_data(sock->sk);
>>>> +	rcu_read_unlock();
>>>> +	if (old_data) {
>>>> +		netdev_err(peer->ovpn->dev,
>>>> +			   "provided socket already taken by other user\n");
>>>> +		ret = -EBUSY;
>>>> +		goto err;
>>>
>>> The UDP code differentiates "socket already owned by this interface"
>>> from "already taken by other user". That doesn't apply to TCP?
>>
>> This makes me wonder: how safe it is to interpret the user data as an object
>> of type ovpn_socket?
>>
>> When we find the user data already assigned, we don't know what was really
>> stored in there, right?
>> Technically this socket could have gone through another module which
>> assigned its own state.
>>
>> Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
>> *)user_data)->ovpn ] is probably not safe. Would you agree?
> 
> Hmmm, yeah, I think you're right. If you checked encap_type ==
> UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
> really your data. Basically call ovpn_from_udp_sock during attach if
> you want to check something beyond EBUSY.

right. Maybe we can leave with simply reporting EBUSY and be done with 
it, without adding extra checks and what not.

> 
> Once you're in your own callbacks, it should be safe. If some other
> code sends packet with a non-ovpn socket to ovpn's ->encap_rcv,
> something is really broken.

yup

> 
>>>> +int __init ovpn_tcp_init(void)
>>>> +{
>>>> +	/* We need to substitute the recvmsg and the sock_is_readable
>>>> +	 * callbacks in the sk_prot member of the sock object for TCP
>>>> +	 * sockets.
>>>> +	 *
>>>> +	 * However sock->sk_prot is a pointer to a static variable and
>>>> +	 * therefore we can't directly modify it, otherwise every socket
>>>> +	 * pointing to it will be affected.
>>>> +	 *
>>>> +	 * For this reason we create our own static copy and modify what
>>>> +	 * we need. Then we make sk_prot point to this copy
>>>> +	 * (in ovpn_tcp_socket_attach())
>>>> +	 */
>>>> +	ovpn_tcp_prot = tcp_prot;
>>>
>>> Don't you need a separate variant for IPv6, like TLS does?
>>
>> Never did so far.
>>
>> My wild wild wild guess: for the time this socket is owned by ovpn, we only
>> use callbacks that are IPvX agnostic, hence v4 vs v6 doesn't make any
>> difference.
>> When this socket is released, we reassigned the original prot.
> 
> That seems a bit suspicious to me. For example, tcpv6_prot has a
> different backlog_rcv. And you don't control if the socket is detached
> before being closed, or which callbacks are needed. Your userspace
> client doesn't use them, but someone else's might.
> 
>>>> +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
>>>
>>> You don't need to replace ->sendmsg as well? The userspace client is
>>> not expected to send messages?
>>
>> It is, but my assumption is that those packets will just go through the
>> socket as usual. No need to be handled by ovpn (those packets are not
>> encrypted/decrypted, like data traffic is).
>> And this is how it has worked so far.
>>
>> Makes sense?
> 
> Two things come to mind:
> 
> - userspace is expected to prefix the messages it inserts on the
>    stream with the 2-byte length field? otherwise, the peer won't be
>    able to parse them out of the stream

correct. userspace sends those packets as if ovpn is not running, 
therefore this happens naturally.

> 
> - I'm not convinced this would be safe wrt kernel writing partial
>    messages. if ovpn_tcp_send_one doesn't send the full message, you
>    could interleave two messages:
> 
>    +------+-------------------+------+--------+----------------+
>    | len1 | (bytes from msg1) | len2 | (msg2) | (rest of msg1) |
>    +------+-------------------+------+--------+----------------+
> 
>    and the RX side would parse that as:
> 
>    +------+-----------------------------------+------+---------
>    | len1 | (bytes from msg1) | len2 | (msg2) | ???? | ...
>    +------+-------------------+---------------+------+---------
> 
>    and try to interpret some random bytes out of either msg1 or msg2 as
>    a length prefix, resulting in a broken stream.

hm you are correct. if multiple sendmsg can overlap, then we might be in 
troubles, but are we sure this can truly happen?

> 
> 
> The stream format looks identical to ESP in TCP [1] (2B length prefix
> followed by the actual message), so I think the espintcp code (both tx
> and rx, except for actual protocol parsing) should look very
> similar. The problems that need to be solved for both protocols are
> pretty much the same.

ok, will have a look. maybe this will simplify the code even more and we 
will get rid of some of the issues we were discussing above.

Thanks!

> 
> [1] https://www.rfc-editor.org/rfc/rfc8229#section-3
>
Sabrina Dubroca May 15, 2024, 10:19 a.m. UTC | #7
2024-05-15, 00:11:28 +0200, Antonio Quartulli wrote:
> On 14/05/2024 10:58, Sabrina Dubroca wrote:
> > > > The UDP code differentiates "socket already owned by this interface"
> > > > from "already taken by other user". That doesn't apply to TCP?
> > > 
> > > This makes me wonder: how safe it is to interpret the user data as an object
> > > of type ovpn_socket?
> > > 
> > > When we find the user data already assigned, we don't know what was really
> > > stored in there, right?
> > > Technically this socket could have gone through another module which
> > > assigned its own state.
> > > 
> > > Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
> > > *)user_data)->ovpn ] is probably not safe. Would you agree?
> > 
> > Hmmm, yeah, I think you're right. If you checked encap_type ==
> > UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
> > really your data. Basically call ovpn_from_udp_sock during attach if
> > you want to check something beyond EBUSY.
> 
> right. Maybe we can leave with simply reporting EBUSY and be done with it,
> without adding extra checks and what not.

I don't know. What was the reason for the EALREADY handling in udp.c
and the corresponding refcount increase in ovpn_socket_new?


> > > > > +int __init ovpn_tcp_init(void)
> > > > > +{
> > > > > +	/* We need to substitute the recvmsg and the sock_is_readable
> > > > > +	 * callbacks in the sk_prot member of the sock object for TCP
> > > > > +	 * sockets.
> > > > > +	 *
> > > > > +	 * However sock->sk_prot is a pointer to a static variable and
> > > > > +	 * therefore we can't directly modify it, otherwise every socket
> > > > > +	 * pointing to it will be affected.
> > > > > +	 *
> > > > > +	 * For this reason we create our own static copy and modify what
> > > > > +	 * we need. Then we make sk_prot point to this copy
> > > > > +	 * (in ovpn_tcp_socket_attach())
> > > > > +	 */
> > > > > +	ovpn_tcp_prot = tcp_prot;
> > > > 
> > > > Don't you need a separate variant for IPv6, like TLS does?
> > > 
> > > Never did so far.
> > > 
> > > My wild wild wild guess: for the time this socket is owned by ovpn, we only
> > > use callbacks that are IPvX agnostic, hence v4 vs v6 doesn't make any
> > > difference.
> > > When this socket is released, we reassigned the original prot.
> > 
> > That seems a bit suspicious to me. For example, tcpv6_prot has a
> > different backlog_rcv. And you don't control if the socket is detached
> > before being closed, or which callbacks are needed. Your userspace
> > client doesn't use them, but someone else's might.
> > 
> > > > > +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
> > > > 
> > > > You don't need to replace ->sendmsg as well? The userspace client is
> > > > not expected to send messages?
> > > 
> > > It is, but my assumption is that those packets will just go through the
> > > socket as usual. No need to be handled by ovpn (those packets are not
> > > encrypted/decrypted, like data traffic is).
> > > And this is how it has worked so far.
> > > 
> > > Makes sense?
> > 
> > Two things come to mind:
> > 
> > - userspace is expected to prefix the messages it inserts on the
> >    stream with the 2-byte length field? otherwise, the peer won't be
> >    able to parse them out of the stream
> 
> correct. userspace sends those packets as if ovpn is not running, therefore
> this happens naturally.

ok.


> > - I'm not convinced this would be safe wrt kernel writing partial
> >    messages. if ovpn_tcp_send_one doesn't send the full message, you
> >    could interleave two messages:
> > 
> >    +------+-------------------+------+--------+----------------+
> >    | len1 | (bytes from msg1) | len2 | (msg2) | (rest of msg1) |
> >    +------+-------------------+------+--------+----------------+
> > 
> >    and the RX side would parse that as:
> > 
> >    +------+-----------------------------------+------+---------
> >    | len1 | (bytes from msg1) | len2 | (msg2) | ???? | ...
> >    +------+-------------------+---------------+------+---------
> > 
> >    and try to interpret some random bytes out of either msg1 or msg2 as
> >    a length prefix, resulting in a broken stream.
> 
> hm you are correct. if multiple sendmsg can overlap, then we might be in
> troubles, but are we sure this can truly happen?

What would prevent this? The kernel_sendmsg call in ovpn_tcp_send_one
could send a partial message, and then what would stop userspace from
sending its own message during the cond_resched from ovpn_tcp_tx_work?

> > The stream format looks identical to ESP in TCP [1] (2B length prefix
> > followed by the actual message), so I think the espintcp code (both tx
> > and rx, except for actual protocol parsing) should look very
> > similar. The problems that need to be solved for both protocols are
> > pretty much the same.
> 
> ok, will have a look. maybe this will simplify the code even more and we
> will get rid of some of the issues we were discussing above.

I doubt dealing with possible interleaving will make the code simpler,
but I think it has to be done.
Antonio Quartulli May 15, 2024, 12:54 p.m. UTC | #8
On 15/05/2024 12:19, Sabrina Dubroca wrote:
> 2024-05-15, 00:11:28 +0200, Antonio Quartulli wrote:
>> On 14/05/2024 10:58, Sabrina Dubroca wrote:
>>>>> The UDP code differentiates "socket already owned by this interface"
>>>>> from "already taken by other user". That doesn't apply to TCP?
>>>>
>>>> This makes me wonder: how safe it is to interpret the user data as an object
>>>> of type ovpn_socket?
>>>>
>>>> When we find the user data already assigned, we don't know what was really
>>>> stored in there, right?
>>>> Technically this socket could have gone through another module which
>>>> assigned its own state.
>>>>
>>>> Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
>>>> *)user_data)->ovpn ] is probably not safe. Would you agree?
>>>
>>> Hmmm, yeah, I think you're right. If you checked encap_type ==
>>> UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
>>> really your data. Basically call ovpn_from_udp_sock during attach if
>>> you want to check something beyond EBUSY.
>>
>> right. Maybe we can leave with simply reporting EBUSY and be done with it,
>> without adding extra checks and what not.
> 
> I don't know. What was the reason for the EALREADY handling in udp.c
> and the corresponding refcount increase in ovpn_socket_new?

it's just me that likes to be verbose when doing error reporting.
But eventually the exact error is ignored and we release the reference. 
 From netlink.c:

342                 peer->sock = ovpn_socket_new(sock, peer);
343                 if (IS_ERR(peer->sock)) {
344                         sockfd_put(sock);
345                         peer->sock = NULL;
346                         ret = -ENOTSOCK;

so no added value in distinguishing the two cases.

> 
> 
>>>>>> +int __init ovpn_tcp_init(void)
>>>>>> +{
>>>>>> +	/* We need to substitute the recvmsg and the sock_is_readable
>>>>>> +	 * callbacks in the sk_prot member of the sock object for TCP
>>>>>> +	 * sockets.
>>>>>> +	 *
>>>>>> +	 * However sock->sk_prot is a pointer to a static variable and
>>>>>> +	 * therefore we can't directly modify it, otherwise every socket
>>>>>> +	 * pointing to it will be affected.
>>>>>> +	 *
>>>>>> +	 * For this reason we create our own static copy and modify what
>>>>>> +	 * we need. Then we make sk_prot point to this copy
>>>>>> +	 * (in ovpn_tcp_socket_attach())
>>>>>> +	 */
>>>>>> +	ovpn_tcp_prot = tcp_prot;
>>>>>
>>>>> Don't you need a separate variant for IPv6, like TLS does?
>>>>
>>>> Never did so far.
>>>>
>>>> My wild wild wild guess: for the time this socket is owned by ovpn, we only
>>>> use callbacks that are IPvX agnostic, hence v4 vs v6 doesn't make any
>>>> difference.
>>>> When this socket is released, we reassigned the original prot.
>>>
>>> That seems a bit suspicious to me. For example, tcpv6_prot has a
>>> different backlog_rcv. And you don't control if the socket is detached
>>> before being closed, or which callbacks are needed. Your userspace
>>> client doesn't use them, but someone else's might.
>>>
>>>>>> +	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
>>>>>
>>>>> You don't need to replace ->sendmsg as well? The userspace client is
>>>>> not expected to send messages?
>>>>
>>>> It is, but my assumption is that those packets will just go through the
>>>> socket as usual. No need to be handled by ovpn (those packets are not
>>>> encrypted/decrypted, like data traffic is).
>>>> And this is how it has worked so far.
>>>>
>>>> Makes sense?
>>>
>>> Two things come to mind:
>>>
>>> - userspace is expected to prefix the messages it inserts on the
>>>     stream with the 2-byte length field? otherwise, the peer won't be
>>>     able to parse them out of the stream
>>
>> correct. userspace sends those packets as if ovpn is not running, therefore
>> this happens naturally.
> 
> ok.
> 
> 
>>> - I'm not convinced this would be safe wrt kernel writing partial
>>>     messages. if ovpn_tcp_send_one doesn't send the full message, you
>>>     could interleave two messages:
>>>
>>>     +------+-------------------+------+--------+----------------+
>>>     | len1 | (bytes from msg1) | len2 | (msg2) | (rest of msg1) |
>>>     +------+-------------------+------+--------+----------------+
>>>
>>>     and the RX side would parse that as:
>>>
>>>     +------+-----------------------------------+------+---------
>>>     | len1 | (bytes from msg1) | len2 | (msg2) | ???? | ...
>>>     +------+-------------------+---------------+------+---------
>>>
>>>     and try to interpret some random bytes out of either msg1 or msg2 as
>>>     a length prefix, resulting in a broken stream.
>>
>> hm you are correct. if multiple sendmsg can overlap, then we might be in
>> troubles, but are we sure this can truly happen?
> 
> What would prevent this? The kernel_sendmsg call in ovpn_tcp_send_one
> could send a partial message, and then what would stop userspace from
> sending its own message during the cond_resched from ovpn_tcp_tx_work?

I was under the impression that ovpn_tcp_send_one() would always send an 
entire packet, but this may not be the case. So you're definitely right.

We may end up having interleaving sendmsg from kernelspace and userspace.

> 
>>> The stream format looks identical to ESP in TCP [1] (2B length prefix
>>> followed by the actual message), so I think the espintcp code (both tx
>>> and rx, except for actual protocol parsing) should look very
>>> similar. The problems that need to be solved for both protocols are
>>> pretty much the same.
>>
>> ok, will have a look. maybe this will simplify the code even more and we
>> will get rid of some of the issues we were discussing above.
> 
> I doubt dealing with possible interleaving will make the code simpler,
> but I think it has to be done.

Yap.

Thanks a lot for pointing this out and for the pointers you gave me.

>
Sabrina Dubroca May 15, 2024, 2:55 p.m. UTC | #9
2024-05-15, 14:54:49 +0200, Antonio Quartulli wrote:
> On 15/05/2024 12:19, Sabrina Dubroca wrote:
> > 2024-05-15, 00:11:28 +0200, Antonio Quartulli wrote:
> > > On 14/05/2024 10:58, Sabrina Dubroca wrote:
> > > > > > The UDP code differentiates "socket already owned by this interface"
> > > > > > from "already taken by other user". That doesn't apply to TCP?
> > > > > 
> > > > > This makes me wonder: how safe it is to interpret the user data as an object
> > > > > of type ovpn_socket?
> > > > > 
> > > > > When we find the user data already assigned, we don't know what was really
> > > > > stored in there, right?
> > > > > Technically this socket could have gone through another module which
> > > > > assigned its own state.
> > > > > 
> > > > > Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
> > > > > *)user_data)->ovpn ] is probably not safe. Would you agree?
> > > > 
> > > > Hmmm, yeah, I think you're right. If you checked encap_type ==
> > > > UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
> > > > really your data. Basically call ovpn_from_udp_sock during attach if
> > > > you want to check something beyond EBUSY.
> > > 
> > > right. Maybe we can leave with simply reporting EBUSY and be done with it,
> > > without adding extra checks and what not.
> > 
> > I don't know. What was the reason for the EALREADY handling in udp.c
> > and the corresponding refcount increase in ovpn_socket_new?
> 
> it's just me that likes to be verbose when doing error reporting.

With the "already owned by this interface" message? Sure, I get that.

> But eventually the exact error is ignored and we release the reference. From
> netlink.c:
> 
> 342                 peer->sock = ovpn_socket_new(sock, peer);
> 343                 if (IS_ERR(peer->sock)) {
> 344                         sockfd_put(sock);
> 345                         peer->sock = NULL;
> 346                         ret = -ENOTSOCK;
> 
> so no added value in distinguishing the two cases.

But ovpn_socket_new currently turns EALREADY into a valid result, so
we won't go through the error hanadling here. That's the part I'm
unclear about.
Antonio Quartulli May 15, 2024, 7:44 p.m. UTC | #10
On 15/05/2024 16:55, Sabrina Dubroca wrote:
> 2024-05-15, 14:54:49 +0200, Antonio Quartulli wrote:
>> On 15/05/2024 12:19, Sabrina Dubroca wrote:
>>> 2024-05-15, 00:11:28 +0200, Antonio Quartulli wrote:
>>>> On 14/05/2024 10:58, Sabrina Dubroca wrote:
>>>>>>> The UDP code differentiates "socket already owned by this interface"
>>>>>>> from "already taken by other user". That doesn't apply to TCP?
>>>>>>
>>>>>> This makes me wonder: how safe it is to interpret the user data as an object
>>>>>> of type ovpn_socket?
>>>>>>
>>>>>> When we find the user data already assigned, we don't know what was really
>>>>>> stored in there, right?
>>>>>> Technically this socket could have gone through another module which
>>>>>> assigned its own state.
>>>>>>
>>>>>> Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
>>>>>> *)user_data)->ovpn ] is probably not safe. Would you agree?
>>>>>
>>>>> Hmmm, yeah, I think you're right. If you checked encap_type ==
>>>>> UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
>>>>> really your data. Basically call ovpn_from_udp_sock during attach if
>>>>> you want to check something beyond EBUSY.
>>>>
>>>> right. Maybe we can leave with simply reporting EBUSY and be done with it,
>>>> without adding extra checks and what not.
>>>
>>> I don't know. What was the reason for the EALREADY handling in udp.c
>>> and the corresponding refcount increase in ovpn_socket_new?
>>
>> it's just me that likes to be verbose when doing error reporting.
> 
> With the "already owned by this interface" message? Sure, I get that.
> 
>> But eventually the exact error is ignored and we release the reference. From
>> netlink.c:
>>
>> 342                 peer->sock = ovpn_socket_new(sock, peer);
>> 343                 if (IS_ERR(peer->sock)) {
>> 344                         sockfd_put(sock);
>> 345                         peer->sock = NULL;
>> 346                         ret = -ENOTSOCK;
>>
>> so no added value in distinguishing the two cases.
> 
> But ovpn_socket_new currently turns EALREADY into a valid result, so
> we won't go through the error hanadling here. That's the part I'm
> unclear about.

you're right. I had forgotten a little but important detail.

With UDP OpenVPN creates one socket and uses it for all peers.
With TCP we forcefully need one socket per client.

Consequently, when a UDP socket is found to be used by our own instance, 
  we can happily increase the refcounter and use it as if it was free 
(we are just attaching it to yet another peer).

In TCP this is not possible, so the socket must be unused, otherwise we 
can't attach it.

I hope it makes sense.

>
Sabrina Dubroca May 15, 2024, 8:35 p.m. UTC | #11
2024-05-15, 21:44:44 +0200, Antonio Quartulli wrote:
> On 15/05/2024 16:55, Sabrina Dubroca wrote:
> > 2024-05-15, 14:54:49 +0200, Antonio Quartulli wrote:
> > > On 15/05/2024 12:19, Sabrina Dubroca wrote:
> > > > 2024-05-15, 00:11:28 +0200, Antonio Quartulli wrote:
> > > > > On 14/05/2024 10:58, Sabrina Dubroca wrote:
> > > > > > > > The UDP code differentiates "socket already owned by this interface"
> > > > > > > > from "already taken by other user". That doesn't apply to TCP?
> > > > > > > 
> > > > > > > This makes me wonder: how safe it is to interpret the user data as an object
> > > > > > > of type ovpn_socket?
> > > > > > > 
> > > > > > > When we find the user data already assigned, we don't know what was really
> > > > > > > stored in there, right?
> > > > > > > Technically this socket could have gone through another module which
> > > > > > > assigned its own state.
> > > > > > > 
> > > > > > > Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
> > > > > > > *)user_data)->ovpn ] is probably not safe. Would you agree?
> > > > > > 
> > > > > > Hmmm, yeah, I think you're right. If you checked encap_type ==
> > > > > > UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
> > > > > > really your data. Basically call ovpn_from_udp_sock during attach if
> > > > > > you want to check something beyond EBUSY.
> > > > > 
> > > > > right. Maybe we can leave with simply reporting EBUSY and be done with it,
> > > > > without adding extra checks and what not.
> > > > 
> > > > I don't know. What was the reason for the EALREADY handling in udp.c
> > > > and the corresponding refcount increase in ovpn_socket_new?
> > > 
> > > it's just me that likes to be verbose when doing error reporting.
> > 
> > With the "already owned by this interface" message? Sure, I get that.
> > 
> > > But eventually the exact error is ignored and we release the reference. From
> > > netlink.c:
> > > 
> > > 342                 peer->sock = ovpn_socket_new(sock, peer);
> > > 343                 if (IS_ERR(peer->sock)) {
> > > 344                         sockfd_put(sock);
> > > 345                         peer->sock = NULL;
> > > 346                         ret = -ENOTSOCK;
> > > 
> > > so no added value in distinguishing the two cases.
> > 
> > But ovpn_socket_new currently turns EALREADY into a valid result, so
> > we won't go through the error hanadling here. That's the part I'm
> > unclear about.
> 
> you're right. I had forgotten a little but important detail.
> 
> With UDP OpenVPN creates one socket and uses it for all peers.
> With TCP we forcefully need one socket per client.
> 
> Consequently, when a UDP socket is found to be used by our own instance,  we
> can happily increase the refcounter and use it as if it was free (we are
> just attaching it to yet another peer).
> 
> In TCP this is not possible, so the socket must be unused, otherwise we
> can't attach it.
> 
> I hope it makes sense.

Yes, thanks. This behavior should be documented (for example, by
putting exactly what you just wrote in a comment above
ovpn_socket_new).

So for TCP you just need the existing check and EBUSY return. For UDP,
you need the EALREADY check, but with an extra encap_type test before
looking at the contents of the sk_user_data.
Antonio Quartulli May 15, 2024, 8:39 p.m. UTC | #12
On 15/05/2024 22:35, Sabrina Dubroca wrote:
> 2024-05-15, 21:44:44 +0200, Antonio Quartulli wrote:
>> On 15/05/2024 16:55, Sabrina Dubroca wrote:
>>> 2024-05-15, 14:54:49 +0200, Antonio Quartulli wrote:
>>>> On 15/05/2024 12:19, Sabrina Dubroca wrote:
>>>>> 2024-05-15, 00:11:28 +0200, Antonio Quartulli wrote:
>>>>>> On 14/05/2024 10:58, Sabrina Dubroca wrote:
>>>>>>>>> The UDP code differentiates "socket already owned by this interface"
>>>>>>>>> from "already taken by other user". That doesn't apply to TCP?
>>>>>>>>
>>>>>>>> This makes me wonder: how safe it is to interpret the user data as an object
>>>>>>>> of type ovpn_socket?
>>>>>>>>
>>>>>>>> When we find the user data already assigned, we don't know what was really
>>>>>>>> stored in there, right?
>>>>>>>> Technically this socket could have gone through another module which
>>>>>>>> assigned its own state.
>>>>>>>>
>>>>>>>> Therefore I think that what UDP does [ dereferencing ((struct ovpn_socket
>>>>>>>> *)user_data)->ovpn ] is probably not safe. Would you agree?
>>>>>>>
>>>>>>> Hmmm, yeah, I think you're right. If you checked encap_type ==
>>>>>>> UDP_ENCAP_OVPNINUDP before (sk_prot for TCP), then you'd know it's
>>>>>>> really your data. Basically call ovpn_from_udp_sock during attach if
>>>>>>> you want to check something beyond EBUSY.
>>>>>>
>>>>>> right. Maybe we can leave with simply reporting EBUSY and be done with it,
>>>>>> without adding extra checks and what not.
>>>>>
>>>>> I don't know. What was the reason for the EALREADY handling in udp.c
>>>>> and the corresponding refcount increase in ovpn_socket_new?
>>>>
>>>> it's just me that likes to be verbose when doing error reporting.
>>>
>>> With the "already owned by this interface" message? Sure, I get that.
>>>
>>>> But eventually the exact error is ignored and we release the reference. From
>>>> netlink.c:
>>>>
>>>> 342                 peer->sock = ovpn_socket_new(sock, peer);
>>>> 343                 if (IS_ERR(peer->sock)) {
>>>> 344                         sockfd_put(sock);
>>>> 345                         peer->sock = NULL;
>>>> 346                         ret = -ENOTSOCK;
>>>>
>>>> so no added value in distinguishing the two cases.
>>>
>>> But ovpn_socket_new currently turns EALREADY into a valid result, so
>>> we won't go through the error hanadling here. That's the part I'm
>>> unclear about.
>>
>> you're right. I had forgotten a little but important detail.
>>
>> With UDP OpenVPN creates one socket and uses it for all peers.
>> With TCP we forcefully need one socket per client.
>>
>> Consequently, when a UDP socket is found to be used by our own instance,  we
>> can happily increase the refcounter and use it as if it was free (we are
>> just attaching it to yet another peer).
>>
>> In TCP this is not possible, so the socket must be unused, otherwise we
>> can't attach it.
>>
>> I hope it makes sense.
> 
> Yes, thanks. This behavior should be documented (for example, by
> putting exactly what you just wrote in a comment above
> ovpn_socket_new).

absolutely, will do.

> 
> So for TCP you just need the existing check and EBUSY return. For UDP,
> you need the EALREADY check, but with an extra encap_type test before
> looking at the contents of the sk_user_data.

ACK
diff mbox series

Patch

diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile
index d43fda72646b..f4d4bd87c851 100644
--- a/drivers/net/ovpn/Makefile
+++ b/drivers/net/ovpn/Makefile
@@ -18,4 +18,5 @@  ovpn-y += peer.o
 ovpn-y += pktid.o
 ovpn-y += socket.o
 ovpn-y += stats.o
+ovpn-y += tcp.o
 ovpn-y += udp.o
diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index 699e7f1274db..49efcfff963c 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -21,6 +21,7 @@ 
 #include "netlink.h"
 #include "proto.h"
 #include "socket.h"
+#include "tcp.h"
 #include "udp.h"
 
 int ovpn_struct_init(struct net_device *dev)
@@ -307,6 +308,7 @@  static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb)
 /* Process packets in TX queue in a transport-specific way.
  *
  * UDP transport - encrypt and send across the tunnel.
+ * TCP transport - encrypt and put into TCP TX queue.
  */
 void ovpn_encrypt_work(struct work_struct *work)
 {
@@ -340,6 +342,9 @@  void ovpn_encrypt_work(struct work_struct *work)
 					ovpn_udp_send_skb(peer->ovpn, peer,
 							  curr);
 					break;
+				case IPPROTO_TCP:
+					ovpn_tcp_send_skb(peer, curr);
+					break;
 				default:
 					/* no transport configured yet */
 					consume_skb(skb);
diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 9ae9844dd281..a04d6e55a473 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -23,6 +23,7 @@ 
 #include "io.h"
 #include "packet.h"
 #include "peer.h"
+#include "tcp.h"
 
 /* Driver info */
 #define DRV_DESCRIPTION	"OpenVPN data channel offload (ovpn)"
@@ -247,8 +248,14 @@  static struct pernet_operations ovpn_pernet_ops = {
 
 static int __init ovpn_init(void)
 {
-	int err = register_netdevice_notifier(&ovpn_netdev_notifier);
+	int err = ovpn_tcp_init();
 
+	if (err) {
+		pr_err("ovpn: cannot initialize TCP component: %d\n", err);
+		return err;
+	}
+
+	err = register_netdevice_notifier(&ovpn_netdev_notifier);
 	if (err) {
 		pr_err("ovpn: can't register netdevice notifier: %d\n", err);
 		return err;
diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
index b5ff59a4b40f..ac4907705d98 100644
--- a/drivers/net/ovpn/peer.h
+++ b/drivers/net/ovpn/peer.h
@@ -33,6 +33,16 @@ 
  * @netif_rx_ring: queue of packets to be sent to the netdevice via NAPI
  * @napi: NAPI object
  * @sock: the socket being used to talk to this peer
+ * @tcp.tx_ring: queue for packets to be forwarded to userspace (TCP only)
+ * @tcp.tx_work: work for processing outgoing socket data (TCP only)
+ * @tcp.rx_work: wok for processing incoming socket data (TCP only)
+ * @tcp.raw_len: next packet length as read from the stream (TCP only)
+ * @tcp.skb: next packet being filled with data from the stream (TCP only)
+ * @tcp.offset: position of the next byte to write in the skb (TCP only)
+ * @tcp.data_len: next packet length converted to host order (TCP only)
+ * @tcp.sk_cb.sk_data_ready: pointer to original cb
+ * @tcp.sk_cb.sk_write_space: pointer to original cb
+ * @tcp.sk_cb.prot: pointer to original prot object
  * @crypto: the crypto configuration (ciphers, keys, etc..)
  * @dst_cache: cache for dst_entry used to send to peer
  * @bind: remote peer binding
@@ -59,6 +69,25 @@  struct ovpn_peer {
 	struct ptr_ring netif_rx_ring;
 	struct napi_struct napi;
 	struct ovpn_socket *sock;
+	/* state of the TCP reading. Needed to keep track of how much of a
+	 * single packet has already been read from the stream and how much is
+	 * missing
+	 */
+	struct {
+		struct ptr_ring tx_ring;
+		struct work_struct tx_work;
+		struct work_struct rx_work;
+
+		u8 raw_len[sizeof(u16)];
+		struct sk_buff *skb;
+		u16 offset;
+		u16 data_len;
+		struct {
+			void (*sk_data_ready)(struct sock *sk);
+			void (*sk_write_space)(struct sock *sk);
+			struct proto *prot;
+		} sk_cb;
+	} tcp;
 	struct ovpn_crypto_state crypto;
 	struct dst_cache dst_cache;
 	struct ovpn_bind __rcu *bind;
diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h
new file mode 100644
index 000000000000..ba92811e12ff
--- /dev/null
+++ b/drivers/net/ovpn/skb.h
@@ -0,0 +1,51 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2020-2024 OpenVPN, Inc.
+ *
+ *  Author:	Antonio Quartulli <antonio@openvpn.net>
+ *		James Yonan <james@openvpn.net>
+ */
+
+#ifndef _NET_OVPN_SKB_H_
+#define _NET_OVPN_SKB_H_
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#define OVPN_SKB_CB(skb) ((struct ovpn_skb_cb *)&((skb)->cb))
+
+struct ovpn_skb_cb {
+	union {
+		struct in_addr ipv4;
+		struct in6_addr ipv6;
+	} local;
+	sa_family_t sa_fam;
+};
+
+/* Return IP protocol version from skb header.
+ * Return 0 if protocol is not IPv4/IPv6 or cannot be read.
+ */
+static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb)
+{
+	__be16 proto = 0;
+
+	/* skb could be non-linear,
+	 * make sure IP header is in non-fragmented part
+	 */
+	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+		return 0;
+
+	if (ip_hdr(skb)->version == 4)
+		proto = htons(ETH_P_IP);
+	else if (ip_hdr(skb)->version == 6)
+		proto = htons(ETH_P_IPV6);
+
+	return proto;
+}
+
+#endif /* _NET_OVPN_SKB_H_ */
diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
index e099a61b03fa..004db5b13663 100644
--- a/drivers/net/ovpn/socket.c
+++ b/drivers/net/ovpn/socket.c
@@ -16,6 +16,7 @@ 
 #include "packet.h"
 #include "peer.h"
 #include "socket.h"
+#include "tcp.h"
 #include "udp.h"
 
 /* Finalize release of socket, called after RCU grace period */
@@ -26,6 +27,8 @@  static void ovpn_socket_detach(struct socket *sock)
 
 	if (sock->sk->sk_protocol == IPPROTO_UDP)
 		ovpn_udp_socket_detach(sock);
+	else if (sock->sk->sk_protocol == IPPROTO_TCP)
+		ovpn_tcp_socket_detach(sock);
 
 	sockfd_put(sock);
 }
@@ -69,6 +72,8 @@  static int ovpn_socket_attach(struct socket *sock, struct ovpn_peer *peer)
 
 	if (sock->sk->sk_protocol == IPPROTO_UDP)
 		ret = ovpn_udp_socket_attach(sock, peer->ovpn);
+	else if (sock->sk->sk_protocol == IPPROTO_TCP)
+		ret = ovpn_tcp_socket_attach(sock, peer);
 
 	return ret;
 }
@@ -124,6 +129,21 @@  struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
 	ovpn_sock->sock = sock;
 	kref_init(&ovpn_sock->refcount);
 
+	/* TCP sockets are per-peer, therefore they are linked to their unique
+	 * peer
+	 */
+	if (sock->sk->sk_protocol == IPPROTO_TCP) {
+		ovpn_sock->peer = peer;
+		ret = ptr_ring_init(&ovpn_sock->recv_ring, OVPN_QUEUE_LEN,
+				    GFP_KERNEL);
+		if (ret < 0) {
+			netdev_err(peer->ovpn->dev, "%s: cannot allocate TCP recv ring\n",
+				   __func__);
+			kfree(ovpn_sock);
+			return ERR_PTR(ret);
+		}
+	}
+
 	rcu_assign_sk_user_data(sock->sk, ovpn_sock);
 
 	return ovpn_sock;
diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h
index 0d23de5a9344..88c6271ba5c7 100644
--- a/drivers/net/ovpn/socket.h
+++ b/drivers/net/ovpn/socket.h
@@ -21,12 +21,25 @@  struct ovpn_peer;
 /**
  * struct ovpn_socket - a kernel socket referenced in the ovpn code
  * @ovpn: ovpn instance owning this socket (UDP only)
+ * @peer: unique peer transmitting over this socket (TCP only)
+ * @recv_ring: queue where non-data packets directed to userspace are stored
  * @sock: the low level sock object
  * @refcount: amount of contexts currently referencing this object
  * @rcu: member used to schedule RCU destructor callback
  */
 struct ovpn_socket {
-	struct ovpn_struct *ovpn;
+	union {
+		/* the VPN session object owning this socket (UDP only) */
+		struct ovpn_struct *ovpn;
+
+		/* TCP only */
+		struct {
+			/** @peer: unique peer transmitting over this socket */
+			struct ovpn_peer *peer;
+			struct ptr_ring recv_ring;
+		};
+	};
+
 	struct socket *sock;
 	struct kref refcount;
 	struct rcu_head rcu;
diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
new file mode 100644
index 000000000000..84ad7cd4fc4f
--- /dev/null
+++ b/drivers/net/ovpn/tcp.c
@@ -0,0 +1,511 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2019-2024 OpenVPN, Inc.
+ *
+ *  Author:	Antonio Quartulli <antonio@openvpn.net>
+ */
+
+#include <linux/ptr_ring.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include "ovpnstruct.h"
+#include "main.h"
+#include "io.h"
+#include "packet.h"
+#include "peer.h"
+#include "proto.h"
+#include "skb.h"
+#include "socket.h"
+#include "tcp.h"
+
+static struct proto ovpn_tcp_prot;
+
+static int ovpn_tcp_read_sock(read_descriptor_t *desc, struct sk_buff *in_skb,
+			      unsigned int in_offset, size_t in_len)
+{
+	struct sock *sk = desc->arg.data;
+	struct ovpn_socket *sock;
+	struct ovpn_skb_cb *cb;
+	struct ovpn_peer *peer;
+	size_t chunk, copied = 0;
+	void *data;
+	u16 len;
+	int st;
+
+	rcu_read_lock();
+	sock = rcu_dereference_sk_user_data(sk);
+	rcu_read_unlock();
+
+	if (unlikely(!sock || !sock->peer)) {
+		pr_err("ovpn: read_sock triggered for socket with no metadata\n");
+		desc->error = -EINVAL;
+		return 0;
+	}
+
+	peer = sock->peer;
+
+	while (in_len > 0) {
+		/* no skb allocated means that we have to read (or finish
+		 * reading) the 2 bytes prefix containing the actual packet
+		 * size.
+		 */
+		if (!peer->tcp.skb) {
+			chunk = min_t(size_t, in_len,
+				      sizeof(u16) - peer->tcp.offset);
+			WARN_ON(skb_copy_bits(in_skb, in_offset,
+					      peer->tcp.raw_len +
+					      peer->tcp.offset, chunk) < 0);
+			peer->tcp.offset += chunk;
+
+			/* keep on reading until we got the whole packet size */
+			if (peer->tcp.offset != sizeof(u16))
+				goto next_read;
+
+			len = ntohs(*(__be16 *)peer->tcp.raw_len);
+			/* invalid packet length: this is a fatal TCP error */
+			if (!len) {
+				netdev_err(peer->ovpn->dev,
+					   "%s: received invalid packet length: %d\n",
+					   __func__, len);
+				desc->error = -EINVAL;
+				goto err;
+			}
+
+			/* add 2 bytes to allocated space (and immediately
+			 * reserve them) for packet length prepending, in case
+			 * the skb has to be forwarded to userspace
+			 */
+			peer->tcp.skb =
+				netdev_alloc_skb_ip_align(peer->ovpn->dev,
+							  len + sizeof(u16));
+			if (!peer->tcp.skb) {
+				desc->error = -ENOMEM;
+				goto err;
+			}
+			skb_reserve(peer->tcp.skb, sizeof(u16));
+
+			peer->tcp.offset = 0;
+			peer->tcp.data_len = len;
+		} else {
+			chunk = min_t(size_t, in_len,
+				      peer->tcp.data_len - peer->tcp.offset);
+
+			/* extend skb to accommodate the new chunk and copy it
+			 * from the input skb
+			 */
+			data = skb_put(peer->tcp.skb, chunk);
+			WARN_ON(skb_copy_bits(in_skb, in_offset, data,
+					      chunk) < 0);
+			peer->tcp.offset += chunk;
+
+			/* keep on reading until we get the full packet */
+			if (peer->tcp.offset != peer->tcp.data_len)
+				goto next_read;
+
+			/* do not perform IP caching for TCP connections */
+			cb = OVPN_SKB_CB(peer->tcp.skb);
+			cb->sa_fam = AF_UNSPEC;
+
+			/* At this point we know the packet is from a configured
+			 * peer.
+			 * DATA_V2 packets are handled in kernel space, the rest
+			 * goes to user space.
+			 *
+			 * Queue skb for sending to userspace via recvmsg on the
+			 * socket
+			 */
+			if (likely(ovpn_opcode_from_skb(peer->tcp.skb, 0) ==
+				   OVPN_DATA_V2)) {
+				/* hold reference to peer as required by
+				 * ovpn_recv().
+				 *
+				 * NOTE: in this context we should already be
+				 * holding a reference to this peer, therefore
+				 * ovpn_peer_hold() is not expected to fail
+				 */
+				WARN_ON(!ovpn_peer_hold(peer));
+				st = ovpn_recv(peer->ovpn, peer, peer->tcp.skb);
+				if (unlikely(st < 0))
+					ovpn_peer_put(peer);
+
+			} else {
+				/* prepend skb with packet len. this way
+				 * userspace can parse the packet as if it just
+				 * arrived from the remote endpoint
+				 */
+				void *raw_len = __skb_push(peer->tcp.skb,
+							   sizeof(u16));
+
+				memcpy(raw_len, peer->tcp.raw_len, sizeof(u16));
+
+				st = ptr_ring_produce_bh(&peer->sock->recv_ring,
+							 peer->tcp.skb);
+				if (likely(!st))
+					peer->tcp.sk_cb.sk_data_ready(sk);
+			}
+
+			/* skb not consumed - free it now */
+			if (unlikely(st < 0))
+				kfree_skb(peer->tcp.skb);
+
+			peer->tcp.skb = NULL;
+			peer->tcp.offset = 0;
+			peer->tcp.data_len = 0;
+		}
+next_read:
+		in_len -= chunk;
+		in_offset += chunk;
+		copied += chunk;
+	}
+
+	return copied;
+err:
+	netdev_err(peer->ovpn->dev, "cannot process incoming TCP data: %d\n",
+		   desc->error);
+	ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
+	return 0;
+}
+
+static void ovpn_tcp_data_ready(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	read_descriptor_t desc;
+
+	if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+		return;
+
+	desc.arg.data = sk;
+	desc.error = 0;
+	desc.count = 1;
+
+	sock->ops->read_sock(sk, &desc, ovpn_tcp_read_sock);
+}
+
+static void ovpn_tcp_write_space(struct sock *sk)
+{
+	struct ovpn_socket *sock;
+
+	rcu_read_lock();
+	sock = rcu_dereference_sk_user_data(sk);
+	rcu_read_unlock();
+
+	if (!sock || !sock->peer)
+		return;
+
+	queue_work(sock->peer->ovpn->events_wq, &sock->peer->tcp.tx_work);
+}
+
+static bool ovpn_tcp_sock_is_readable(struct sock *sk)
+
+{
+	struct ovpn_socket *sock;
+
+	rcu_read_lock();
+	sock = rcu_dereference_sk_user_data(sk);
+	rcu_read_unlock();
+
+	if (!sock || !sock->peer)
+		return false;
+
+	return !ptr_ring_empty_bh(&sock->recv_ring);
+}
+
+static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			    int flags, int *addr_len)
+{
+	bool tmp = flags & MSG_DONTWAIT;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret, chunk, copied = 0;
+	struct ovpn_socket *sock;
+	struct sk_buff *skb;
+	long timeo;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
+
+	timeo = sock_rcvtimeo(sk, tmp);
+
+	rcu_read_lock();
+	sock = rcu_dereference_sk_user_data(sk);
+	rcu_read_unlock();
+
+	if (!sock || !sock->peer) {
+		ret = -EBADF;
+		goto unlock;
+	}
+
+	while (ptr_ring_empty_bh(&sock->recv_ring)) {
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			return 0;
+
+		if (sock_flag(sk, SOCK_DONE))
+			return 0;
+
+		if (!timeo) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+
+		add_wait_queue(sk_sleep(sk), &wait);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		sk_wait_event(sk, &timeo, !ptr_ring_empty_bh(&sock->recv_ring),
+			      &wait);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		remove_wait_queue(sk_sleep(sk), &wait);
+
+		/* take care of signals */
+		if (signal_pending(current)) {
+			ret = sock_intr_errno(timeo);
+			goto unlock;
+		}
+	}
+
+	while (len && (skb = __ptr_ring_peek(&sock->recv_ring))) {
+		chunk = min_t(size_t, len, skb->len);
+		ret = skb_copy_datagram_msg(skb, 0, msg, chunk);
+		if (ret < 0) {
+			pr_err("ovpn: cannot copy TCP data to userspace: %d\n",
+			       ret);
+			kfree_skb(skb);
+			goto unlock;
+		}
+
+		__skb_pull(skb, chunk);
+
+		if (!skb->len) {
+			/* skb was entirely consumed and can now be removed from
+			 * the ring
+			 */
+			__ptr_ring_discard_one(&sock->recv_ring);
+			consume_skb(skb);
+		}
+
+		len -= chunk;
+		copied += chunk;
+	}
+	ret = copied;
+
+unlock:
+	return ret ? : -EAGAIN;
+}
+
+static void ovpn_destroy_skb(void *skb)
+{
+	consume_skb(skb);
+}
+
+void ovpn_tcp_socket_detach(struct socket *sock)
+{
+	struct ovpn_socket *ovpn_sock;
+	struct ovpn_peer *peer;
+
+	if (!sock)
+		return;
+
+	rcu_read_lock();
+	ovpn_sock = rcu_dereference_sk_user_data(sock->sk);
+	rcu_read_unlock();
+
+	if (!ovpn_sock->peer)
+		return;
+
+	peer = ovpn_sock->peer;
+
+	/* restore CBs that were saved in ovpn_sock_set_tcp_cb() */
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
+	sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
+	sock->sk->sk_prot = peer->tcp.sk_cb.prot;
+	rcu_assign_sk_user_data(sock->sk, NULL);
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	/* cancel any ongoing work. Done after removing the CBs so that these
+	 * workers cannot be re-armed
+	 */
+	cancel_work_sync(&peer->tcp.tx_work);
+
+	ptr_ring_cleanup(&ovpn_sock->recv_ring, ovpn_destroy_skb);
+	ptr_ring_cleanup(&peer->tcp.tx_ring, ovpn_destroy_skb);
+}
+
+/* Try to send one skb (or part of it) over the TCP stream.
+ *
+ * Return 0 on success or a negative error code otherwise.
+ *
+ * Note that the skb is modified by putting away the data being sent, therefore
+ * the caller should check if skb->len is zero to understand if the full skb was
+ * sent or not.
+ */
+static int ovpn_tcp_send_one(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	struct kvec iv = { 0 };
+	int ret;
+
+	if (skb_linearize(skb) < 0) {
+		net_err_ratelimited("%s: can't linearize packet\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* initialize iv structure now as skb_linearize() may have changed
+	 * skb->data
+	 */
+	iv.iov_base = skb->data;
+	iv.iov_len = skb->len;
+
+	ret = kernel_sendmsg(peer->sock->sock, &msg, &iv, 1, iv.iov_len);
+	if (ret > 0) {
+		__skb_pull(skb, ret);
+
+		/* since we update per-cpu stats in process context,
+		 * we need to disable softirqs
+		 */
+		local_bh_disable();
+		dev_sw_netstats_tx_add(peer->ovpn->dev, 1, ret);
+		local_bh_enable();
+
+		return 0;
+	}
+
+	return ret;
+}
+
+/* Process packets in TCP TX queue */
+static void ovpn_tcp_tx_work(struct work_struct *work)
+{
+	struct ovpn_peer *peer;
+	struct sk_buff *skb;
+	int ret;
+
+	peer = container_of(work, struct ovpn_peer, tcp.tx_work);
+	while ((skb = __ptr_ring_peek(&peer->tcp.tx_ring))) {
+		ret = ovpn_tcp_send_one(peer, skb);
+		if (ret < 0 && ret != -EAGAIN) {
+			net_warn_ratelimited("%s: cannot send TCP packet to peer %u: %d\n",
+					     __func__, peer->id, ret);
+			/* in case of TCP error stop sending loop and delete
+			 * peer
+			 */
+			ovpn_peer_del(peer,
+				      OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
+			break;
+		} else if (!skb->len) {
+			/* skb was entirely consumed and can now be removed from
+			 * the ring
+			 */
+			__ptr_ring_discard_one(&peer->tcp.tx_ring);
+			consume_skb(skb);
+		}
+
+		/* give a chance to be rescheduled if needed */
+		cond_resched();
+	}
+}
+
+/* Put packet into TCP TX queue and schedule a consumer */
+void ovpn_queue_tcp_skb(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = ptr_ring_produce_bh(&peer->tcp.tx_ring, skb);
+	if (ret < 0) {
+		kfree_skb_list(skb);
+		return;
+	}
+
+	queue_work(peer->ovpn->events_wq, &peer->tcp.tx_work);
+}
+
+/* Set TCP encapsulation callbacks */
+int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
+{
+	void *old_data;
+	int ret;
+
+	INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
+
+	ret = ptr_ring_init(&peer->tcp.tx_ring, OVPN_QUEUE_LEN, GFP_KERNEL);
+	if (ret < 0) {
+		netdev_err(peer->ovpn->dev, "cannot allocate TCP TX ring\n");
+		return ret;
+	}
+
+	peer->tcp.skb = NULL;
+	peer->tcp.offset = 0;
+	peer->tcp.data_len = 0;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+
+	/* make sure no pre-existing encapsulation handler exists */
+	rcu_read_lock();
+	old_data = rcu_dereference_sk_user_data(sock->sk);
+	rcu_read_unlock();
+	if (old_data) {
+		netdev_err(peer->ovpn->dev,
+			   "provided socket already taken by other user\n");
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* sanity check */
+	if (sock->sk->sk_protocol != IPPROTO_TCP) {
+		netdev_err(peer->ovpn->dev,
+			   "provided socket is UDP but expected TCP\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* only a fully connected socket are expected. Connection should be
+	 * handled in userspace
+	 */
+	if (sock->sk->sk_state != TCP_ESTABLISHED) {
+		netdev_err(peer->ovpn->dev,
+			   "provided TCP socket is not in ESTABLISHED state: %d\n",
+			   sock->sk->sk_state);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* save current CBs so that they can be restored upon socket release */
+	peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready;
+	peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space;
+	peer->tcp.sk_cb.prot = sock->sk->sk_prot;
+
+	/* assign our static CBs */
+	sock->sk->sk_data_ready = ovpn_tcp_data_ready;
+	sock->sk->sk_write_space = ovpn_tcp_write_space;
+	sock->sk->sk_prot = &ovpn_tcp_prot;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	return 0;
+err:
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+	ptr_ring_cleanup(&peer->tcp.tx_ring, NULL);
+
+	return ret;
+}
+
+int __init ovpn_tcp_init(void)
+{
+	/* We need to substitute the recvmsg and the sock_is_readable
+	 * callbacks in the sk_prot member of the sock object for TCP
+	 * sockets.
+	 *
+	 * However sock->sk_prot is a pointer to a static variable and
+	 * therefore we can't directly modify it, otherwise every socket
+	 * pointing to it will be affected.
+	 *
+	 * For this reason we create our own static copy and modify what
+	 * we need. Then we make sk_prot point to this copy
+	 * (in ovpn_tcp_socket_attach())
+	 */
+	ovpn_tcp_prot = tcp_prot;
+	ovpn_tcp_prot.recvmsg = ovpn_tcp_recvmsg;
+	ovpn_tcp_prot.sock_is_readable = ovpn_tcp_sock_is_readable;
+
+	return 0;
+}
diff --git a/drivers/net/ovpn/tcp.h b/drivers/net/ovpn/tcp.h
new file mode 100644
index 000000000000..7e73f6e76e6c
--- /dev/null
+++ b/drivers/net/ovpn/tcp.h
@@ -0,0 +1,42 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2019-2024 OpenVPN, Inc.
+ *
+ *  Author:	Antonio Quartulli <antonio@openvpn.net>
+ */
+
+#ifndef _NET_OVPN_TCP_H_
+#define _NET_OVPN_TCP_H_
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "peer.h"
+
+/* Initialize TCP static objects */
+int __init ovpn_tcp_init(void);
+
+void ovpn_queue_tcp_skb(struct ovpn_peer *peer, struct sk_buff *skb);
+
+int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer);
+void ovpn_tcp_socket_detach(struct socket *sock);
+
+/* Prepare skb and enqueue it for sending to peer.
+ *
+ * Preparation consist in prepending the skb payload with its size.
+ * Required by the OpenVPN protocol in order to extract packets from
+ * the TCP stream on the receiver side.
+ */
+static inline void ovpn_tcp_send_skb(struct ovpn_peer *peer,
+				     struct sk_buff *skb)
+{
+	u16 len = skb->len;
+
+	*(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len);
+	ovpn_queue_tcp_skb(peer, skb);
+}
+
+#endif /* _NET_OVPN_TCP_H_ */