[net-next,v3,09/24] ovpn: implement basic TX path (UDP)

Message ID 20240506011637.27272-10-antonio@openvpn.net (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series Introducing OpenVPN Data Channel Offload

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; GEN HAS DIFF 2 files changed, 2613 insertions(+);
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 926 this patch: 926
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: openvpn-devel@lists.sourceforge.net
netdev/build_clang success Errors and warnings before: 937 this patch: 937
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 937 this patch: 937
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 580 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-05-07--03-00 (tests: 1000)

Commit Message

Antonio Quartulli May 6, 2024, 1:16 a.m. UTC
Packets sent over the ovpn interface are processed and transmitted to the
connected peer, if any.

Implementation is UDP only. TCP will be added by a later patch.

Note: no crypto/encapsulation exists yet. Packets are just captured and
sent.

Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
---
 drivers/net/ovpn/io.c         | 174 ++++++++++++++++++++++++++-
 drivers/net/ovpn/io.h         |   2 +
 drivers/net/ovpn/main.c       |   2 +
 drivers/net/ovpn/ovpnstruct.h |   2 +
 drivers/net/ovpn/peer.c       |  37 ++++++
 drivers/net/ovpn/peer.h       |   9 ++
 drivers/net/ovpn/udp.c        | 219 ++++++++++++++++++++++++++++++++++
 drivers/net/ovpn/udp.h        |  14 +++
 8 files changed, 458 insertions(+), 1 deletion(-)

Comments

Sabrina Dubroca May 10, 2024, 1:01 p.m. UTC | #1
2024-05-06, 03:16:22 +0200, Antonio Quartulli wrote:
> diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
> index a420bb45f25f..36cfb95edbf4 100644
> --- a/drivers/net/ovpn/io.c
> +++ b/drivers/net/ovpn/io.c
> @@ -28,6 +30,12 @@ int ovpn_struct_init(struct net_device *dev)
>  
>  	spin_lock_init(&ovpn->lock);
>  
> +	ovpn->crypto_wq = alloc_workqueue("ovpn-crypto-wq-%s",
> +					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 0,
> +					  dev->name);
> +	if (!ovpn->crypto_wq)
> +		return -ENOMEM;
> +
>  	ovpn->events_wq = alloc_workqueue("ovpn-events-wq-%s", WQ_MEM_RECLAIM,
>  					  0, dev->name);
>  	if (!ovpn->events_wq)
>  		return -ENOMEM;

This will leak crypto_wq on failure. You need to roll back all
previous changes when something fails (also if you move all this stuff
into ndo_init).
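
A minimal sketch of the kind of unwinding this implies (hypothetical,
covering only the two workqueues visible in this hunk; any other state the
real function sets up would need the same treatment):

  int ovpn_struct_init(struct net_device *dev)
  {
          struct ovpn_struct *ovpn = netdev_priv(dev);

          spin_lock_init(&ovpn->lock);

          ovpn->crypto_wq = alloc_workqueue("ovpn-crypto-wq-%s",
                                            WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
                                            0, dev->name);
          if (!ovpn->crypto_wq)
                  return -ENOMEM;

          ovpn->events_wq = alloc_workqueue("ovpn-events-wq-%s", WQ_MEM_RECLAIM,
                                            0, dev->name);
          if (!ovpn->events_wq) {
                  /* undo everything allocated before this point */
                  destroy_workqueue(ovpn->crypto_wq);
                  return -ENOMEM;
          }

          return 0;
  }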

> diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
> index 659df320525c..f915afa260c3 100644
> --- a/drivers/net/ovpn/peer.h
> +++ b/drivers/net/ovpn/peer.h
> @@ -22,9 +23,12 @@
>   * @id: unique identifier
>   * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
>   * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
> + * @encrypt_work: work used to process outgoing packets
> + * @decrypt_work: work used to process incoming packets

nit: Only encrypt_work is used in this patch, decrypt_work is for RX


> diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
> index 4b7d96a13df0..f434da76dc0a 100644
> --- a/drivers/net/ovpn/udp.c
> +++ b/drivers/net/ovpn/udp.c
> +/**
> + * ovpn_udp4_output - send IPv4 packet over udp socket
> + * @ovpn: the openvpn instance
> + * @bind: the binding related to the destination peer
> + * @cache: dst cache
> + * @sk: the socket to send the packet over
> + * @skb: the packet to send
> + *
> + * Return: 0 on success or a negative error code otherwise
> + */
> +static int ovpn_udp4_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind,
> +			    struct dst_cache *cache, struct sock *sk,
> +			    struct sk_buff *skb)
> +{
> +	struct rtable *rt;
> +	struct flowi4 fl = {
> +		.saddr = bind->local.ipv4.s_addr,
> +		.daddr = bind->sa.in4.sin_addr.s_addr,
> +		.fl4_sport = inet_sk(sk)->inet_sport,
> +		.fl4_dport = bind->sa.in4.sin_port,
> +		.flowi4_proto = sk->sk_protocol,
> +		.flowi4_mark = sk->sk_mark,
> +	};
> +	int ret;
> +
> +	local_bh_disable();
> +	rt = dst_cache_get_ip4(cache, &fl.saddr);
> +	if (rt)
> +		goto transmit;
> +
> +	if (unlikely(!inet_confirm_addr(sock_net(sk), NULL, 0, fl.saddr,
> +					RT_SCOPE_HOST))) {
> +		/* we may end up here when the cached address is not usable
> +		 * anymore. In this case we reset address/cache and perform a
> +		 * new look up

What exactly are you trying to guard against here? The ipv4 address
used for the last packet being removed from the device/host? I don't
see other tunnels using dst_cache doing this kind of thing (except
wireguard).

> +		 */
> +		fl.saddr = 0;
> +		bind->local.ipv4.s_addr = 0;
> +		dst_cache_reset(cache);
> +	}
> +
> +	rt = ip_route_output_flow(sock_net(sk), &fl, sk);
> +	if (IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) {
> +		fl.saddr = 0;
> +		bind->local.ipv4.s_addr = 0;
> +		dst_cache_reset(cache);
> +
> +		rt = ip_route_output_flow(sock_net(sk), &fl, sk);

Why do you need to repeat the lookup? And why only for ipv4, but not
for ipv6?

> +	}
> +
> +	if (IS_ERR(rt)) {
> +		ret = PTR_ERR(rt);
> +		net_dbg_ratelimited("%s: no route to host %pISpc: %d\n",
> +				    ovpn->dev->name, &bind->sa.in4, ret);
> +		goto err;
> +	}
> +	dst_cache_set_ip4(cache, &rt->dst, fl.saddr);

Overall this looks a whole lot like udp_tunnel_dst_lookup, except for:
 - 2nd lookup
 - inet_confirm_addr/dst_cache_reset

(and there's udp_tunnel6_dst_lookup for ipv6)
Antonio Quartulli May 10, 2024, 1:39 p.m. UTC | #2
On 10/05/2024 15:01, Sabrina Dubroca wrote:
> 2024-05-06, 03:16:22 +0200, Antonio Quartulli wrote:
>> diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
>> index a420bb45f25f..36cfb95edbf4 100644
>> --- a/drivers/net/ovpn/io.c
>> +++ b/drivers/net/ovpn/io.c
>> @@ -28,6 +30,12 @@ int ovpn_struct_init(struct net_device *dev)
>>   
>>   	spin_lock_init(&ovpn->lock);
>>   
>> +	ovpn->crypto_wq = alloc_workqueue("ovpn-crypto-wq-%s",
>> +					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 0,
>> +					  dev->name);
>> +	if (!ovpn->crypto_wq)
>> +		return -ENOMEM;
>> +
>>   	ovpn->events_wq = alloc_workqueue("ovpn-events-wq-%s", WQ_MEM_RECLAIM,
>>   					  0, dev->name);
>>   	if (!ovpn->events_wq)
>>   		return -ENOMEM;
> 
> This will leak crypto_wq on failure. You need to roll back all
> previous changes when something fails (also if you move all this stuff
> into ndo_init).

ouch, good catch! thanks.

> 
>> diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
>> index 659df320525c..f915afa260c3 100644
>> --- a/drivers/net/ovpn/peer.h
>> +++ b/drivers/net/ovpn/peer.h
>> @@ -22,9 +23,12 @@
>>    * @id: unique identifier
>>    * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
>>    * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
>> + * @encrypt_work: work used to process outgoing packets
>> + * @decrypt_work: work used to process incoming packets
> 
> nit: Only encrypt_work is used in this patch, decrypt_work is for RX

Right, same for tx_ring actually. Will move both to the next patch

> 
> 
>> diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
>> index 4b7d96a13df0..f434da76dc0a 100644
>> --- a/drivers/net/ovpn/udp.c
>> +++ b/drivers/net/ovpn/udp.c
>> +/**
>> + * ovpn_udp4_output - send IPv4 packet over udp socket
>> + * @ovpn: the openvpn instance
>> + * @bind: the binding related to the destination peer
>> + * @cache: dst cache
>> + * @sk: the socket to send the packet over
>> + * @skb: the packet to send
>> + *
>> + * Return: 0 on success or a negative error code otherwise
>> + */
>> +static int ovpn_udp4_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind,
>> +			    struct dst_cache *cache, struct sock *sk,
>> +			    struct sk_buff *skb)
>> +{
>> +	struct rtable *rt;
>> +	struct flowi4 fl = {
>> +		.saddr = bind->local.ipv4.s_addr,
>> +		.daddr = bind->sa.in4.sin_addr.s_addr,
>> +		.fl4_sport = inet_sk(sk)->inet_sport,
>> +		.fl4_dport = bind->sa.in4.sin_port,
>> +		.flowi4_proto = sk->sk_protocol,
>> +		.flowi4_mark = sk->sk_mark,
>> +	};
>> +	int ret;
>> +
>> +	local_bh_disable();
>> +	rt = dst_cache_get_ip4(cache, &fl.saddr);
>> +	if (rt)
>> +		goto transmit;
>> +
>> +	if (unlikely(!inet_confirm_addr(sock_net(sk), NULL, 0, fl.saddr,
>> +					RT_SCOPE_HOST))) {
>> +		/* we may end up here when the cached address is not usable
>> +		 * anymore. In this case we reset address/cache and perform a
>> +		 * new look up
> 
> What exactly are you trying to guard against here? The ipv4 address
> used for the last packet being removed from the device/host? I don't
> see other tunnels using dst_cache doing this kind of thing (except
> wireguard).

yes, that's the scenario being checked (which hopefully is what the 
comment conveys).

> 
>> +		 */
>> +		fl.saddr = 0;
>> +		bind->local.ipv4.s_addr = 0;
>> +		dst_cache_reset(cache);
>> +	}
>> +
>> +	rt = ip_route_output_flow(sock_net(sk), &fl, sk);
>> +	if (IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) {
>> +		fl.saddr = 0;
>> +		bind->local.ipv4.s_addr = 0;
>> +		dst_cache_reset(cache);
>> +
>> +		rt = ip_route_output_flow(sock_net(sk), &fl, sk);
> 
> Why do you need to repeat the lookup? And why only for ipv4, but not
> for ipv6?

We are repeating the lookup without the saddr.

The first lookup may have failed because the destination is not 
reachable anymore from that specific source address, but it may be 
reachable from another one (e.g. a routing table change in a multi-homed 
setup).

Why not for v6... that's a good question. I wonder if I should just do the 
same and repeat the lookup with in6addr_any as source. I think it would 
make sense, as we could end up in the same scenario described for IPv4.

What do you think?
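
For illustration, a rough sketch of what that retry could look like in
ovpn_udp6_output, mirroring the IPv4 path (untested, just the idea):

  dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sk), sk, &fl, NULL);
  if (IS_ERR(dst) && PTR_ERR(dst) == -EINVAL) {
          /* retry without a pinned source address: the destination may
           * still be reachable via a different local address
           */
          fl.saddr = in6addr_any;
          bind->local.ipv6 = in6addr_any;
          dst_cache_reset(cache);

          dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sk), sk, &fl, NULL);
  }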

> 
>> +	}
>> +
>> +	if (IS_ERR(rt)) {
>> +		ret = PTR_ERR(rt);
>> +		net_dbg_ratelimited("%s: no route to host %pISpc: %d\n",
>> +				    ovpn->dev->name, &bind->sa.in4, ret);
>> +		goto err;
>> +	}
>> +	dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
> 
> Overall this looks a whole lot like udp_tunnel_dst_lookup, except for:
>   - 2nd lookup
>   - inet_confirm_addr/dst_cache_reset

But why doesn't udp_tunnel_dst_lookup account for cases where the source 
address is not usable anymore? I think those checks are reasonable, no?

Maybe I could still use udp_tunnel_dst_lookup, but call it a second time 
without saddr in case of failure?

> 
> (and there's udp_tunnel6_dst_lookup for ipv6)
>
Sabrina Dubroca May 12, 2024, 9:35 p.m. UTC | #3
2024-05-06, 03:16:22 +0200, Antonio Quartulli wrote:
> +/* send skb to connected peer, if any */
> +static void ovpn_queue_skb(struct ovpn_struct *ovpn, struct sk_buff *skb,
> +			   struct ovpn_peer *peer)
> +{
> +	int ret;
> +
> +	if (likely(!peer))
> +		/* retrieve peer serving the destination IP of this packet */
> +		peer = ovpn_peer_get_by_dst(ovpn, skb);
> +	if (unlikely(!peer)) {
> +		net_dbg_ratelimited("%s: no peer to send data to\n",
> +				    ovpn->dev->name);
> +		goto drop;
> +	}
> +
> +	ret = ptr_ring_produce_bh(&peer->tx_ring, skb);
> +	if (unlikely(ret < 0)) {
> +		net_err_ratelimited("%s: cannot queue packet to TX ring\n",
> +				    peer->ovpn->dev->name);
> +		goto drop;
> +	}
> +
> +	if (!queue_work(ovpn->crypto_wq, &peer->encrypt_work))
> +		ovpn_peer_put(peer);

I wanted to come back to this after going through the crypto patch,
because this felt like a strange construct when I first looked at this
patch.

Why are you using a workqueue here? Based on the kdoc for crypto_wq
("used to schedule crypto work that may sleep during TX/RX"), it's to
deal with async crypto.

If so, why not use the more standard way of dealing with async crypto
in contexts that cannot sleep, ie letting the crypto core call the
"done" callback asynchronously? You need to do all the proper refcount
handling, but IMO it's cleaner and simpler than this workqueue and
ptr_ring. You can see an example of that in macsec (macsec_encrypt_*
in drivers/net/macsec.c).
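
For reference, the usual shape of that pattern, as a rough sketch with
made-up ovpn names (modeled on macsec_encrypt_done, not the actual driver
code):

  /* completion callback invoked by the crypto core once the async
   * encryption has finished; the peer reference is assumed to be stashed
   * in the skb control buffer (ovpn_skb_cb() is a hypothetical helper)
   */
  static void ovpn_encrypt_done(void *data, int err)
  {
          struct sk_buff *skb = data;
          struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer;

          if (likely(!err))
                  ovpn_udp_send_skb(peer->ovpn, peer, skb);
          else
                  kfree_skb(skb);
          ovpn_peer_put(peer);
  }

  /* submission side: no workqueue, the crypto core calls us back */
  aead_request_set_callback(req, 0, ovpn_encrypt_done, skb);
  ret = crypto_aead_encrypt(req);
  if (ret == -EINPROGRESS)
          return;                      /* callback will finish the TX path */
  ovpn_encrypt_done(skb, ret);         /* completed (or failed) synchronously */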
Antonio Quartulli May 13, 2024, 7:37 a.m. UTC | #4
On 12/05/2024 23:35, Sabrina Dubroca wrote:
> 2024-05-06, 03:16:22 +0200, Antonio Quartulli wrote:
>> +/* send skb to connected peer, if any */
>> +static void ovpn_queue_skb(struct ovpn_struct *ovpn, struct sk_buff *skb,
>> +			   struct ovpn_peer *peer)
>> +{
>> +	int ret;
>> +
>> +	if (likely(!peer))
>> +		/* retrieve peer serving the destination IP of this packet */
>> +		peer = ovpn_peer_get_by_dst(ovpn, skb);
>> +	if (unlikely(!peer)) {
>> +		net_dbg_ratelimited("%s: no peer to send data to\n",
>> +				    ovpn->dev->name);
>> +		goto drop;
>> +	}
>> +
>> +	ret = ptr_ring_produce_bh(&peer->tx_ring, skb);
>> +	if (unlikely(ret < 0)) {
>> +		net_err_ratelimited("%s: cannot queue packet to TX ring\n",
>> +				    peer->ovpn->dev->name);
>> +		goto drop;
>> +	}
>> +
>> +	if (!queue_work(ovpn->crypto_wq, &peer->encrypt_work))
>> +		ovpn_peer_put(peer);
> 
> I wanted to come back to this after going through the crypto patch,
> because this felt like a strange construct when I first looked at this
> patch.
> 
> Why are you using a workqueue here? Based on the kdoc for crypto_wq
> ("used to schedule crypto work that may sleep during TX/RX"), it's to
> deal with async crypto.
> 
> If so, why not use the more standard way of dealing with async crypto
> in contexts that cannot sleep, ie letting the crypto core call the
> "done" callback asynchronously? You need to do all the proper refcount
> handling, but IMO it's cleaner and simpler than this workqueue and
> ptr_ring. You can see an example of that in macsec (macsec_encrypt_*
> in drivers/net/macsec.c).

Aha! You don't know how happy I was when I found the doc describing how 
to convert the async code into sync-looking :-) With the detail that I 
had to move to a different context, as the code may want to sleep (hence 
the introduction of the workqueue).

It looks like I am quite a fan of WQs, while you are telling me to avoid 
them if possible.

I presume that using WQs comes with a non-negligible cost, therefore if 
we can just get things done without having to use them, then I just 
shouldn't.

I think I could go back to no-workqueue encrypt/decrypt.
Do you think this may have any impact on any future multi-core 
optimization? Back then I also thought that going through workers may 
make improvements in this area easier. But I could just be wrong.

Regards,
Sabrina Dubroca May 13, 2024, 9:36 a.m. UTC | #5
2024-05-13, 09:37:06 +0200, Antonio Quartulli wrote:
> On 12/05/2024 23:35, Sabrina Dubroca wrote:
> > 2024-05-06, 03:16:22 +0200, Antonio Quartulli wrote:
> > > +/* send skb to connected peer, if any */
> > > +static void ovpn_queue_skb(struct ovpn_struct *ovpn, struct sk_buff *skb,
> > > +			   struct ovpn_peer *peer)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (likely(!peer))
> > > +		/* retrieve peer serving the destination IP of this packet */
> > > +		peer = ovpn_peer_get_by_dst(ovpn, skb);
> > > +	if (unlikely(!peer)) {
> > > +		net_dbg_ratelimited("%s: no peer to send data to\n",
> > > +				    ovpn->dev->name);
> > > +		goto drop;
> > > +	}
> > > +
> > > +	ret = ptr_ring_produce_bh(&peer->tx_ring, skb);
> > > +	if (unlikely(ret < 0)) {
> > > +		net_err_ratelimited("%s: cannot queue packet to TX ring\n",
> > > +				    peer->ovpn->dev->name);
> > > +		goto drop;
> > > +	}
> > > +
> > > +	if (!queue_work(ovpn->crypto_wq, &peer->encrypt_work))
> > > +		ovpn_peer_put(peer);
> > 
> > I wanted to come back to this after going through the crypto patch,
> > because this felt like a strange construct when I first looked at this
> > patch.
> > 
> > Why are you using a workqueue here? Based on the kdoc for crypto_wq
> > ("used to schedule crypto work that may sleep during TX/RX"), it's to
> > deal with async crypto.
> > 
> > If so, why not use the more standard way of dealing with async crypto
> > in contexts that cannot sleep, ie letting the crypto core call the
> > "done" callback asynchronously? You need to do all the proper refcount
> > handling, but IMO it's cleaner and simpler than this workqueue and
> > ptr_ring. You can see an example of that in macsec (macsec_encrypt_*
> > in drivers/net/macsec.c).
> 
> Aha! You don't know how happy I was when I found the doc describing how to
> convert the async code into sync-looking :-) With the detail that I had to
> move to a different context, as the code may want to sleep (hence the
> introduction of the workqueue).
> 
> It looks like I am quite a fan of WQs, while you are telling me to avoid them
> if possible.

I'm mainly trying to simplify the code (get rid of some ptr_rings, get
rid of some ping-pong between functions and changes of context,
etc). And here, I'm also trying to make it look more like other
similar pieces of code, because I'm already familiar with a few kernel
implementations of protocols doing crypto (macsec, ipsec, tls).

> I presume that using WQs comes with a non-negligible cost, therefore if we
> can just get things done without having to use them, then I just
> shouldn't.

If you're using AESNI for your GCM implementation, the crypto API will
also be using a workqueue (see crypto/cryptd.c), but only when the
crypto can't be done immediately (ie, when the FPU is already busy).

In the case of crypto accelerators, there might be benefits from
queueing multiple requests and then letting them live their life,
instead of waiting for each request separately. I don't have access to
that HW so I cannot test this.

> I think I could go back to no-workqueue encrypt/decrypt.
> Do you think this may have any impact on any future multi-core optimization?
> Back then I also thought that going through workers may make improvements in
> this area easier. But I could just be wrong.

Without thinking about it too deeply, the workqueue looks more like a
bottleneck that a no-workqueue approach just wouldn't have. You would
probably need per-CPU WQs (probably separated for encrypt and
decrypt). cryptd_enqueue_request (crypto/cryptd.c) has an example of
that.
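
Roughly the shape cryptd uses, with illustrative ovpn names (a sketch, not
a concrete proposal):

  struct ovpn_cpu_queue {
          struct ptr_ring ring;
          struct work_struct work;
  };

  static DEFINE_PER_CPU(struct ovpn_cpu_queue, ovpn_tx_queue);

  static int ovpn_enqueue_skb(struct workqueue_struct *wq, struct sk_buff *skb)
  {
          struct ovpn_cpu_queue *q;
          int ret;

          local_bh_disable();
          q = this_cpu_ptr(&ovpn_tx_queue);
          ret = ptr_ring_produce(&q->ring, skb);
          if (!ret)
                  /* keep the work on the CPU that enqueued the packet */
                  queue_work_on(smp_processor_id(), wq, &q->work);
          local_bh_enable();

          return ret;
  }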
Antonio Quartulli May 13, 2024, 9:47 a.m. UTC | #6
On 13/05/2024 11:36, Sabrina Dubroca wrote:
> 2024-05-13, 09:37:06 +0200, Antonio Quartulli wrote:
>> On 12/05/2024 23:35, Sabrina Dubroca wrote:
>>> 2024-05-06, 03:16:22 +0200, Antonio Quartulli wrote:
>>>> +/* send skb to connected peer, if any */
>>>> +static void ovpn_queue_skb(struct ovpn_struct *ovpn, struct sk_buff *skb,
>>>> +			   struct ovpn_peer *peer)
>>>> +{
>>>> +	int ret;
>>>> +
>>>> +	if (likely(!peer))
>>>> +		/* retrieve peer serving the destination IP of this packet */
>>>> +		peer = ovpn_peer_get_by_dst(ovpn, skb);
>>>> +	if (unlikely(!peer)) {
>>>> +		net_dbg_ratelimited("%s: no peer to send data to\n",
>>>> +				    ovpn->dev->name);
>>>> +		goto drop;
>>>> +	}
>>>> +
>>>> +	ret = ptr_ring_produce_bh(&peer->tx_ring, skb);
>>>> +	if (unlikely(ret < 0)) {
>>>> +		net_err_ratelimited("%s: cannot queue packet to TX ring\n",
>>>> +				    peer->ovpn->dev->name);
>>>> +		goto drop;
>>>> +	}
>>>> +
>>>> +	if (!queue_work(ovpn->crypto_wq, &peer->encrypt_work))
>>>> +		ovpn_peer_put(peer);
>>>
>>> I wanted to come back to this after going through the crypto patch,
>>> because this felt like a strange construct when I first looked at this
>>> patch.
>>>
>>> Why are you using a workqueue here? Based on the kdoc for crypto_wq
>>> ("used to schedule crypto work that may sleep during TX/RX"), it's to
>>> deal with async crypto.
>>>
>>> If so, why not use the more standard way of dealing with async crypto
>>> in contexts that cannot sleep, ie letting the crypto core call the
>>> "done" callback asynchronously? You need to do all the proper refcount
>>> handling, but IMO it's cleaner and simpler than this workqueue and
>>> ptr_ring. You can see an example of that in macsec (macsec_encrypt_*
>>> in drivers/net/macsec.c).
>>
>> Aha! You don't know how happy I was when I found the doc describing how to
>> convert the async code into sync-looking :-) With the detail that I had to
>> move to a different context, as the code may want to sleep (hence the
>> introduction of the workqueue).
>>
>> It looks like I am quite a fan of WQs, while you are telling me to avoid them
>> if possible.
> 
> I'm mainly trying to simplify the code (get rid of some ptr_rings, get
> rid of some ping-pong between functions and changes of context,
> etc). And here, I'm also trying to make it look more like other
> similar pieces of code, because I'm already familiar with a few kernel
> implementations of protocols doing crypto (macsec, ipsec, tls).

Thanks for that, you already helped me get rid of several constructs 
that weren't really needed.

> 
>> I presume that using WQs comes with a non-negligible cost, therefore if we
>> can just get things done without having to use them, then I just
>> shouldn't.
> 
> If you're using AESNI for your GCM implementation, the crypto API will
> also be using a workqueue (see crypto/cryptd.c), but only when the
> crypto can't be done immediately (ie, when the FPU is already busy).

Right.

> 
> In the case of crypto accelerators, there might be benefits from
> queueing multiple requests and then letting them live their life,
> instead of waiting for each request separately. I don't have access to
> that HW so I cannot test this.
> 
>> I think I could go back to no-workqueue encrypt/decrypt.
>> Do you think this may have any impact on any future multi-core optimization?
>> Back then I also thought that going through workers may make improvements in
>> this area easier. But I could just be wrong.
> 
> Without thinking about it too deeply, the workqueue looks more like a
> bottleneck that a no-workqueue approach just wouldn't have. You would
> probably need per-CPU WQs (probably separated for encrypt and
> decrypt). cryptd_enqueue_request (crypto/cryptd.c) has an example of
> that.

Ok. Sounds like something we can re-consider later on.

Let's start by killing the wq in the current code.

Thanks for the pointers.

Patch

diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index a420bb45f25f..36cfb95edbf4 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -9,11 +9,13 @@ 
 
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <net/gso.h>
 
 #include "io.h"
 #include "ovpnstruct.h"
 #include "netlink.h"
 #include "peer.h"
+#include "udp.h"
 
 int ovpn_struct_init(struct net_device *dev)
 {
@@ -28,6 +30,12 @@  int ovpn_struct_init(struct net_device *dev)
 
 	spin_lock_init(&ovpn->lock);
 
+	ovpn->crypto_wq = alloc_workqueue("ovpn-crypto-wq-%s",
+					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 0,
+					  dev->name);
+	if (!ovpn->crypto_wq)
+		return -ENOMEM;
+
 	ovpn->events_wq = alloc_workqueue("ovpn-events-wq-%s", WQ_MEM_RECLAIM,
 					  0, dev->name);
 	if (!ovpn->events_wq)
@@ -40,11 +48,175 @@  int ovpn_struct_init(struct net_device *dev)
 	return 0;
 }
 
+static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+	return true;
+}
+
+/* Process packets in TX queue in a transport-specific way.
+ *
+ * UDP transport - encrypt and send across the tunnel.
+ */
+void ovpn_encrypt_work(struct work_struct *work)
+{
+	struct sk_buff *skb, *curr, *next;
+	struct ovpn_peer *peer;
+
+	peer = container_of(work, struct ovpn_peer, encrypt_work);
+	while ((skb = ptr_ring_consume_bh(&peer->tx_ring))) {
+		/* this might be a GSO-segmented skb list: process each skb
+		 * independently
+		 */
+		skb_list_walk_safe(skb, curr, next) {
+			/* if one segment fails encryption, we drop the entire
+			 * packet, because it does not really make sense to send
+			 * only part of it at this point
+			 */
+			if (unlikely(!ovpn_encrypt_one(peer, curr))) {
+				kfree_skb_list(skb);
+				skb = NULL;
+				break;
+			}
+		}
+
+		/* successful encryption */
+		if (likely(skb)) {
+			skb_list_walk_safe(skb, curr, next) {
+				skb_mark_not_on_list(curr);
+
+				switch (peer->sock->sock->sk->sk_protocol) {
+				case IPPROTO_UDP:
+					ovpn_udp_send_skb(peer->ovpn, peer,
+							  curr);
+					break;
+				default:
+					/* no transport configured yet */
+					consume_skb(skb);
+					break;
+				}
+			}
+		}
+
+		/* give a chance to be rescheduled if needed */
+		cond_resched();
+	}
+	ovpn_peer_put(peer);
+}
+
+/* send skb to connected peer, if any */
+static void ovpn_queue_skb(struct ovpn_struct *ovpn, struct sk_buff *skb,
+			   struct ovpn_peer *peer)
+{
+	int ret;
+
+	if (likely(!peer))
+		/* retrieve peer serving the destination IP of this packet */
+		peer = ovpn_peer_get_by_dst(ovpn, skb);
+	if (unlikely(!peer)) {
+		net_dbg_ratelimited("%s: no peer to send data to\n",
+				    ovpn->dev->name);
+		goto drop;
+	}
+
+	ret = ptr_ring_produce_bh(&peer->tx_ring, skb);
+	if (unlikely(ret < 0)) {
+		net_err_ratelimited("%s: cannot queue packet to TX ring\n",
+				    peer->ovpn->dev->name);
+		goto drop;
+	}
+
+	if (!queue_work(ovpn->crypto_wq, &peer->encrypt_work))
+		ovpn_peer_put(peer);
+
+	return;
+drop:
+	if (peer)
+		ovpn_peer_put(peer);
+	kfree_skb_list(skb);
+}
+
+/* Return IP protocol version from skb header.
+ * Return 0 if protocol is not IPv4/IPv6 or cannot be read.
+ */
+static __be16 ovpn_ip_check_protocol(struct sk_buff *skb)
+{
+	__be16 proto = 0;
+
+	/* skb could be non-linear, make sure IP header is in non-fragmented
+	 * part
+	 */
+	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+		return 0;
+
+	if (ip_hdr(skb)->version == 4)
+		proto = htons(ETH_P_IP);
+	else if (ip_hdr(skb)->version == 6)
+		proto = htons(ETH_P_IPV6);
+
+	return proto;
+}
+
 /* Send user data to the network
  */
 netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	struct ovpn_struct *ovpn = netdev_priv(dev);
+	struct sk_buff *segments, *tmp, *curr, *next;
+	struct sk_buff_head skb_list;
+	__be16 proto;
+	int ret;
+
+	/* reset netfilter state */
+	nf_reset_ct(skb);
+
+	/* verify IP header size in network packet */
+	proto = ovpn_ip_check_protocol(skb);
+	if (unlikely(!proto || skb->protocol != proto)) {
+		net_err_ratelimited("%s: dropping malformed payload packet\n",
+				    dev->name);
+		goto drop;
+	}
+
+	if (skb_is_gso(skb)) {
+		segments = skb_gso_segment(skb, 0);
+		if (IS_ERR(segments)) {
+			ret = PTR_ERR(segments);
+			net_err_ratelimited("%s: cannot segment packet: %d\n",
+					    dev->name, ret);
+			goto drop;
+		}
+
+		consume_skb(skb);
+		skb = segments;
+	}
+
+	/* from this moment on, "skb" might be a list */
+
+	__skb_queue_head_init(&skb_list);
+	skb_list_walk_safe(skb, curr, next) {
+		skb_mark_not_on_list(curr);
+
+		tmp = skb_share_check(curr, GFP_ATOMIC);
+		if (unlikely(!tmp)) {
+			kfree_skb_list(next);
+			net_err_ratelimited("%s: skb_share_check failed\n",
+					    dev->name);
+			goto drop_list;
+		}
+
+		__skb_queue_tail(&skb_list, tmp);
+	}
+	skb_list.prev->next = NULL;
+
+	ovpn_queue_skb(ovpn, skb_list.next, NULL);
+
+	return NETDEV_TX_OK;
+
+drop_list:
+	skb_queue_walk_safe(&skb_list, curr, next)
+		kfree_skb(curr);
+drop:
 	skb_tx_error(skb);
-	kfree_skb(skb);
+	kfree_skb_list(skb);
 	return NET_XMIT_DROP;
 }
diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h
index 61a2485e16b5..171e87f584b6 100644
--- a/drivers/net/ovpn/io.h
+++ b/drivers/net/ovpn/io.h
@@ -19,4 +19,6 @@ 
 int ovpn_struct_init(struct net_device *dev);
 netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev);
 
+void ovpn_encrypt_work(struct work_struct *work);
+
 #endif /* _NET_OVPN_OVPN_H_ */
diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index dba35ecb236b..9ae9844dd281 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -39,7 +39,9 @@  static void ovpn_struct_free(struct net_device *net)
 	rtnl_unlock();
 
 	free_percpu(net->tstats);
+	flush_workqueue(ovpn->crypto_wq);
 	flush_workqueue(ovpn->events_wq);
+	destroy_workqueue(ovpn->crypto_wq);
 	destroy_workqueue(ovpn->events_wq);
 	rcu_barrier();
 }
diff --git a/drivers/net/ovpn/ovpnstruct.h b/drivers/net/ovpn/ovpnstruct.h
index b79d4f0474b0..7414c2459fb9 100644
--- a/drivers/net/ovpn/ovpnstruct.h
+++ b/drivers/net/ovpn/ovpnstruct.h
@@ -18,6 +18,7 @@ 
  * @registered: whether dev is still registered with netdev or not
  * @mode: device operation mode (i.e. p2p, mp, ..)
  * @lock: protect this object
+ * @crypto_wq: used to schedule crypto work that may sleep during TX/RX
  * @event_wq: used to schedule generic events that may sleep and that need to be
  *            performed outside of softirq context
  * @peer: in P2P mode, this is the only remote peer
@@ -28,6 +29,7 @@  struct ovpn_struct {
 	bool registered;
 	enum ovpn_mode mode;
 	spinlock_t lock; /* protect writing to the ovpn_struct object */
+	struct workqueue_struct *crypto_wq;
 	struct workqueue_struct *events_wq;
 	struct ovpn_peer __rcu *peer;
 	struct list_head dev_list;
diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c
index 2948b7320d47..f023f919b75d 100644
--- a/drivers/net/ovpn/peer.c
+++ b/drivers/net/ovpn/peer.c
@@ -39,6 +39,8 @@  struct ovpn_peer *ovpn_peer_new(struct ovpn_struct *ovpn, u32 id)
 	spin_lock_init(&peer->lock);
 	kref_init(&peer->refcount);
 
+	INIT_WORK(&peer->encrypt_work, ovpn_encrypt_work);
+
 	ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL);
 	if (ret < 0) {
 		netdev_err(ovpn->dev, "%s: cannot initialize dst cache\n",
@@ -119,6 +121,9 @@  static void ovpn_peer_release_rcu(struct rcu_head *head)
 
 void ovpn_peer_release(struct ovpn_peer *peer)
 {
+	if (peer->sock)
+		ovpn_socket_put(peer->sock);
+
 	call_rcu(&peer->rcu, ovpn_peer_release_rcu);
 }
 
@@ -288,6 +293,38 @@  struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_struct *ovpn, u32 peer_id)
 	return peer;
 }
 
+/**
+ * ovpn_peer_get_by_dst - Lookup peer to send skb to
+ * @ovpn: the private data representing the current VPN session
+ * @skb: the skb to extract the destination address from
+ *
+ * This function takes a tunnel packet and looks up the peer to send it to
+ * after encapsulation. The skb is expected to be the in-tunnel packet, without
+ * any OpenVPN related header.
+ *
+ * Assume that the IP header is accessible in the skb data.
+ *
+ * Return: the peer if found or NULL otherwise.
+ */
+struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_struct *ovpn,
+				       struct sk_buff *skb)
+{
+	struct ovpn_peer *tmp, *peer = NULL;
+
+	/* in P2P mode, no matter the destination, packets are always sent to
+	 * the single peer listening on the other side
+	 */
+	if (ovpn->mode == OVPN_MODE_P2P) {
+		rcu_read_lock();
+		tmp = rcu_dereference(ovpn->peer);
+		if (likely(tmp && ovpn_peer_hold(tmp)))
+			peer = tmp;
+		rcu_read_unlock();
+	}
+
+	return peer;
+}
+
 /**
  * ovpn_peer_add_p2p - add per to related tables in a P2P instance
  * @ovpn: the instance to add the peer to
diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
index 659df320525c..f915afa260c3 100644
--- a/drivers/net/ovpn/peer.h
+++ b/drivers/net/ovpn/peer.h
@@ -11,6 +11,7 @@ 
 #define _NET_OVPN_OVPNPEER_H_
 
 #include "bind.h"
+#include "socket.h"
 
 #include <linux/ptr_ring.h>
 #include <net/dst_cache.h>
@@ -22,9 +23,12 @@ 
  * @id: unique identifier
  * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
  * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
+ * @encrypt_work: work used to process outgoing packets
+ * @decrypt_work: work used to process incoming packets
  * @tx_ring: queue of outgoing poackets to this peer
  * @rx_ring: queue of incoming packets from this peer
  * @netif_rx_ring: queue of packets to be sent to the netdevice via NAPI
+ * @sock: the socket being used to talk to this peer
  * @dst_cache: cache for dst_entry used to send to peer
  * @bind: remote peer binding
  * @halt: true if ovpn_peer_mark_delete was called
@@ -41,9 +45,12 @@  struct ovpn_peer {
 		struct in_addr ipv4;
 		struct in6_addr ipv6;
 	} vpn_addrs;
+	struct work_struct encrypt_work;
+	struct work_struct decrypt_work;
 	struct ptr_ring tx_ring;
 	struct ptr_ring rx_ring;
 	struct ptr_ring netif_rx_ring;
+	struct ovpn_socket *sock;
 	struct dst_cache dst_cache;
 	struct ovpn_bind __rcu *bind;
 	bool halt;
@@ -148,5 +155,7 @@  struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_struct *ovpn,
  * Return: a pointer to the peer if found or NULL otherwise
  */
 struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_struct *ovpn, u32 peer_id);
+struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_struct *ovpn,
+				       struct sk_buff *skb);
 
 #endif /* _NET_OVPN_OVPNPEER_H_ */
diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
index 4b7d96a13df0..f434da76dc0a 100644
--- a/drivers/net/ovpn/udp.c
+++ b/drivers/net/ovpn/udp.c
@@ -7,13 +7,232 @@ 
  */
 
 #include <linux/netdevice.h>
+#include <linux/inetdevice.h>
 #include <linux/socket.h>
+#include <net/addrconf.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#include <net/ipv6_stubs.h>
+#include <net/udp_tunnel.h>
 
 #include "ovpnstruct.h"
 #include "main.h"
+#include "bind.h"
+#include "io.h"
+#include "peer.h"
 #include "socket.h"
 #include "udp.h"
 
+/**
+ * ovpn_udp4_output - send IPv4 packet over udp socket
+ * @ovpn: the openvpn instance
+ * @bind: the binding related to the destination peer
+ * @cache: dst cache
+ * @sk: the socket to send the packet over
+ * @skb: the packet to send
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+static int ovpn_udp4_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind,
+			    struct dst_cache *cache, struct sock *sk,
+			    struct sk_buff *skb)
+{
+	struct rtable *rt;
+	struct flowi4 fl = {
+		.saddr = bind->local.ipv4.s_addr,
+		.daddr = bind->sa.in4.sin_addr.s_addr,
+		.fl4_sport = inet_sk(sk)->inet_sport,
+		.fl4_dport = bind->sa.in4.sin_port,
+		.flowi4_proto = sk->sk_protocol,
+		.flowi4_mark = sk->sk_mark,
+	};
+	int ret;
+
+	local_bh_disable();
+	rt = dst_cache_get_ip4(cache, &fl.saddr);
+	if (rt)
+		goto transmit;
+
+	if (unlikely(!inet_confirm_addr(sock_net(sk), NULL, 0, fl.saddr,
+					RT_SCOPE_HOST))) {
+		/* we may end up here when the cached address is not usable
+		 * anymore. In this case we reset address/cache and perform a
+		 * new look up
+		 */
+		fl.saddr = 0;
+		bind->local.ipv4.s_addr = 0;
+		dst_cache_reset(cache);
+	}
+
+	rt = ip_route_output_flow(sock_net(sk), &fl, sk);
+	if (IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) {
+		fl.saddr = 0;
+		bind->local.ipv4.s_addr = 0;
+		dst_cache_reset(cache);
+
+		rt = ip_route_output_flow(sock_net(sk), &fl, sk);
+	}
+
+	if (IS_ERR(rt)) {
+		ret = PTR_ERR(rt);
+		net_dbg_ratelimited("%s: no route to host %pISpc: %d\n",
+				    ovpn->dev->name, &bind->sa.in4, ret);
+		goto err;
+	}
+	dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
+
+transmit:
+	udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0,
+			    ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport,
+			    fl.fl4_dport, false, sk->sk_no_check_tx);
+	ret = 0;
+err:
+	local_bh_enable();
+	return ret;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * ovpn_udp6_output - send IPv6 packet over udp socket
+ * @ovpn: the openvpn instance
+ * @bind: the binding related to the destination peer
+ * @cache: dst cache
+ * @sk: the socket to send the packet over
+ * @skb: the packet to send
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+static int ovpn_udp6_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind,
+			    struct dst_cache *cache, struct sock *sk,
+			    struct sk_buff *skb)
+{
+	struct dst_entry *dst;
+	int ret;
+
+	struct flowi6 fl = {
+		.saddr = bind->local.ipv6,
+		.daddr = bind->sa.in6.sin6_addr,
+		.fl6_sport = inet_sk(sk)->inet_sport,
+		.fl6_dport = bind->sa.in6.sin6_port,
+		.flowi6_proto = sk->sk_protocol,
+		.flowi6_mark = sk->sk_mark,
+		.flowi6_oif = bind->sa.in6.sin6_scope_id,
+	};
+
+	local_bh_disable();
+	dst = dst_cache_get_ip6(cache, &fl.saddr);
+	if (dst)
+		goto transmit;
+
+	if (unlikely(!ipv6_chk_addr(sock_net(sk), &fl.saddr, NULL, 0))) {
+		/* we may end up here when the cached address is not usable
+		 * anymore. In this case we reset address/cache and perform a
+		 * new look up
+		 */
+		fl.saddr = in6addr_any;
+		bind->local.ipv6 = in6addr_any;
+		dst_cache_reset(cache);
+	}
+
+	dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sk), sk, &fl, NULL);
+	if (IS_ERR(dst)) {
+		ret = PTR_ERR(dst);
+		net_dbg_ratelimited("%s: no route to host %pISpc: %d\n",
+				    ovpn->dev->name, &bind->sa.in6, ret);
+		goto err;
+	}
+	dst_cache_set_ip6(cache, dst, &fl.saddr);
+
+transmit:
+	udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0,
+			     ip6_dst_hoplimit(dst), 0, fl.fl6_sport,
+			     fl.fl6_dport, udp_get_no_check6_tx(sk));
+	ret = 0;
+err:
+	local_bh_enable();
+	return ret;
+}
+#endif
+
+/**
+ * ovpn_udp_output - transmit skb using udp-tunnel
+ * @ovpn: the openvpn instance
+ * @bind: the binding related to the destination peer
+ * @cache: dst cache
+ * @sk: the socket to send the packet over
+ * @skb: the packet to send
+ *
+ * rcu_read_lock should be held on entry.
+ * On return, the skb is consumed.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+static int ovpn_udp_output(struct ovpn_struct *ovpn, struct ovpn_bind *bind,
+			   struct dst_cache *cache, struct sock *sk,
+			   struct sk_buff *skb)
+{
+	int ret;
+
+	/* set sk to null if skb is already orphaned */
+	if (!skb->destructor)
+		skb->sk = NULL;
+
+	/* always permit openvpn-created packets to be (outside) fragmented */
+	skb->ignore_df = 1;
+
+	switch (bind->sa.in4.sin_family) {
+	case AF_INET:
+		ret = ovpn_udp4_output(ovpn, bind, cache, sk, skb);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		ret = ovpn_udp6_output(ovpn, bind, cache, sk, skb);
+		break;
+#endif
+	default:
+		ret = -EAFNOSUPPORT;
+		break;
+	}
+
+	return ret;
+}
+
+void ovpn_udp_send_skb(struct ovpn_struct *ovpn, struct ovpn_peer *peer,
+		       struct sk_buff *skb)
+{
+	struct ovpn_bind *bind;
+	struct socket *sock;
+	int ret = -1;
+
+	skb->dev = ovpn->dev;
+	/* no checksum performed at this layer */
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* get socket info */
+	sock = peer->sock->sock;
+	if (unlikely(!sock)) {
+		net_warn_ratelimited("%s: no sock for remote peer\n", __func__);
+		goto out;
+	}
+
+	rcu_read_lock();
+	/* get binding */
+	bind = rcu_dereference(peer->bind);
+	if (unlikely(!bind)) {
+		net_warn_ratelimited("%s: no bind for remote peer\n", __func__);
+		goto out_unlock;
+	}
+
+	/* crypto layer -> transport (UDP) */
+	ret = ovpn_udp_output(ovpn, bind, &peer->dst_cache, sock->sk, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	if (ret < 0)
+		kfree_skb(skb);
+}
+
 int ovpn_udp_socket_attach(struct socket *sock, struct ovpn_struct *ovpn)
 {
 	struct ovpn_socket *old_data;
diff --git a/drivers/net/ovpn/udp.h b/drivers/net/ovpn/udp.h
index 16422a649cb9..f4eb1e63e103 100644
--- a/drivers/net/ovpn/udp.h
+++ b/drivers/net/ovpn/udp.h
@@ -9,7 +9,12 @@ 
 #ifndef _NET_OVPN_UDP_H_
 #define _NET_OVPN_UDP_H_
 
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+struct ovpn_peer;
 struct ovpn_struct;
+struct sk_buff;
 struct socket;
 
 /**
@@ -24,4 +29,13 @@  struct socket;
  */
 int ovpn_udp_socket_attach(struct socket *sock, struct ovpn_struct *ovpn);
 
+/**
+ * ovpn_udp_send_skb - prepare skb and send it over via UDP
+ * @ovpn: the openvpn instance
+ * @peer: the destination peer
+ * @skb: the packet to send
+ */
+void ovpn_udp_send_skb(struct ovpn_struct *ovpn, struct ovpn_peer *peer,
+		       struct sk_buff *skb);
+
 #endif /* _NET_OVPN_UDP_H_ */