diff mbox series

[net-next,v3,04/24] ovpn: add basic interface creation/destruction/management routines

Message ID 20240506011637.27272-5-antonio@openvpn.net (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series Introducing OpenVPN Data Channel Offload | expand

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; GEN HAS DIFF 2 files changed, 2613 insertions(+);
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 932 this patch: 932
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: openvpn-devel@lists.sourceforge.net
netdev/build_clang success Errors and warnings before: 938 this patch: 938
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 944 this patch: 944
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-05-07--03-00 (tests: 1000)

Commit Message

Antonio Quartulli May 6, 2024, 1:16 a.m. UTC
Add basic infrastructure for handling ovpn interfaces.

Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
---
 drivers/net/ovpn/io.c         |  20 ++++
 drivers/net/ovpn/io.h         |   7 ++
 drivers/net/ovpn/main.c       | 183 +++++++++++++++++++++++++++++++++-
 drivers/net/ovpn/main.h       |  31 ++++++
 drivers/net/ovpn/ovpnstruct.h |   8 ++
 drivers/net/ovpn/packet.h     |  40 ++++++++
 6 files changed, 285 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ovpn/packet.h

Comments

Jakub Kicinski May 8, 2024, 12:18 a.m. UTC | #1
On Mon,  6 May 2024 03:16:17 +0200 Antonio Quartulli wrote:

> diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
> index ad3813419c33..338e99dfe886 100644
> --- a/drivers/net/ovpn/io.c
> +++ b/drivers/net/ovpn/io.c
> @@ -11,6 +11,26 @@
>  #include <linux/skbuff.h>
>  
>  #include "io.h"
> +#include "ovpnstruct.h"
> +#include "netlink.h"
> +
> +int ovpn_struct_init(struct net_device *dev)
> +{
> +	struct ovpn_struct *ovpn = netdev_priv(dev);
> +	int err;
> +
> +	ovpn->dev = dev;
> +
> +	err = ovpn_nl_init(ovpn);
> +	if (err < 0)
> +		return err;
> +
> +	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);

Set pcpu_stat_type, core will allocate for you

> +	if (!dev->tstats)
> +		return -ENOMEM;
> +
> +	return 0;
> +}

> +/**
> + * ovpn_struct_init - Initialize the netdevice private area
> + * @dev: the device to initialize
> + *
> + * Return: 0 on success or a negative error code otherwise
> + */
> +int ovpn_struct_init(struct net_device *dev);

Weak preference for kdoc to go with the implementation, not declaration.

> +static const struct net_device_ops ovpn_netdev_ops = {
> +	.ndo_open		= ovpn_net_open,
> +	.ndo_stop		= ovpn_net_stop,
> +	.ndo_start_xmit		= ovpn_net_xmit,
> +	.ndo_get_stats64        = dev_get_tstats64,

Core should count pcpu stats automatically

> +};
> +
>  bool ovpn_dev_is_valid(const struct net_device *dev)
>  {
>  	return dev->netdev_ops->ndo_start_xmit == ovpn_net_xmit;
>  }

> +	list_add(&ovpn->dev_list, &dev_list);
> +	rtnl_unlock();
> +
> +	/* turn carrier explicitly off after registration, this way state is
> +	 * clearly defined
> +	 */
> +	netif_carrier_off(dev);

carrier off inside the locked section, user can call open
immediately after unlock
Antonio Quartulli May 8, 2024, 7:53 a.m. UTC | #2
On 08/05/2024 02:18, Jakub Kicinski wrote:
> On Mon,  6 May 2024 03:16:17 +0200 Antonio Quartulli wrote:
> 
>> diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
>> index ad3813419c33..338e99dfe886 100644
>> --- a/drivers/net/ovpn/io.c
>> +++ b/drivers/net/ovpn/io.c
>> @@ -11,6 +11,26 @@
>>   #include <linux/skbuff.h>
>>   
>>   #include "io.h"
>> +#include "ovpnstruct.h"
>> +#include "netlink.h"
>> +
>> +int ovpn_struct_init(struct net_device *dev)
>> +{
>> +	struct ovpn_struct *ovpn = netdev_priv(dev);
>> +	int err;
>> +
>> +	ovpn->dev = dev;
>> +
>> +	err = ovpn_nl_init(ovpn);
>> +	if (err < 0)
>> +		return err;
>> +
>> +	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
> 
> Set pcpu_stat_type, core will allocate for you

ok

> 
>> +	if (!dev->tstats)
>> +		return -ENOMEM;
>> +
>> +	return 0;
>> +}
> 
>> +/**
>> + * ovpn_struct_init - Initialize the netdevice private area
>> + * @dev: the device to initialize
>> + *
>> + * Return: 0 on success or a negative error code otherwise
>> + */
>> +int ovpn_struct_init(struct net_device *dev);
> 
> Weak preference for kdoc to go with the implementation, not declaration.

oh ok - this wasn't clear.
Will move the kdoc next to the implementation.

> 
>> +static const struct net_device_ops ovpn_netdev_ops = {
>> +	.ndo_open		= ovpn_net_open,
>> +	.ndo_stop		= ovpn_net_stop,
>> +	.ndo_start_xmit		= ovpn_net_xmit,
>> +	.ndo_get_stats64        = dev_get_tstats64,
> 
> Core should count pcpu stats automatically

Thanks for pointing this out.
I see dev_get_stats() takes care of all this for us.

> 
>> +};
>> +
>>   bool ovpn_dev_is_valid(const struct net_device *dev)
>>   {
>>   	return dev->netdev_ops->ndo_start_xmit == ovpn_net_xmit;
>>   }
> 
>> +	list_add(&ovpn->dev_list, &dev_list);
>> +	rtnl_unlock();
>> +
>> +	/* turn carrier explicitly off after registration, this way state is
>> +	 * clearly defined
>> +	 */
>> +	netif_carrier_off(dev);
> 
> carrier off inside the locked section, user can call open
> immediately after unlock

ok, will move it up.
Sabrina Dubroca May 8, 2024, 2:52 p.m. UTC | #3
2024-05-06, 03:16:17 +0200, Antonio Quartulli wrote:
> diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
> index ad3813419c33..338e99dfe886 100644
> --- a/drivers/net/ovpn/io.c
> +++ b/drivers/net/ovpn/io.c
> @@ -11,6 +11,26 @@
>  #include <linux/skbuff.h>
>  
>  #include "io.h"
> +#include "ovpnstruct.h"
> +#include "netlink.h"
> +
> +int ovpn_struct_init(struct net_device *dev)

nit: Should this be in main.c? It's only used there, and I think it
would make more sense to drop it next to ovpn_struct_free.

> +{
> +	struct ovpn_struct *ovpn = netdev_priv(dev);
> +	int err;
> +

[...]
> diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
> index 33c0b004ce16..584cd7286aff 100644
> --- a/drivers/net/ovpn/main.c
> +++ b/drivers/net/ovpn/main.c
[...]
> +static void ovpn_struct_free(struct net_device *net)
> +{
> +	struct ovpn_struct *ovpn = netdev_priv(net);
> +
> +	rtnl_lock();

 ->priv_destructor can run from register_netdevice (already under
RTNL), this doesn't look right.

> +	list_del(&ovpn->dev_list);

And if this gets called from register_netdevice, the list_add from
ovpn_iface_create hasn't run yet, so this will probably do strange
things?

> +	rtnl_unlock();
> +
> +	free_percpu(net->tstats);
> +}
> +
> +static int ovpn_net_open(struct net_device *dev)
> +{
> +	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
> +
> +	if (dev_v4) {
> +		/* disable redirects as Linux gets confused by ovpn handling
> +		 * same-LAN routing
> +		 */
> +		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
> +		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;

Jakub, are you ok with that? This feels a bit weird to have in the
middle of a driver.

> +	}
> +
> +	netif_tx_start_all_queues(dev);
> +	return 0;
> +}

[...]
> +void ovpn_iface_destruct(struct ovpn_struct *ovpn)
> +{
> +	ASSERT_RTNL();
> +
> +	netif_carrier_off(ovpn->dev);
> +
> +	ovpn->registered = false;
> +
> +	unregister_netdevice(ovpn->dev);
> +	synchronize_net();

If this gets called from the loop in ovpn_netns_pre_exit, one
synchronize_net per ovpn device would seem quite expensive.

> +}
> +
>  static int ovpn_netdev_notifier_call(struct notifier_block *nb,
>  				     unsigned long state, void *ptr)
>  {
>  	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +	struct ovpn_struct *ovpn;
>  
>  	if (!ovpn_dev_is_valid(dev))
>  		return NOTIFY_DONE;
>  
> +	ovpn = netdev_priv(dev);
> +
>  	switch (state) {
>  	case NETDEV_REGISTER:
> -		/* add device to internal list for later destruction upon
> -		 * unregistration
> -		 */
> +		ovpn->registered = true;
>  		break;
>  	case NETDEV_UNREGISTER:
> +		/* twiddle thumbs on netns device moves */
> +		if (dev->reg_state != NETREG_UNREGISTERING)
> +			break;
> +
>  		/* can be delivered multiple times, so check registered flag,
>  		 * then destroy the interface
>  		 */
> +		if (!ovpn->registered)
> +			return NOTIFY_DONE;
> +
> +		ovpn_iface_destruct(ovpn);

Maybe I'm misunderstanding this code. Why do you want to manually
destroy a device that is already going away?

>  		break;
>  	case NETDEV_POST_INIT:
>  	case NETDEV_GOING_DOWN:
>  	case NETDEV_DOWN:
>  	case NETDEV_UP:
>  	case NETDEV_PRE_UP:
> +		break;
>  	default:
>  		return NOTIFY_DONE;
>  	}
> @@ -62,6 +210,24 @@ static struct notifier_block ovpn_netdev_notifier = {
>  	.notifier_call = ovpn_netdev_notifier_call,
>  };
>  
> +static void ovpn_netns_pre_exit(struct net *net)
> +{
> +	struct ovpn_struct *ovpn;
> +
> +	rtnl_lock();
> +	list_for_each_entry(ovpn, &dev_list, dev_list) {
> +		if (dev_net(ovpn->dev) != net)
> +			continue;
> +
> +		ovpn_iface_destruct(ovpn);

Is this needed? On netns destruction all devices within the ns will be
destroyed by the networking core.

> +	}
> +	rtnl_unlock();
> +}
Jakub Kicinski May 9, 2024, 1:06 a.m. UTC | #4
On Wed, 8 May 2024 16:52:27 +0200 Sabrina Dubroca wrote:
> > +static int ovpn_net_open(struct net_device *dev)
> > +{
> > +	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
> > +
> > +	if (dev_v4) {
> > +		/* disable redirects as Linux gets confused by ovpn handling
> > +		 * same-LAN routing
> > +		 */
> > +		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
> > +		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;  
> 
> Jakub, are you ok with that? This feels a bit weird to have in the
> middle of a driver.

Herm, I only looked at the netlink bits so far.
Would be good to get more details on the problem and see if we can fix
it more directly.
Antonio Quartulli May 9, 2024, 8:25 a.m. UTC | #5
On 08/05/2024 16:52, Sabrina Dubroca wrote:
> 2024-05-06, 03:16:17 +0200, Antonio Quartulli wrote:
>> diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
>> index ad3813419c33..338e99dfe886 100644
>> --- a/drivers/net/ovpn/io.c
>> +++ b/drivers/net/ovpn/io.c
>> @@ -11,6 +11,26 @@
>>   #include <linux/skbuff.h>
>>   
>>   #include "io.h"
>> +#include "ovpnstruct.h"
>> +#include "netlink.h"
>> +
>> +int ovpn_struct_init(struct net_device *dev)
> 
> nit: Should this be in main.c? It's only used there, and I think it
> would make more sense to drop it next to ovpn_struct_free.

yeah, it makes sense. will move it.

> 
>> +{
>> +	struct ovpn_struct *ovpn = netdev_priv(dev);
>> +	int err;
>> +
> 
> [...]
>> diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
>> index 33c0b004ce16..584cd7286aff 100644
>> --- a/drivers/net/ovpn/main.c
>> +++ b/drivers/net/ovpn/main.c
> [...]
>> +static void ovpn_struct_free(struct net_device *net)
>> +{
>> +	struct ovpn_struct *ovpn = netdev_priv(net);
>> +
>> +	rtnl_lock();
> 
>   ->priv_destructor can run from register_netdevice (already under
> RTNL), this doesn't look right.
> 
>> +	list_del(&ovpn->dev_list);
> 
> And if this gets called from register_netdevice, the list_add from
> ovpn_iface_create hasn't run yet, so this will probably do strange
> things?

Argh, again I haven't considered a failure in register_netdevice and you 
are indeed right.

Maybe it is better to call list_del() in the netdev notifier, upon 
NETDEV_UNREGISTER event?


> 
>> +	rtnl_unlock();
>> +
>> +	free_percpu(net->tstats);
>> +}
>> +
>> +static int ovpn_net_open(struct net_device *dev)
>> +{
>> +	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
>> +
>> +	if (dev_v4) {
>> +		/* disable redirects as Linux gets confused by ovpn handling
>> +		 * same-LAN routing
>> +		 */
>> +		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
>> +		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;
> 
> Jakub, are you ok with that? This feels a bit weird to have in the
> middle of a driver.

Let me share what the problem is (copied from the email I sent to Andrew 
Lunn as he was also curious about this):

The reason for requiring this setting lies in the OpenVPN server acting 
as relay point (star topology) for hosts in the same subnet.

Example: given the a.b.c.0/24 IP network, you have .2 that in order to 
talk to .3 must have its traffic relayed by .1 (the server).

When the kernel (at .1) sees this traffic it will send the ICMP 
redirects, because it believes that .2 should directly talk to .3 
without passing through .1.

Of course it makes sense in a normal network with a classic broadcast 
domain, but this is not the case in a VPN implemented as a star topology.

Does it make sense?

The only way I see to fix this globally is to have an extra flag in the 
netdevice signaling this peculiarity and thus disabling ICMP redirects 
automatically.

Note: wireguard has those lines too, as it probably needs to address the 
same scenario.


> 
>> +	}
>> +
>> +	netif_tx_start_all_queues(dev);
>> +	return 0;
>> +}
> 
> [...]
>> +void ovpn_iface_destruct(struct ovpn_struct *ovpn)
>> +{
>> +	ASSERT_RTNL();
>> +
>> +	netif_carrier_off(ovpn->dev);
>> +
>> +	ovpn->registered = false;
>> +
>> +	unregister_netdevice(ovpn->dev);
>> +	synchronize_net();
> 
> If this gets called from the loop in ovpn_netns_pre_exit, one
> synchronize_net per ovpn device would seem quite expensive.

As per your other comment, maybe I should just remove the 
synchronize_net() entirely since it'll be the core to take care of 
inflight packets?

> 
>> +}
>> +
>>   static int ovpn_netdev_notifier_call(struct notifier_block *nb,
>>   				     unsigned long state, void *ptr)
>>   {
>>   	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>> +	struct ovpn_struct *ovpn;
>>   
>>   	if (!ovpn_dev_is_valid(dev))
>>   		return NOTIFY_DONE;
>>   
>> +	ovpn = netdev_priv(dev);
>> +
>>   	switch (state) {
>>   	case NETDEV_REGISTER:
>> -		/* add device to internal list for later destruction upon
>> -		 * unregistration
>> -		 */
>> +		ovpn->registered = true;
>>   		break;
>>   	case NETDEV_UNREGISTER:
>> +		/* twiddle thumbs on netns device moves */
>> +		if (dev->reg_state != NETREG_UNREGISTERING)
>> +			break;
>> +
>>   		/* can be delivered multiple times, so check registered flag,
>>   		 * then destroy the interface
>>   		 */
>> +		if (!ovpn->registered)
>> +			return NOTIFY_DONE;
>> +
>> +		ovpn_iface_destruct(ovpn);
> 
> Maybe I'm misunderstanding this code. Why do you want to manually
> destroy a device that is already going away?

We need to perform some internal cleanup (i.e. release all peers).
I don't see how this can happen automatically, no?

> 
>>   		break;
>>   	case NETDEV_POST_INIT:
>>   	case NETDEV_GOING_DOWN:
>>   	case NETDEV_DOWN:
>>   	case NETDEV_UP:
>>   	case NETDEV_PRE_UP:
>> +		break;
>>   	default:
>>   		return NOTIFY_DONE;
>>   	}
>> @@ -62,6 +210,24 @@ static struct notifier_block ovpn_netdev_notifier = {
>>   	.notifier_call = ovpn_netdev_notifier_call,
>>   };
>>   
>> +static void ovpn_netns_pre_exit(struct net *net)
>> +{
>> +	struct ovpn_struct *ovpn;
>> +
>> +	rtnl_lock();
>> +	list_for_each_entry(ovpn, &dev_list, dev_list) {
>> +		if (dev_net(ovpn->dev) != net)
>> +			continue;
>> +
>> +		ovpn_iface_destruct(ovpn);
> 
> Is this needed? On netns destruction all devices within the ns will be
> destroyed by the networking core.

Before implementing ovpn_netns_pre_exit() this way, upon namespace 
deletion the ovpn interface was being moved to the global namespace.

Hence I decided to manually take care of its destruction.

Isn't this expected?
Sabrina Dubroca May 9, 2024, 10:09 a.m. UTC | #6
2024-05-09, 10:25:44 +0200, Antonio Quartulli wrote:
> On 08/05/2024 16:52, Sabrina Dubroca wrote:
> > 2024-05-06, 03:16:17 +0200, Antonio Quartulli wrote:
> > > diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
> > > index 33c0b004ce16..584cd7286aff 100644
> > > --- a/drivers/net/ovpn/main.c
> > > +++ b/drivers/net/ovpn/main.c
> > [...]
> > > +static void ovpn_struct_free(struct net_device *net)
> > > +{
> > > +	struct ovpn_struct *ovpn = netdev_priv(net);
> > > +
> > > +	rtnl_lock();
> > 
> >   ->priv_destructor can run from register_netdevice (already under
> > RTNL), this doesn't look right.
> > 
> > > +	list_del(&ovpn->dev_list);
> > 
> > And if this gets called from register_netdevice, the list_add from
> > ovpn_iface_create hasn't run yet, so this will probably do strange
> > things?
> 
> Argh, again I haven't considered a failure in register_netdevice and you are
> indeed right.
> 
> Maybe it is better to call list_del() in the netdev notifier, upon
> NETDEV_UNREGISTER event?

I'd like to avoid splitting the clean up code over so maybe different
functions and called through different means. Keep it simple.

AFAICT the only reason you need this list is to delete your devices on
netns exit, so if we can get rid of that the list can go away.


> > > +static int ovpn_net_open(struct net_device *dev)
> > > +{
> > > +	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
> > > +
> > > +	if (dev_v4) {
> > > +		/* disable redirects as Linux gets confused by ovpn handling
> > > +		 * same-LAN routing
> > > +		 */
> > > +		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
> > > +		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;
> > 
> > Jakub, are you ok with that? This feels a bit weird to have in the
> > middle of a driver.
> 
> Let me share what the problem is (copied from the email I sent to Andrew
> Lunn as he was also curious about this):
> 
> The reason for requiring this setting lies in the OpenVPN server acting as
> relay point (star topology) for hosts in the same subnet.
> 
> Example: given the a.b.c.0/24 IP network, you have .2 that in order to talk
> to .3 must have its traffic relayed by .1 (the server).
> 
> When the kernel (at .1) sees this traffic it will send the ICMP redirects,
> because it believes that .2 should directly talk to .3 without passing
> through .1.

So only the server would need to stop sending them, not the client?
(or the client would need to ignore them)
But the kernel has no way of knowing if an ovpn device is on a client
or a server?

> Of course it makes sense in a normal network with a classic broadcast
> domain, but this is not the case in a VPN implemented as a star topology.
> 
> Does it make sense?

It looks like the problem is that ovpn links are point-to-point
(instead of a broadcast LAN kind of link where redirects would make
sense), and the kernel doesn't handle it that way.

> The only way I see to fix this globally is to have an extra flag in the
> netdevice signaling this peculiarity and thus disabling ICMP redirects
> automatically.
> 
> Note: wireguard has those lines too, as it probably needs to address the
> same scenario.

I've noticed a lot of similarities in some bits I've looked at, and I
hate that this is turning into another pile of duplicate code like
vxlan/geneve, bond/team, etc :(


> > [...]
> > > +void ovpn_iface_destruct(struct ovpn_struct *ovpn)
> > > +{
> > > +	ASSERT_RTNL();
> > > +
> > > +	netif_carrier_off(ovpn->dev);
> > > +
> > > +	ovpn->registered = false;
> > > +
> > > +	unregister_netdevice(ovpn->dev);
> > > +	synchronize_net();
> > 
> > If this gets called from the loop in ovpn_netns_pre_exit, one
> > synchronize_net per ovpn device would seem quite expensive.
> 
> As per your other comment, maybe I should just remove the synchronize_net()
> entirely since it'll be the core to take care of inflight packets?

There's a synchronize_net in unregister_netdevice_many_notify, so I'd
say you can get rid of it here.


> > >   static int ovpn_netdev_notifier_call(struct notifier_block *nb,
> > >   				     unsigned long state, void *ptr)
> > >   {
> > >   	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> > > +	struct ovpn_struct *ovpn;
> > >   	if (!ovpn_dev_is_valid(dev))
> > >   		return NOTIFY_DONE;
> > > +	ovpn = netdev_priv(dev);
> > > +
> > >   	switch (state) {
> > >   	case NETDEV_REGISTER:
> > > -		/* add device to internal list for later destruction upon
> > > -		 * unregistration
> > > -		 */
> > > +		ovpn->registered = true;
> > >   		break;
> > >   	case NETDEV_UNREGISTER:
> > > +		/* twiddle thumbs on netns device moves */
> > > +		if (dev->reg_state != NETREG_UNREGISTERING)
> > > +			break;
> > > +
> > >   		/* can be delivered multiple times, so check registered flag,
> > >   		 * then destroy the interface
> > >   		 */
> > > +		if (!ovpn->registered)
> > > +			return NOTIFY_DONE;
> > > +
> > > +		ovpn_iface_destruct(ovpn);
> > 
> > Maybe I'm misunderstanding this code. Why do you want to manually
> > destroy a device that is already going away?
> 
> We need to perform some internal cleanup (i.e. release all peers).
> I don't see how this can happen automatically, no?

That's what ->priv_destructor does, and it will be called ultimately
by the unregister_netdevice call you have in ovpn_iface_destruct (in
netdev_run_todo). Anyway, this UNREGISTER event is probably generated
by unregister_netdevice_many_notify (basically a previous
unregister_netdevice() call), so I don't know why you want to call
unregister_netdevice again on the same device.


> > > @@ -62,6 +210,24 @@ static struct notifier_block ovpn_netdev_notifier = {
> > >   	.notifier_call = ovpn_netdev_notifier_call,
> > >   };
> > > +static void ovpn_netns_pre_exit(struct net *net)
> > > +{
> > > +	struct ovpn_struct *ovpn;
> > > +
> > > +	rtnl_lock();
> > > +	list_for_each_entry(ovpn, &dev_list, dev_list) {
> > > +		if (dev_net(ovpn->dev) != net)
> > > +			continue;
> > > +
> > > +		ovpn_iface_destruct(ovpn);
> > 
> > Is this needed? On netns destruction all devices within the ns will be
> > destroyed by the networking core.
> 
> Before implementing ovpn_netns_pre_exit() this way, upon namespace deletion
> the ovpn interface was being moved to the global namespace.

Crap it's only the devices with ->rtnl_link_ops that get killed by the
core. Because you create your devices via genl (which I'm not a fan
of, even if it's a bit nicer for userspace having a single netlink api
to deal with), default_device_exit_batch/default_device_exit_net think
ovpn devices are real NICs and move them back to init_net instead of
destroying them.

Maybe we can extend the condition in default_device_exit_net with a
new flag so that ovpn devices get destroyed by the core, even without
rtnl_link_ops?
Antonio Quartulli May 9, 2024, 10:35 a.m. UTC | #7
On 09/05/2024 12:09, Sabrina Dubroca wrote:
> 2024-05-09, 10:25:44 +0200, Antonio Quartulli wrote:
>> On 08/05/2024 16:52, Sabrina Dubroca wrote:
>>> 2024-05-06, 03:16:17 +0200, Antonio Quartulli wrote:
>>>> diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
>>>> index 33c0b004ce16..584cd7286aff 100644
>>>> --- a/drivers/net/ovpn/main.c
>>>> +++ b/drivers/net/ovpn/main.c
>>> [...]
>>>> +static void ovpn_struct_free(struct net_device *net)
>>>> +{
>>>> +	struct ovpn_struct *ovpn = netdev_priv(net);
>>>> +
>>>> +	rtnl_lock();
>>>
>>>    ->priv_destructor can run from register_netdevice (already under
>>> RTNL), this doesn't look right.
>>>
>>>> +	list_del(&ovpn->dev_list);
>>>
>>> And if this gets called from register_netdevice, the list_add from
>>> ovpn_iface_create hasn't run yet, so this will probably do strange
>>> things?
>>
>> Argh, again I haven't considered a failure in register_netdevice and you are
>> indeed right.
>>
>> Maybe it is better to call list_del() in the netdev notifier, upon
>> NETDEV_UNREGISTER event?
> 
> I'd like to avoid splitting the clean up code over so maybe different
> functions and called through different means. Keep it simple.
> 
> AFAICT the only reason you need this list is to delete your devices on
> netns exit, so if we can get rid of that the list can go away.

right.

> 
> 
>>>> +static int ovpn_net_open(struct net_device *dev)
>>>> +{
>>>> +	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
>>>> +
>>>> +	if (dev_v4) {
>>>> +		/* disable redirects as Linux gets confused by ovpn handling
>>>> +		 * same-LAN routing
>>>> +		 */
>>>> +		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
>>>> +		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;
>>>
>>> Jakub, are you ok with that? This feels a bit weird to have in the
>>> middle of a driver.
>>
>> Let me share what the problem is (copied from the email I sent to Andrew
>> Lunn as he was also curious about this):
>>
>> The reason for requiring this setting lies in the OpenVPN server acting as
>> relay point (star topology) for hosts in the same subnet.
>>
>> Example: given the a.b.c.0/24 IP network, you have .2 that in order to talk
>> to .3 must have its traffic relayed by .1 (the server).
>>
>> When the kernel (at .1) sees this traffic it will send the ICMP redirects,
>> because it believes that .2 should directly talk to .3 without passing
>> through .1.
> 
> So only the server would need to stop sending them, not the client?

correct

> (or the client would need to ignore them)
> But the kernel has no way of knowing if an ovpn device is on a client
> or a server?

the server knows if the interface is configured in P2P or MP (MultiPeer) 
mode. The latter is what requires redirects to be off, so we could at 
least add a check and switch them off only for MP ifaces.

> 
>> Of course it makes sense in a normal network with a classic broadcast
>> domain, but this is not the case in a VPN implemented as a star topology.
>>
>> Does it make sense?
> 
> It looks like the problem is that ovpn links are point-to-point
> (instead of a broadcast LAN kind of link where redirects would make
> sense), and the kernel doesn't handle it that way.

exactly

> 
>> The only way I see to fix this globally is to have an extra flag in the
>> netdevice signaling this peculiarity and thus disabling ICMP redirects
>> automatically.
>>
>> Note: wireguard has those lines too, as it probably needs to address the
>> same scenario.
> 
> I've noticed a lot of similarities in some bits I've looked at, and I
> hate that this is turning into another pile of duplicate code like
> vxlan/geneve, bond/team, etc :(

For starters, we could at least moves these few lines in some helper 
function and call it from both modules.

On the other hand, we could, like I suggested above, convert this into a 
netdev flag and let core handle the behaviour when the flag is set.

> 
> 
>>> [...]
>>>> +void ovpn_iface_destruct(struct ovpn_struct *ovpn)
>>>> +{
>>>> +	ASSERT_RTNL();
>>>> +
>>>> +	netif_carrier_off(ovpn->dev);
>>>> +
>>>> +	ovpn->registered = false;
>>>> +
>>>> +	unregister_netdevice(ovpn->dev);
>>>> +	synchronize_net();
>>>
>>> If this gets called from the loop in ovpn_netns_pre_exit, one
>>> synchronize_net per ovpn device would seem quite expensive.
>>
>> As per your other comment, maybe I should just remove the synchronize_net()
>> entirely since it'll be the core to take care of inflight packets?
> 
> There's a synchronize_net in unregister_netdevice_many_notify, so I'd
> say you can get rid of it here.

ok! Jakub was indeed suggesting that core should already take care of this.

Will remove it for good.

> 
> 
>>>>    static int ovpn_netdev_notifier_call(struct notifier_block *nb,
>>>>    				     unsigned long state, void *ptr)
>>>>    {
>>>>    	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>>>> +	struct ovpn_struct *ovpn;
>>>>    	if (!ovpn_dev_is_valid(dev))
>>>>    		return NOTIFY_DONE;
>>>> +	ovpn = netdev_priv(dev);
>>>> +
>>>>    	switch (state) {
>>>>    	case NETDEV_REGISTER:
>>>> -		/* add device to internal list for later destruction upon
>>>> -		 * unregistration
>>>> -		 */
>>>> +		ovpn->registered = true;
>>>>    		break;
>>>>    	case NETDEV_UNREGISTER:
>>>> +		/* twiddle thumbs on netns device moves */
>>>> +		if (dev->reg_state != NETREG_UNREGISTERING)
>>>> +			break;
>>>> +
>>>>    		/* can be delivered multiple times, so check registered flag,
>>>>    		 * then destroy the interface
>>>>    		 */
>>>> +		if (!ovpn->registered)
>>>> +			return NOTIFY_DONE;
>>>> +
>>>> +		ovpn_iface_destruct(ovpn);
>>>
>>> Maybe I'm misunderstanding this code. Why do you want to manually
>>> destroy a device that is already going away?
>>
>> We need to perform some internal cleanup (i.e. release all peers).
>> I don't see how this can happen automatically, no?
> 
> That's what ->priv_destructor does, 

Not really.

Every peer object increases the netdev refcounter to the netdev, 
therefore we must first delete all peers in order to have 
netdevice->refcnt reach 0 (and then invoke priv_destructor).

So the idea is: upon UNREGISTER event we drop all resources and 
eventually (via RCU) all references to the netdev are also released, 
which in turn triggers the destructor.

makes sense?


> and it will be called ultimately
> by the unregister_netdevice call you have in ovpn_iface_destruct (in
> netdev_run_todo). Anyway, this UNREGISTER event is probably generated
> by unregister_netdevice_many_notify (basically a previous
> unregister_netdevice() call), so I don't know why you want to call
> unregister_netdevice again on the same device.

I believe I have seen this notification being triggered upon netns exit, 
but in that case the netdevice was not being removed from core.

Hence I decided to fully trigger the unregistration.

Expected?

I can repeat the test to be sure.

> 
> 
>>>> @@ -62,6 +210,24 @@ static struct notifier_block ovpn_netdev_notifier = {
>>>>    	.notifier_call = ovpn_netdev_notifier_call,
>>>>    };
>>>> +static void ovpn_netns_pre_exit(struct net *net)
>>>> +{
>>>> +	struct ovpn_struct *ovpn;
>>>> +
>>>> +	rtnl_lock();
>>>> +	list_for_each_entry(ovpn, &dev_list, dev_list) {
>>>> +		if (dev_net(ovpn->dev) != net)
>>>> +			continue;
>>>> +
>>>> +		ovpn_iface_destruct(ovpn);
>>>
>>> Is this needed? On netns destruction all devices within the ns will be
>>> destroyed by the networking core.
>>
>> Before implementing ovpn_netns_pre_exit() this way, upon namespace deletion
>> the ovpn interface was being moved to the global namespace.
> 
> Crap it's only the devices with ->rtnl_link_ops that get killed by the
> core. 

exactly! this goes hand to hand with my comment above: event delivered 
but interface not destroyed.

> Because you create your devices via genl (which I'm not a fan
> of, even if it's a bit nicer for userspace having a single netlink api
> to deal with),

Originally I had implemented the rtnl_link_ops, but the (meaningful) 
objection was that a user is never supposed to create an ovpn iface by 
himself, but there should always be an openvpn process running in 
userspace. Hence the restriction to genl only.

> default_device_exit_batch/default_device_exit_net think
> ovpn devices are real NICs and move them back to init_net instead of
> destroying them.
> 
> Maybe we can extend the condition in default_device_exit_net with a
> new flag so that ovpn devices get destroyed by the core, even without
> rtnl_link_ops?

Thanks for pointing out the function responsible for this decision.
How would you extend the check though?

Alternatively, what if ovpn simply registers an empty rtnl_link_ops with 
netns_fund set to false? That should make the condition happy, while 
keeping ovpn genl-only


Thanks a lot
Sabrina Dubroca May 9, 2024, 12:16 p.m. UTC | #8
2024-05-09, 12:35:32 +0200, Antonio Quartulli wrote:
> On 09/05/2024 12:09, Sabrina Dubroca wrote:
> > 2024-05-09, 10:25:44 +0200, Antonio Quartulli wrote:
> > > On 08/05/2024 16:52, Sabrina Dubroca wrote:
> > > > 2024-05-06, 03:16:17 +0200, Antonio Quartulli wrote:
> > > > >    static int ovpn_netdev_notifier_call(struct notifier_block *nb,
> > > > >    				     unsigned long state, void *ptr)
> > > > >    {
> > > > >    	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> > > > > +	struct ovpn_struct *ovpn;
> > > > >    	if (!ovpn_dev_is_valid(dev))
> > > > >    		return NOTIFY_DONE;
> > > > > +	ovpn = netdev_priv(dev);
> > > > > +
> > > > >    	switch (state) {
> > > > >    	case NETDEV_REGISTER:
> > > > > -		/* add device to internal list for later destruction upon
> > > > > -		 * unregistration
> > > > > -		 */
> > > > > +		ovpn->registered = true;
> > > > >    		break;
> > > > >    	case NETDEV_UNREGISTER:
> > > > > +		/* twiddle thumbs on netns device moves */
> > > > > +		if (dev->reg_state != NETREG_UNREGISTERING)
> > > > > +			break;
> > > > > +
> > > > >    		/* can be delivered multiple times, so check registered flag,
> > > > >    		 * then destroy the interface
> > > > >    		 */
> > > > > +		if (!ovpn->registered)
> > > > > +			return NOTIFY_DONE;
> > > > > +
> > > > > +		ovpn_iface_destruct(ovpn);
> > > > 
> > > > Maybe I'm misunderstanding this code. Why do you want to manually
> > > > destroy a device that is already going away?
> > > 
> > > We need to perform some internal cleanup (i.e. release all peers).
> > > I don't see how this can happen automatically, no?
> > 
> > That's what ->priv_destructor does,
> 
> Not really.
> 
> Every peer object increases the netdev refcounter to the netdev, therefore
> we must first delete all peers in order to have netdevice->refcnt reach 0
> (and then invoke priv_destructor).

Oh, I see. I'm still trying to wrap my head around all the objects and
components of your driver.

> So the idea is: upon UNREGISTER event we drop all resources and eventually
> (via RCU) all references to the netdev are also released, which in turn
> triggers the destructor.
> 
> makes sense?

That part, yes, thanks for explaining. Do you really need the peers to
hold a reference on the netdevice? With my limited understanding, it
seems the peers are sub-objects of the netdevice.

> > and it will be called ultimately
> > by the unregister_netdevice call you have in ovpn_iface_destruct (in
> > netdev_run_todo). Anyway, this UNREGISTER event is probably generated
> > by unregister_netdevice_many_notify (basically a previous
> > unregister_netdevice() call), so I don't know why you want to call
> > unregister_netdevice again on the same device.
> 
> I believe I have seen this notification being triggered upon netns exit, but
> in that case the netdevice was not being removed from core.

Sure, but you have a comment about that and you're filtering that
event, so I'm ignoring this case.

> Hence I decided to fully trigger the unregistration.

That's the bit that doesn't make sense to me: the device is going
away, so you trigger a manual unregister. Cleaning up some additional
resources (peers etc), that makes sense. But calling
unregister_netdevice (when you're most likely getting called from
unregister_netdevice already, because I don't see other spots setting
dev->reg_state = NETREG_UNREGISTERING) is what I don't get. And I
wonder why you're not hitting the BUG_ON in
unregister_netdevice_many_notify:

    BUG_ON(dev->reg_state != NETREG_REGISTERED);


> > > > > @@ -62,6 +210,24 @@ static struct notifier_block ovpn_netdev_notifier = {
> > > > >    	.notifier_call = ovpn_netdev_notifier_call,
> > > > >    };
> > > > > +static void ovpn_netns_pre_exit(struct net *net)

BTW, in case you end up keeping this function, it should have
__net_exit annotation (see for example ipv4_frags_exit_net).

> > > > > +{
> > > > > +	struct ovpn_struct *ovpn;
> > > > > +
> > > > > +	rtnl_lock();
> > > > > +	list_for_each_entry(ovpn, &dev_list, dev_list) {
> > > > > +		if (dev_net(ovpn->dev) != net)
> > > > > +			continue;
> > > > > +
> > > > > +		ovpn_iface_destruct(ovpn);
> > > > 
> > > > Is this needed? On netns destruction all devices within the ns will be
> > > > destroyed by the networking core.
> > > 
> > > Before implementing ovpn_netns_pre_exit() this way, upon namespace deletion
> > > the ovpn interface was being moved to the global namespace.
> > 
> > Crap it's only the devices with ->rtnl_link_ops that get killed by the
> > core.
> 
> exactly! this goes hand to hand with my comment above: event delivered but
> interface not destroyed.

There's no event sent to ovpn_netdev_notifier_call in that case (well,
only the fake "unregister" out of the current netns that you're
ignoring). Otherwise, you wouldn't need ovpn_netns_pre_exit.

> > Because you create your devices via genl (which I'm not a fan
> > of, even if it's a bit nicer for userspace having a single netlink api
> > to deal with),
> 
> Originally I had implemented the rtnl_link_ops, but the (meaningful)
> objection was that a user is never supposed to create an ovpn iface by
> himself, but there should always be an openvpn process running in userspace.
> Hence the restriction to genl only.

Sorry, but how does genl prevent a user from creating the ovpn
interface manually? Whatever API you define, anyone who manages to
come up with the right netlink message will be able to create an
interface. You can't stop people from using your API without your
official client.

> > default_device_exit_batch/default_device_exit_net think
> > ovpn devices are real NICs and move them back to init_net instead of
> > destroying them.
> > 
> > Maybe we can extend the condition in default_device_exit_net with a
> > new flag so that ovpn devices get destroyed by the core, even without
> > rtnl_link_ops?
> 
> Thanks for pointing out the function responsible for this decision.
> How would you extend the check though?
>
> Alternatively, what if ovpn simply registers an empty rtnl_link_ops with
> netns_fund set to false? That should make the condition happy, while keeping
> ovpn genl-only

Yes. I was thinking about adding a flag to the device, because I
wasn't sure an almost empty rtnl_link_ops could be handled safely, but
it seems ok. ovs does it, see commit 5b9e7e160795 ("openvswitch:
introduce rtnl ops stub"). And, as that commit message says, "ip -d
link show" would also show that the device is of type openvpn (or
ovpn, whatever you put in ops->kind), which would be nice.
Antonio Quartulli May 9, 2024, 1:25 p.m. UTC | #9
By the way, thank you very much for taking the time to have this 
constructive discussion. I really appreciate it!

On 09/05/2024 14:16, Sabrina Dubroca wrote:
> 2024-05-09, 12:35:32 +0200, Antonio Quartulli wrote:
>> On 09/05/2024 12:09, Sabrina Dubroca wrote:
>>> 2024-05-09, 10:25:44 +0200, Antonio Quartulli wrote:
>>>> On 08/05/2024 16:52, Sabrina Dubroca wrote:
>>>>> 2024-05-06, 03:16:17 +0200, Antonio Quartulli wrote:
>>>>>>     static int ovpn_netdev_notifier_call(struct notifier_block *nb,
>>>>>>     				     unsigned long state, void *ptr)
>>>>>>     {
>>>>>>     	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>>>>>> +	struct ovpn_struct *ovpn;
>>>>>>     	if (!ovpn_dev_is_valid(dev))
>>>>>>     		return NOTIFY_DONE;
>>>>>> +	ovpn = netdev_priv(dev);
>>>>>> +
>>>>>>     	switch (state) {
>>>>>>     	case NETDEV_REGISTER:
>>>>>> -		/* add device to internal list for later destruction upon
>>>>>> -		 * unregistration
>>>>>> -		 */
>>>>>> +		ovpn->registered = true;
>>>>>>     		break;
>>>>>>     	case NETDEV_UNREGISTER:
>>>>>> +		/* twiddle thumbs on netns device moves */
>>>>>> +		if (dev->reg_state != NETREG_UNREGISTERING)
>>>>>> +			break;
>>>>>> +
>>>>>>     		/* can be delivered multiple times, so check registered flag,
>>>>>>     		 * then destroy the interface
>>>>>>     		 */
>>>>>> +		if (!ovpn->registered)
>>>>>> +			return NOTIFY_DONE;
>>>>>> +
>>>>>> +		ovpn_iface_destruct(ovpn);
>>>>>
>>>>> Maybe I'm misunderstanding this code. Why do you want to manually
>>>>> destroy a device that is already going away?
>>>>
>>>> We need to perform some internal cleanup (i.e. release all peers).
>>>> I don't see how this can happen automatically, no?
>>>
>>> That's what ->priv_destructor does,
>>
>> Not really.
>>
>> Every peer object increases the netdev refcounter to the netdev, therefore
>> we must first delete all peers in order to have netdevice->refcnt reach 0
>> (and then invoke priv_destructor).
> 
> Oh, I see. I'm still trying to wrap my head around all the objects and
> components of your driver.
> 
>> So the idea is: upon UNREGISTER event we drop all resources and eventually
>> (via RCU) all references to the netdev are also released, which in turn
>> triggers the destructor.
>>
>> makes sense?
> 
> That part, yes, thanks for explaining. Do you really need the peers to
> hold a reference on the netdevice? With my limited understanding, it
> seems the peers are sub-objects of the netdevice.
> 
>>> and it will be called ultimately
>>> by the unregister_netdevice call you have in ovpn_iface_destruct (in
>>> netdev_run_todo). Anyway, this UNREGISTER event is probably generated
>>> by unregister_netdevice_many_notify (basically a previous
>>> unregister_netdevice() call), so I don't know why you want to call
>>> unregister_netdevice again on the same device.
>>
>> I believe I have seen this notification being triggered upon netns exit, but
>> in that case the netdevice was not being removed from core.
> 
> Sure, but you have a comment about that and you're filtering that
> event, so I'm ignoring this case.

You're right..now I wonder if my observation was made before I 
introduced that check...

> 
>> Hence I decided to fully trigger the unregistration.
> 
> That's the bit that doesn't make sense to me: the device is going
> away, so you trigger a manual unregister. Cleaning up some additional
> resources (peers etc), that makes sense. But calling
> unregister_netdevice (when you're most likely getting called from
> unregister_netdevice already, because I don't see other spots setting
> dev->reg_state = NETREG_UNREGISTERING) is what I don't get. And I
> wonder why you're not hitting the BUG_ON in
> unregister_netdevice_many_notify:
> 
>      BUG_ON(dev->reg_state != NETREG_REGISTERED);

I think because we have our ovpn->registered check.
It ensures that we don't call ovpn_iface_destruct more than once.

But now, that I implemented the rtnl_link_ops I can confirm I am hitting 
the BUG_ON. And now it makes sense.

I presume that now I can I simply remove the call to 
unregister_netdevice() from ovpn_iface_destruct() and move it to 
ovpn_nl_del_iface_doit().

This way, upon netns exit, the real UNREGISTER handler (triggered thanks 
to rtnl_link_ops) will still perform the destruct, but won't try to 
schedule an UNREGISTER event again.

> 
> 
>>>>>> @@ -62,6 +210,24 @@ static struct notifier_block ovpn_netdev_notifier = {
>>>>>>     	.notifier_call = ovpn_netdev_notifier_call,
>>>>>>     };
>>>>>> +static void ovpn_netns_pre_exit(struct net *net)
> 
> BTW, in case you end up keeping this function, it should have
> __net_exit annotation (see for example ipv4_frags_exit_net).

ACK, but thanks to the rtnl_link_ops trick we are definitely ditching it.
> 
>>>>>> +{
>>>>>> +	struct ovpn_struct *ovpn;
>>>>>> +
>>>>>> +	rtnl_lock();
>>>>>> +	list_for_each_entry(ovpn, &dev_list, dev_list) {
>>>>>> +		if (dev_net(ovpn->dev) != net)
>>>>>> +			continue;
>>>>>> +
>>>>>> +		ovpn_iface_destruct(ovpn);
>>>>>
>>>>> Is this needed? On netns destruction all devices within the ns will be
>>>>> destroyed by the networking core.
>>>>
>>>> Before implementing ovpn_netns_pre_exit() this way, upon namespace deletion
>>>> the ovpn interface was being moved to the global namespace.
>>>
>>> Crap it's only the devices with ->rtnl_link_ops that get killed by the
>>> core.
>>
>> exactly! this goes hand to hand with my comment above: event delivered but
>> interface not destroyed.
> 
> There's no event sent to ovpn_netdev_notifier_call in that case (well,
> only the fake "unregister" out of the current netns that you're
> ignoring). Otherwise, you wouldn't need ovpn_netns_pre_exit.

Yeah you're right. I think I wanted to conclude the same thing but my 
brain was unable to produce a meaningful sentence.

> 
>>> Because you create your devices via genl (which I'm not a fan
>>> of, even if it's a bit nicer for userspace having a single netlink api
>>> to deal with),
>>
>> Originally I had implemented the rtnl_link_ops, but the (meaningful)
>> objection was that a user is never supposed to create an ovpn iface by
>> himself, but there should always be an openvpn process running in userspace.
>> Hence the restriction to genl only.
> 
> Sorry, but how does genl prevent a user from creating the ovpn
> interface manually? Whatever API you define, anyone who manages to
> come up with the right netlink message will be able to create an
> interface. You can't stop people from using your API without your
> official client.

I don't want to prevent people from creating ovpn ifaces the way they like.
I just don't see how the rtnl_link API can be useful, other than 
allowing users to execute 'ip link add/del..'.
And by design that is not a usecase we want to support, because once the 
iface is created, nothing will happen if there is no userspace software 
driving it (no matter if it is openvpn or anything else).

When explaining this decision, I like to make a comparison to virtual 
802.11/wifi ifaces.
They also lack rtnl_link (AFAIR) as they also require some userspace 
software to handle them in order to be useful.

All this said, having everything in one place looks cleaner too :)

> 
>>> default_device_exit_batch/default_device_exit_net think
>>> ovpn devices are real NICs and move them back to init_net instead of
>>> destroying them.
>>>
>>> Maybe we can extend the condition in default_device_exit_net with a
>>> new flag so that ovpn devices get destroyed by the core, even without
>>> rtnl_link_ops?
>>
>> Thanks for pointing out the function responsible for this decision.
>> How would you extend the check though?
>>
>> Alternatively, what if ovpn simply registers an empty rtnl_link_ops with
>> netns_fund set to false? That should make the condition happy, while keeping
>> ovpn genl-only
> 
> Yes. I was thinking about adding a flag to the device, because I
> wasn't sure an almost empty rtnl_link_ops could be handled safely, but
> it seems ok. ovs does it, see commit 5b9e7e160795 ("openvswitch:
> introduce rtnl ops stub"). And, as that commit message says, "ip -d
> link show" would also show that the device is of type openvpn (or
> ovpn, whatever you put in ops->kind), which would be nice.

I just coded something along those lines.

It seems pretty clean and we don't need to touch core (+ the bonus of 
having the name in "ip -d link")....and the iface does get destroyed 
upon netns exit! :-)

I am grasping much better how all these APIs work together now.

Thanks!
Sabrina Dubroca May 9, 2024, 1:52 p.m. UTC | #10
2024-05-09, 15:25:21 +0200, Antonio Quartulli wrote:
> By the way, thank you very much for taking the time to have this
> constructive discussion. I really appreciate it!

Cheers :)


> On 09/05/2024 14:16, Sabrina Dubroca wrote:
> > 2024-05-09, 12:35:32 +0200, Antonio Quartulli wrote:
> > > On 09/05/2024 12:09, Sabrina Dubroca wrote:
> > > Hence I decided to fully trigger the unregistration.
> > 
> > That's the bit that doesn't make sense to me: the device is going
> > away, so you trigger a manual unregister. Cleaning up some additional
> > resources (peers etc), that makes sense. But calling
> > unregister_netdevice (when you're most likely getting called from
> > unregister_netdevice already, because I don't see other spots setting
> > dev->reg_state = NETREG_UNREGISTERING) is what I don't get. And I
> > wonder why you're not hitting the BUG_ON in
> > unregister_netdevice_many_notify:
> > 
> >      BUG_ON(dev->reg_state != NETREG_REGISTERED);
> 
> I think because we have our ovpn->registered check.
>
> It ensures that we don't call ovpn_iface_destruct more than once.

Ah, probably, yes.

> But now, that I implemented the rtnl_link_ops I can confirm I am hitting the
> BUG_ON. And now it makes sense.
> 
> I presume that now I can I simply remove the call to unregister_netdevice()
> from ovpn_iface_destruct() and move it to ovpn_nl_del_iface_doit().

Sounds good.


> > > > Because you create your devices via genl (which I'm not a fan
> > > > of, even if it's a bit nicer for userspace having a single netlink api
> > > > to deal with),
> > > 
> > > Originally I had implemented the rtnl_link_ops, but the (meaningful)
> > > objection was that a user is never supposed to create an ovpn iface by
> > > himself, but there should always be an openvpn process running in userspace.
> > > Hence the restriction to genl only.
> > 
> > Sorry, but how does genl prevent a user from creating the ovpn
> > interface manually? Whatever API you define, anyone who manages to
> > come up with the right netlink message will be able to create an
> > interface. You can't stop people from using your API without your
> > official client.
> 
> I don't want to prevent people from creating ovpn ifaces the way they like.
> I just don't see how the rtnl_link API can be useful, other than allowing
> users to execute 'ip link add/del..'.
>
> And by design that is not a usecase we want to support, because once the
> iface is created, nothing will happen if there is no userspace software
> driving it (no matter if it is openvpn or anything else).
> 
> When explaining this decision, I like to make a comparison to virtual
> 802.11/wifi ifaces.
> They also lack rtnl_link (AFAIR) as they also require some userspace
> software to handle them in order to be useful.
> 
> All this said, having everything in one place looks cleaner too :)

From an API point of view, maybe. But for the kernel implementation,
using rtnl_link_ops->newlink is easier.

> > > > default_device_exit_batch/default_device_exit_net think
> > > > ovpn devices are real NICs and move them back to init_net instead of
> > > > destroying them.
> > > > 
> > > > Maybe we can extend the condition in default_device_exit_net with a
> > > > new flag so that ovpn devices get destroyed by the core, even without
> > > > rtnl_link_ops?
> > > 
> > > Thanks for pointing out the function responsible for this decision.
> > > How would you extend the check though?
> > > 
> > > Alternatively, what if ovpn simply registers an empty rtnl_link_ops with
> > > netns_fund set to false? That should make the condition happy, while keeping
> > > ovpn genl-only
> > 
> > Yes. I was thinking about adding a flag to the device, because I
> > wasn't sure an almost empty rtnl_link_ops could be handled safely, but
> > it seems ok. ovs does it, see commit 5b9e7e160795 ("openvswitch:
> > introduce rtnl ops stub"). And, as that commit message says, "ip -d
> > link show" would also show that the device is of type openvpn (or
> > ovpn, whatever you put in ops->kind), which would be nice.
> 
> I just coded something along those lines.

Great, thanks.

> It seems pretty clean and we don't need to touch core (+ the bonus of having
> the name in "ip -d link")....and the iface does get destroyed upon netns
> exit! :-)
> 
> I am grasping much better how all these APIs work together now.

Nice :)
diff mbox series

Patch

diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index ad3813419c33..338e99dfe886 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -11,6 +11,26 @@ 
 #include <linux/skbuff.h>
 
 #include "io.h"
+#include "ovpnstruct.h"
+#include "netlink.h"
+
+int ovpn_struct_init(struct net_device *dev)
+{
+	struct ovpn_struct *ovpn = netdev_priv(dev);
+	int err;
+
+	ovpn->dev = dev;
+
+	err = ovpn_nl_init(ovpn);
+	if (err < 0)
+		return err;
+
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
+}
 
 /* Send user data to the network
  */
diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h
index aa259be66441..61a2485e16b5 100644
--- a/drivers/net/ovpn/io.h
+++ b/drivers/net/ovpn/io.h
@@ -10,6 +10,13 @@ 
 #ifndef _NET_OVPN_OVPN_H_
 #define _NET_OVPN_OVPN_H_
 
+/**
+ * ovpn_struct_init - Initialize the netdevice private area
+ * @dev: the device to initialize
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int ovpn_struct_init(struct net_device *dev);
 netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev);
 
 #endif /* _NET_OVPN_OVPN_H_ */
diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 33c0b004ce16..584cd7286aff 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -10,47 +10,195 @@ 
 #include <linux/genetlink.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <linux/inetdevice.h>
 #include <linux/version.h>
+#include <net/ip.h>
+#include <uapi/linux/if_arp.h>
 #include <uapi/linux/ovpn.h>
 
 #include "ovpnstruct.h"
 #include "main.h"
 #include "netlink.h"
 #include "io.h"
+#include "packet.h"
 
 /* Driver info */
 #define DRV_DESCRIPTION	"OpenVPN data channel offload (ovpn)"
 #define DRV_COPYRIGHT	"(C) 2020-2024 OpenVPN, Inc."
 
+static LIST_HEAD(dev_list);
+
+static void ovpn_struct_free(struct net_device *net)
+{
+	struct ovpn_struct *ovpn = netdev_priv(net);
+
+	rtnl_lock();
+	list_del(&ovpn->dev_list);
+	rtnl_unlock();
+
+	free_percpu(net->tstats);
+}
+
+static int ovpn_net_open(struct net_device *dev)
+{
+	struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
+
+	if (dev_v4) {
+		/* disable redirects as Linux gets confused by ovpn handling
+		 * same-LAN routing
+		 */
+		IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
+		IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false;
+	}
+
+	netif_tx_start_all_queues(dev);
+	return 0;
+}
+
+static int ovpn_net_stop(struct net_device *dev)
+{
+	netif_tx_stop_all_queues(dev);
+	return 0;
+}
+
+static const struct net_device_ops ovpn_netdev_ops = {
+	.ndo_open		= ovpn_net_open,
+	.ndo_stop		= ovpn_net_stop,
+	.ndo_start_xmit		= ovpn_net_xmit,
+	.ndo_get_stats64        = dev_get_tstats64,
+};
+
 bool ovpn_dev_is_valid(const struct net_device *dev)
 {
 	return dev->netdev_ops->ndo_start_xmit == ovpn_net_xmit;
 }
 
+static void ovpn_setup(struct net_device *dev)
+{
+	/* compute the overhead considering AEAD encryption */
+	const int overhead = sizeof(u32) + NONCE_WIRE_SIZE + 16 +
+			     sizeof(struct udphdr) +
+			     max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
+
+	netdev_features_t feat = NETIF_F_SG | NETIF_F_LLTX |
+				 NETIF_F_HW_CSUM | NETIF_F_RXCSUM |
+				 NETIF_F_GSO | NETIF_F_GSO_SOFTWARE |
+				 NETIF_F_HIGHDMA;
+
+	dev->needs_free_netdev = true;
+
+	dev->netdev_ops = &ovpn_netdev_ops;
+
+	dev->priv_destructor = ovpn_struct_free;
+
+	dev->hard_header_len = 0;
+	dev->addr_len = 0;
+	dev->mtu = ETH_DATA_LEN - overhead;
+	dev->min_mtu = IPV4_MIN_MTU;
+	dev->max_mtu = IP_MAX_MTU - overhead;
+
+	dev->type = ARPHRD_NONE;
+	dev->flags = IFF_POINTOPOINT | IFF_NOARP;
+
+	dev->features |= feat;
+	dev->hw_features |= feat;
+	dev->hw_enc_features |= feat;
+
+	dev->needed_headroom = OVPN_HEAD_ROOM;
+	dev->needed_tailroom = OVPN_MAX_PADDING;
+}
+
+struct net_device *ovpn_iface_create(const char *name, enum ovpn_mode mode,
+				     struct net *net)
+{
+	struct ovpn_struct *ovpn;
+	struct net_device *dev;
+	int ret;
+
+	dev = alloc_netdev(sizeof(struct ovpn_struct), name, NET_NAME_USER,
+			   ovpn_setup);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev_net_set(dev, net);
+
+	ret = ovpn_struct_init(dev);
+	if (ret < 0)
+		goto err;
+
+	ovpn = netdev_priv(dev);
+	ovpn->mode = mode;
+
+	rtnl_lock();
+
+	ret = register_netdevice(dev);
+	if (ret < 0) {
+		netdev_err(dev, "cannot register interface: %d\n", ret);
+		rtnl_unlock();
+		goto err;
+	}
+
+	list_add(&ovpn->dev_list, &dev_list);
+	rtnl_unlock();
+
+	/* turn carrier explicitly off after registration, this way state is
+	 * clearly defined
+	 */
+	netif_carrier_off(dev);
+
+	return dev;
+
+err:
+	free_netdev(dev);
+	return ERR_PTR(ret);
+}
+
+void ovpn_iface_destruct(struct ovpn_struct *ovpn)
+{
+	ASSERT_RTNL();
+
+	netif_carrier_off(ovpn->dev);
+
+	ovpn->registered = false;
+
+	unregister_netdevice(ovpn->dev);
+	synchronize_net();
+}
+
 static int ovpn_netdev_notifier_call(struct notifier_block *nb,
 				     unsigned long state, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct ovpn_struct *ovpn;
 
 	if (!ovpn_dev_is_valid(dev))
 		return NOTIFY_DONE;
 
+	ovpn = netdev_priv(dev);
+
 	switch (state) {
 	case NETDEV_REGISTER:
-		/* add device to internal list for later destruction upon
-		 * unregistration
-		 */
+		ovpn->registered = true;
 		break;
 	case NETDEV_UNREGISTER:
+		/* twiddle thumbs on netns device moves */
+		if (dev->reg_state != NETREG_UNREGISTERING)
+			break;
+
 		/* can be delivered multiple times, so check registered flag,
 		 * then destroy the interface
 		 */
+		if (!ovpn->registered)
+			return NOTIFY_DONE;
+
+		ovpn_iface_destruct(ovpn);
 		break;
 	case NETDEV_POST_INIT:
 	case NETDEV_GOING_DOWN:
 	case NETDEV_DOWN:
 	case NETDEV_UP:
 	case NETDEV_PRE_UP:
+		break;
 	default:
 		return NOTIFY_DONE;
 	}
@@ -62,6 +210,24 @@  static struct notifier_block ovpn_netdev_notifier = {
 	.notifier_call = ovpn_netdev_notifier_call,
 };
 
+static void ovpn_netns_pre_exit(struct net *net)
+{
+	struct ovpn_struct *ovpn;
+
+	rtnl_lock();
+	list_for_each_entry(ovpn, &dev_list, dev_list) {
+		if (dev_net(ovpn->dev) != net)
+			continue;
+
+		ovpn_iface_destruct(ovpn);
+	}
+	rtnl_unlock();
+}
+
+static struct pernet_operations ovpn_pernet_ops = {
+	.pre_exit = ovpn_netns_pre_exit
+};
+
 static int __init ovpn_init(void)
 {
 	int err = register_netdevice_notifier(&ovpn_netdev_notifier);
@@ -71,14 +237,22 @@  static int __init ovpn_init(void)
 		return err;
 	}
 
+	err = register_pernet_device(&ovpn_pernet_ops);
+	if (err) {
+		pr_err("ovpn: can't register pernet ops: %d\n", err);
+		goto unreg_netdev;
+	}
+
 	err = ovpn_nl_register();
 	if (err) {
 		pr_err("ovpn: can't register netlink family: %d\n", err);
-		goto unreg_netdev;
+		goto unreg_pernet;
 	}
 
 	return 0;
 
+unreg_pernet:
+	unregister_pernet_device(&ovpn_pernet_ops);
 unreg_netdev:
 	unregister_netdevice_notifier(&ovpn_netdev_notifier);
 	return err;
@@ -87,6 +261,7 @@  static int __init ovpn_init(void)
 static __exit void ovpn_cleanup(void)
 {
 	ovpn_nl_unregister();
+	unregister_pernet_device(&ovpn_pernet_ops);
 	unregister_netdevice_notifier(&ovpn_netdev_notifier);
 }
 
diff --git a/drivers/net/ovpn/main.h b/drivers/net/ovpn/main.h
index 380adb593d0c..21d6bfb27d67 100644
--- a/drivers/net/ovpn/main.h
+++ b/drivers/net/ovpn/main.h
@@ -10,6 +10,30 @@ 
 #ifndef _NET_OVPN_MAIN_H_
 #define _NET_OVPN_MAIN_H_
 
+/**
+ * ovpn_iface_create - create and initialize a new 'ovpn' netdevice
+ * @name: the name of the new device
+ * @mode: the OpenVPN mode to set this device to
+ * @net: the netns this device should be created in
+ *
+ * A new netdevice is created and registered.
+ * Its private area is initialized with an empty ovpn_struct object.
+ *
+ * Return: a pointer to the new device on success or a negative error code
+ *         otherwise
+ */
+struct net_device *ovpn_iface_create(const char *name, enum ovpn_mode mode,
+				     struct net *net);
+
+/**
+ * ovpn_iface_destruct - tear down netdevice
+ * @ovpn: the ovpn instance objected related to the interface to tear down
+ *
+ * This function takes care of tearing down an ovpn device and can be invoked
+ * internally or upon UNREGISTER netdev event
+ */
+void ovpn_iface_destruct(struct ovpn_struct *ovpn);
+
 /**
  * ovpn_dev_is_valid - check if the netdevice is of type 'ovpn'
  * @dev: the interface to check
@@ -18,4 +42,11 @@ 
  */
 bool ovpn_dev_is_valid(const struct net_device *dev);
 
+#define SKB_HEADER_LEN                                       \
+	(max(sizeof(struct iphdr), sizeof(struct ipv6hdr)) + \
+	 sizeof(struct udphdr) + NET_SKB_PAD)
+
+#define OVPN_HEAD_ROOM ALIGN(16 + SKB_HEADER_LEN, 4)
+#define OVPN_MAX_PADDING 16
+
 #endif /* _NET_OVPN_MAIN_H_ */
diff --git a/drivers/net/ovpn/ovpnstruct.h b/drivers/net/ovpn/ovpnstruct.h
index ff248cad1401..ee05b8a2c61d 100644
--- a/drivers/net/ovpn/ovpnstruct.h
+++ b/drivers/net/ovpn/ovpnstruct.h
@@ -10,12 +10,20 @@ 
 #ifndef _NET_OVPN_OVPNSTRUCT_H_
 #define _NET_OVPN_OVPNSTRUCT_H_
 
+#include <uapi/linux/ovpn.h>
+
 /**
  * struct ovpn_struct - per ovpn interface state
  * @dev: the actual netdev representing the tunnel
+ * @registered: whether dev is still registered with netdev or not
+ * @mode: device operation mode (i.e. p2p, mp, ..)
+ * @dev_list: entry for the module wide device list
  */
 struct ovpn_struct {
 	struct net_device *dev;
+	bool registered;
+	enum ovpn_mode mode;
+	struct list_head dev_list;
 };
 
 #endif /* _NET_OVPN_OVPNSTRUCT_H_ */
diff --git a/drivers/net/ovpn/packet.h b/drivers/net/ovpn/packet.h
new file mode 100644
index 000000000000..7ed146f5932a
--- /dev/null
+++ b/drivers/net/ovpn/packet.h
@@ -0,0 +1,40 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2020-2024 OpenVPN, Inc.
+ *
+ *  Author:	Antonio Quartulli <antonio@openvpn.net>
+ *		James Yonan <james@openvpn.net>
+ */
+
+#ifndef _NET_OVPN_PACKET_H_
+#define _NET_OVPN_PACKET_H_
+
+/* When the OpenVPN protocol is ran in AEAD mode, use
+ * the OpenVPN packet ID as the AEAD nonce:
+ *
+ *    00000005 521c3b01 4308c041
+ *    [seq # ] [  nonce_tail   ]
+ *    [     12-byte full IV    ] -> NONCE_SIZE
+ *    [4-bytes                   -> NONCE_WIRE_SIZE
+ *    on wire]
+ */
+
+/* OpenVPN nonce size */
+#define NONCE_SIZE 12
+
+/* OpenVPN nonce size reduced by 8-byte nonce tail -- this is the
+ * size of the AEAD Associated Data (AD) sent over the wire
+ * and is normally the head of the IV
+ */
+#define NONCE_WIRE_SIZE (NONCE_SIZE - sizeof(struct ovpn_nonce_tail))
+
+/* Last 8 bytes of AEAD nonce
+ * Provided by userspace and usually derived from
+ * key material generated during TLS handshake
+ */
+struct ovpn_nonce_tail {
+	u8 u8[OVPN_NONCE_TAIL_SIZE];
+};
+
+#endif /* _NET_OVPN_PACKET_H_ */