diff mbox series

[net-next] net: geneve: support IPv4/IPv6 as inner protocol

Message ID 20220315185603.160778-1-eyal.birger@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] net: geneve: support IPv4/IPv6 as inner protocol | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4769 this patch: 4769
netdev/cc_maintainers success CCed 3 of 3 maintainers
netdev/build_clang success Errors and warnings before: 824 this patch: 824
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4924 this patch: 4924
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 179 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Eyal Birger March 15, 2022, 6:56 p.m. UTC
This patch adds support for encapsulating IPv4/IPv6 within GENEVE.

In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at
device creation. This property cannot be changed for the time being.

In case IP traffic is received on a non-tun device the drop count is
increased.

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
---
 drivers/net/geneve.c         | 79 +++++++++++++++++++++++++++---------
 include/uapi/linux/if_link.h |  1 +
 2 files changed, 61 insertions(+), 19 deletions(-)

Comments

Roopa Prabhu March 15, 2022, 7:57 p.m. UTC | #1
On 3/15/22 11:56, Eyal Birger wrote:
> This patch adds support for encapsulating IPv4/IPv6 within GENEVE.
>
> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at
> device creation. This property cannot be changed for the time being.
>
> In case IP traffic is received on a non-tun device the drop count is
> increased.
>
> Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
> ---
>   drivers/net/geneve.c         | 79 +++++++++++++++++++++++++++---------
>   include/uapi/linux/if_link.h |  1 +
>   2 files changed, 61 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> index a895ff756093..37305ec26250 100644
> --- a/drivers/net/geneve.c
> +++ b/drivers/net/geneve.c
> @@ -56,6 +56,7 @@ struct geneve_config {
>   	bool			use_udp6_rx_checksums;
>   	bool			ttl_inherit;
>   	enum ifla_geneve_df	df;
> +	bool			tun;
>   };
>   
>   /* Pseudo network device */
> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
>   		}
>   	}
>   
> -	skb_reset_mac_header(skb);
> -	skb->protocol = eth_type_trans(skb, geneve->dev);
> -	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> -
>   	if (tun_dst)
>   		skb_dst_set(skb, &tun_dst->dst);
>   
> -	/* Ignore packet loops (and multicast echo) */
> -	if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
> -		geneve->dev->stats.rx_errors++;
> -		goto drop;
> +	if (gnvh->proto_type == htons(ETH_P_TEB)) {
> +		skb_reset_mac_header(skb);
> +		skb->protocol = eth_type_trans(skb, geneve->dev);
> +		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +
> +		/* Ignore packet loops (and multicast echo) */
> +		if (ether_addr_equal(eth_hdr(skb)->h_source,
> +				     geneve->dev->dev_addr)) {
> +			geneve->dev->stats.rx_errors++;
> +			goto drop;
> +		}
> +	} else {
> +		skb_reset_mac_header(skb);
> +		skb->dev = geneve->dev;
> +		skb->pkt_type = PACKET_HOST;
>   	}
>   
>   	oiph = skb_network_header(skb);
> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   	struct genevehdr *geneveh;
>   	struct geneve_dev *geneve;
>   	struct geneve_sock *gs;
> +	__be16 inner_proto;
>   	int opts_len;
>   
>   	/* Need UDP and Geneve header to be present */
> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   	if (unlikely(geneveh->ver != GENEVE_VER))
>   		goto drop;
>   
> -	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
> +	inner_proto = geneveh->proto_type;
> +
> +	if (unlikely((inner_proto != htons(ETH_P_TEB) &&
> +		      inner_proto != htons(ETH_P_IP) &&
> +		      inner_proto != htons(ETH_P_IPV6)))) {
>   		goto drop;
> +	}
>   


unnecessary braces

>   	gs = rcu_dereference_sk_user_data(sk);
>   	if (!gs)
> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   	if (!geneve)
>   		goto drop;
>   
> +	if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) {
> +		geneve->dev->stats.rx_dropped++;
> +		goto drop;
> +	}

Does this change current default behavior ?.

its unclear to be what the current behavior is for a non ETH_P_TEB packet


> +
>   	opts_len = geneveh->opt_len * 4;
> -	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
> -				 htons(ETH_P_TEB),
> +	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
>   				 !net_eq(geneve->net, dev_net(geneve->dev)))) {
>   		geneve->dev->stats.rx_dropped++;
>   		goto drop;
> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev)
>   }
>   
>   static void geneve_build_header(struct genevehdr *geneveh,
> -				const struct ip_tunnel_info *info)
> +				const struct ip_tunnel_info *info,
> +				__be16 inner_proto)
>   {
>   	geneveh->ver = GENEVE_VER;
>   	geneveh->opt_len = info->options_len / 4;
> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh,
>   	geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
>   	geneveh->rsvd1 = 0;
>   	tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
> -	geneveh->proto_type = htons(ETH_P_TEB);
> +	geneveh->proto_type = inner_proto;
>   	geneveh->rsvd2 = 0;
>   
>   	if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh,
>   
>   static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
>   			    const struct ip_tunnel_info *info,
> -			    bool xnet, int ip_hdr_len)
> +			    bool xnet, int ip_hdr_len, bool tun)
>   {
> +	__be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB);
>   	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
>   	struct genevehdr *gnvh;
>   	int min_headroom;
> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
>   		goto free_dst;
>   
>   	gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
> -	geneve_build_header(gnvh, info);
> -	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +	geneve_build_header(gnvh, info, inner_proto);
> +	skb_set_inner_protocol(skb, inner_proto);
>   	return 0;
>   
>   free_dst:
> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
>   		}
>   	}
>   
> -	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
> +	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
> +			       geneve->cfg.tun);
>   	if (unlikely(err))
>   		return err;
>   
> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
>   			ttl = key->ttl;
>   		ttl = ttl ? : ip6_dst_hoplimit(dst);
>   	}
> -	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
> +	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
> +			       geneve->cfg.tun);
>   	if (unlikely(err))
>   		return err;
>   
> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
>   	dst_cache_reset(&geneve->cfg.info.dst_cache);
>   	memcpy(&geneve->cfg, cfg, sizeof(*cfg));
>   
> +	if (geneve->cfg.tun) {
> +		dev->header_ops = NULL;
> +		dev->type = ARPHRD_NONE;
> +		dev->hard_header_len = 0;
> +		dev->addr_len = 0;
> +		dev->flags = IFF_NOARP;
> +	}
> +
>   	err = register_netdevice(dev);
>   	if (err)
>   		return err;
> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
>   #endif
>   	}
>   
> +	if (data[IFLA_GENEVE_TUN]) {
> +		if (changelink) {
> +			attrtype = IFLA_GENEVE_TUN;
> +			goto change_notsup;
> +		}
> +		cfg->tun = true;
> +	}
> +
>   	return 0;
>   change_notsup:
>   	NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
> -			    "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported");
> +			    "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported");
>   	return -EOPNOTSUPP;
>   }
>   
> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
>   	if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
>   		goto nla_put_failure;
>   
> +	if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN))
> +		goto nla_put_failure;
> +
>   	return 0;
>   
>   nla_put_failure:
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index bd24c7dc10a2..198aefa2c513 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -842,6 +842,7 @@ enum {
>   	IFLA_GENEVE_LABEL,
>   	IFLA_GENEVE_TTL_INHERIT,
>   	IFLA_GENEVE_DF,
> +	IFLA_GENEVE_TUN,

geneve is itself called a tunnel, i wonder if a different name for the 
flag would be more appropriate.

what is the use case for this ?. is there a RFC reference ?



>   	__IFLA_GENEVE_MAX
>   };
>   #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
Eyal Birger March 15, 2022, 8:22 p.m. UTC | #2
Hi Roopa,

On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote:
>
>
> On 3/15/22 11:56, Eyal Birger wrote:
> > This patch adds support for encapsulating IPv4/IPv6 within GENEVE.
> >
> > In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at
> > device creation. This property cannot be changed for the time being.
> >
> > In case IP traffic is received on a non-tun device the drop count is
> > increased.
> >
> > Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
> > ---
> >   drivers/net/geneve.c         | 79 +++++++++++++++++++++++++++---------
> >   include/uapi/linux/if_link.h |  1 +
> >   2 files changed, 61 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> > index a895ff756093..37305ec26250 100644
> > --- a/drivers/net/geneve.c
> > +++ b/drivers/net/geneve.c
> > @@ -56,6 +56,7 @@ struct geneve_config {
> >       bool                    use_udp6_rx_checksums;
> >       bool                    ttl_inherit;
> >       enum ifla_geneve_df     df;
> > +     bool                    tun;
> >   };
> >
> >   /* Pseudo network device */
> > @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
> >               }
> >       }
> >
> > -     skb_reset_mac_header(skb);
> > -     skb->protocol = eth_type_trans(skb, geneve->dev);
> > -     skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> > -
> >       if (tun_dst)
> >               skb_dst_set(skb, &tun_dst->dst);
> >
> > -     /* Ignore packet loops (and multicast echo) */
> > -     if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
> > -             geneve->dev->stats.rx_errors++;
> > -             goto drop;
> > +     if (gnvh->proto_type == htons(ETH_P_TEB)) {
> > +             skb_reset_mac_header(skb);
> > +             skb->protocol = eth_type_trans(skb, geneve->dev);
> > +             skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> > +
> > +             /* Ignore packet loops (and multicast echo) */
> > +             if (ether_addr_equal(eth_hdr(skb)->h_source,
> > +                                  geneve->dev->dev_addr)) {
> > +                     geneve->dev->stats.rx_errors++;
> > +                     goto drop;
> > +             }
> > +     } else {
> > +             skb_reset_mac_header(skb);
> > +             skb->dev = geneve->dev;
> > +             skb->pkt_type = PACKET_HOST;
> >       }
> >
> >       oiph = skb_network_header(skb);
> > @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >       struct genevehdr *geneveh;
> >       struct geneve_dev *geneve;
> >       struct geneve_sock *gs;
> > +     __be16 inner_proto;
> >       int opts_len;
> >
> >       /* Need UDP and Geneve header to be present */
> > @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >       if (unlikely(geneveh->ver != GENEVE_VER))
> >               goto drop;
> >
> > -     if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
> > +     inner_proto = geneveh->proto_type;
> > +
> > +     if (unlikely((inner_proto != htons(ETH_P_TEB) &&
> > +                   inner_proto != htons(ETH_P_IP) &&
> > +                   inner_proto != htons(ETH_P_IPV6)))) {
> >               goto drop;
> > +     }
> >
>
>
> unnecessary braces

Will fix.

>
> >       gs = rcu_dereference_sk_user_data(sk);
> >       if (!gs)
> > @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >       if (!geneve)
> >               goto drop;
> >
> > +     if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) {
> > +             geneve->dev->stats.rx_dropped++;
> > +             goto drop;
> > +     }
>
> Does this change current default behavior ?.
>
> its unclear to be what the current behavior is for a non ETH_P_TEB packet

Currently non ETH_P_TEB packets are silently dropped.
I figured that if the driver supported other ethertypes it would make
sense to increase the count in such case, to assist in debugging wrong
configurations.

I can remove this if it's better to keep the current behavior.
>
>
> > +
> >       opts_len = geneveh->opt_len * 4;
> > -     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
> > -                              htons(ETH_P_TEB),
> > +     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
> >                                !net_eq(geneve->net, dev_net(geneve->dev)))) {
> >               geneve->dev->stats.rx_dropped++;
> >               goto drop;
> > @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev)
> >   }
> >
> >   static void geneve_build_header(struct genevehdr *geneveh,
> > -                             const struct ip_tunnel_info *info)
> > +                             const struct ip_tunnel_info *info,
> > +                             __be16 inner_proto)
> >   {
> >       geneveh->ver = GENEVE_VER;
> >       geneveh->opt_len = info->options_len / 4;
> > @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh,
> >       geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
> >       geneveh->rsvd1 = 0;
> >       tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
> > -     geneveh->proto_type = htons(ETH_P_TEB);
> > +     geneveh->proto_type = inner_proto;
> >       geneveh->rsvd2 = 0;
> >
> >       if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
> > @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh,
> >
> >   static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
> >                           const struct ip_tunnel_info *info,
> > -                         bool xnet, int ip_hdr_len)
> > +                         bool xnet, int ip_hdr_len, bool tun)
> >   {
> > +     __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB);
> >       bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
> >       struct genevehdr *gnvh;
> >       int min_headroom;
> > @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
> >               goto free_dst;
> >
> >       gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
> > -     geneve_build_header(gnvh, info);
> > -     skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> > +     geneve_build_header(gnvh, info, inner_proto);
> > +     skb_set_inner_protocol(skb, inner_proto);
> >       return 0;
> >
> >   free_dst:
> > @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
> >               }
> >       }
> >
> > -     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
> > +     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
> > +                            geneve->cfg.tun);
> >       if (unlikely(err))
> >               return err;
> >
> > @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
> >                       ttl = key->ttl;
> >               ttl = ttl ? : ip6_dst_hoplimit(dst);
> >       }
> > -     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
> > +     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
> > +                            geneve->cfg.tun);
> >       if (unlikely(err))
> >               return err;
> >
> > @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
> >       dst_cache_reset(&geneve->cfg.info.dst_cache);
> >       memcpy(&geneve->cfg, cfg, sizeof(*cfg));
> >
> > +     if (geneve->cfg.tun) {
> > +             dev->header_ops = NULL;
> > +             dev->type = ARPHRD_NONE;
> > +             dev->hard_header_len = 0;
> > +             dev->addr_len = 0;
> > +             dev->flags = IFF_NOARP;
> > +     }
> > +
> >       err = register_netdevice(dev);
> >       if (err)
> >               return err;
> > @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
> >   #endif
> >       }
> >
> > +     if (data[IFLA_GENEVE_TUN]) {
> > +             if (changelink) {
> > +                     attrtype = IFLA_GENEVE_TUN;
> > +                     goto change_notsup;
> > +             }
> > +             cfg->tun = true;
> > +     }
> > +
> >       return 0;
> >   change_notsup:
> >       NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
> > -                         "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported");
> > +                         "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported");
> >       return -EOPNOTSUPP;
> >   }
> >
> > @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
> >       if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
> >               goto nla_put_failure;
> >
> > +     if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN))
> > +             goto nla_put_failure;
> > +
> >       return 0;
> >
> >   nla_put_failure:
> > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> > index bd24c7dc10a2..198aefa2c513 100644
> > --- a/include/uapi/linux/if_link.h
> > +++ b/include/uapi/linux/if_link.h
> > @@ -842,6 +842,7 @@ enum {
> >       IFLA_GENEVE_LABEL,
> >       IFLA_GENEVE_TTL_INHERIT,
> >       IFLA_GENEVE_DF,
> > +     IFLA_GENEVE_TUN,
>
> geneve is itself called a tunnel, i wonder if a different name for the
> flag would be more appropriate.

I tried to find a reference to something similar, so went with something like
the tun vs. tap distinction. I was also concerned about the possible
confusion, but it felt clearer than something like L3_INNER_PROTO_MODE.

I'd happily replace it with a better suggestion.

>
> what is the use case for this ?. is there a RFC reference ?

I stumbled upon this configuration when trying to receive packets from an
AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1].

But to my understanding the RFC allows this so it doesn't seem something
specific to AWS.

Thanks for the review!

Eyal.

[1] https://aws.amazon.com/blogs/networking-and-content-delivery/integrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer/

>
>
>
> >       __IFLA_GENEVE_MAX
> >   };
> >   #define IFLA_GENEVE_MAX     (__IFLA_GENEVE_MAX - 1)
Roopa Prabhu March 16, 2022, 12:32 a.m. UTC | #3
On 3/15/22 13:22, Eyal Birger wrote:
> Hi Roopa,
>
> On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote:
>>
>> On 3/15/22 11:56, Eyal Birger wrote:
>>> This patch adds support for encapsulating IPv4/IPv6 within GENEVE.
>>>
>>> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at
>>> device creation. This property cannot be changed for the time being.
>>>
>>> In case IP traffic is received on a non-tun device the drop count is
>>> increased.
>>>
>>> Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
>>> ---
>>>    drivers/net/geneve.c         | 79 +++++++++++++++++++++++++++---------
>>>    include/uapi/linux/if_link.h |  1 +
>>>    2 files changed, 61 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
>>> index a895ff756093..37305ec26250 100644
>>> --- a/drivers/net/geneve.c
>>> +++ b/drivers/net/geneve.c
>>> @@ -56,6 +56,7 @@ struct geneve_config {
>>>        bool                    use_udp6_rx_checksums;
>>>        bool                    ttl_inherit;
>>>        enum ifla_geneve_df     df;
>>> +     bool                    tun;
>>>    };
>>>
>>>    /* Pseudo network device */
>>> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
>>>                }
>>>        }
>>>
>>> -     skb_reset_mac_header(skb);
>>> -     skb->protocol = eth_type_trans(skb, geneve->dev);
>>> -     skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
>>> -
>>>        if (tun_dst)
>>>                skb_dst_set(skb, &tun_dst->dst);
>>>
>>> -     /* Ignore packet loops (and multicast echo) */
>>> -     if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
>>> -             geneve->dev->stats.rx_errors++;
>>> -             goto drop;
>>> +     if (gnvh->proto_type == htons(ETH_P_TEB)) {
>>> +             skb_reset_mac_header(skb);
>>> +             skb->protocol = eth_type_trans(skb, geneve->dev);
>>> +             skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
>>> +
>>> +             /* Ignore packet loops (and multicast echo) */
>>> +             if (ether_addr_equal(eth_hdr(skb)->h_source,
>>> +                                  geneve->dev->dev_addr)) {
>>> +                     geneve->dev->stats.rx_errors++;
>>> +                     goto drop;
>>> +             }
>>> +     } else {
>>> +             skb_reset_mac_header(skb);
>>> +             skb->dev = geneve->dev;
>>> +             skb->pkt_type = PACKET_HOST;
>>>        }
>>>
>>>        oiph = skb_network_header(skb);
>>> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>        struct genevehdr *geneveh;
>>>        struct geneve_dev *geneve;
>>>        struct geneve_sock *gs;
>>> +     __be16 inner_proto;
>>>        int opts_len;
>>>
>>>        /* Need UDP and Geneve header to be present */
>>> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>        if (unlikely(geneveh->ver != GENEVE_VER))
>>>                goto drop;
>>>
>>> -     if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
>>> +     inner_proto = geneveh->proto_type;
>>> +
>>> +     if (unlikely((inner_proto != htons(ETH_P_TEB) &&
>>> +                   inner_proto != htons(ETH_P_IP) &&
>>> +                   inner_proto != htons(ETH_P_IPV6)))) {
>>>                goto drop;
>>> +     }
>>>
>>
>> unnecessary braces
> Will fix.
>
>>>        gs = rcu_dereference_sk_user_data(sk);
>>>        if (!gs)
>>> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>        if (!geneve)
>>>                goto drop;
>>>
>>> +     if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) {
>>> +             geneve->dev->stats.rx_dropped++;
>>> +             goto drop;
>>> +     }
>> Does this change current default behavior ?.
>>
>> its unclear to be what the current behavior is for a non ETH_P_TEB packet
> Currently non ETH_P_TEB packets are silently dropped.
> I figured that if the driver supported other ethertypes it would make
> sense to increase the count in such case, to assist in debugging wrong
> configurations.
>
> I can remove this if it's better to keep the current behavior.

yes, agree. counting is better.


>>
>>> +
>>>        opts_len = geneveh->opt_len * 4;
>>> -     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
>>> -                              htons(ETH_P_TEB),
>>> +     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
>>>                                 !net_eq(geneve->net, dev_net(geneve->dev)))) {
>>>                geneve->dev->stats.rx_dropped++;
>>>                goto drop;
>>> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev)
>>>    }
>>>
>>>    static void geneve_build_header(struct genevehdr *geneveh,
>>> -                             const struct ip_tunnel_info *info)
>>> +                             const struct ip_tunnel_info *info,
>>> +                             __be16 inner_proto)
>>>    {
>>>        geneveh->ver = GENEVE_VER;
>>>        geneveh->opt_len = info->options_len / 4;
>>> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh,
>>>        geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
>>>        geneveh->rsvd1 = 0;
>>>        tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
>>> -     geneveh->proto_type = htons(ETH_P_TEB);
>>> +     geneveh->proto_type = inner_proto;
>>>        geneveh->rsvd2 = 0;
>>>
>>>        if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
>>> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh,
>>>
>>>    static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
>>>                            const struct ip_tunnel_info *info,
>>> -                         bool xnet, int ip_hdr_len)
>>> +                         bool xnet, int ip_hdr_len, bool tun)
>>>    {
>>> +     __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB);
>>>        bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
>>>        struct genevehdr *gnvh;
>>>        int min_headroom;
>>> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
>>>                goto free_dst;
>>>
>>>        gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
>>> -     geneve_build_header(gnvh, info);
>>> -     skb_set_inner_protocol(skb, htons(ETH_P_TEB));
>>> +     geneve_build_header(gnvh, info, inner_proto);
>>> +     skb_set_inner_protocol(skb, inner_proto);
>>>        return 0;
>>>
>>>    free_dst:
>>> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
>>>                }
>>>        }
>>>
>>> -     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
>>> +     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
>>> +                            geneve->cfg.tun);
>>>        if (unlikely(err))
>>>                return err;
>>>
>>> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
>>>                        ttl = key->ttl;
>>>                ttl = ttl ? : ip6_dst_hoplimit(dst);
>>>        }
>>> -     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
>>> +     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
>>> +                            geneve->cfg.tun);
>>>        if (unlikely(err))
>>>                return err;
>>>
>>> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
>>>        dst_cache_reset(&geneve->cfg.info.dst_cache);
>>>        memcpy(&geneve->cfg, cfg, sizeof(*cfg));
>>>
>>> +     if (geneve->cfg.tun) {
>>> +             dev->header_ops = NULL;
>>> +             dev->type = ARPHRD_NONE;
>>> +             dev->hard_header_len = 0;
>>> +             dev->addr_len = 0;
>>> +             dev->flags = IFF_NOARP;
>>> +     }
>>> +
>>>        err = register_netdevice(dev);
>>>        if (err)
>>>                return err;
>>> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
>>>    #endif
>>>        }
>>>
>>> +     if (data[IFLA_GENEVE_TUN]) {
>>> +             if (changelink) {
>>> +                     attrtype = IFLA_GENEVE_TUN;
>>> +                     goto change_notsup;
>>> +             }
>>> +             cfg->tun = true;
>>> +     }
>>> +
>>>        return 0;
>>>    change_notsup:
>>>        NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
>>> -                         "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported");
>>> +                         "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported");
>>>        return -EOPNOTSUPP;
>>>    }
>>>
>>> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
>>>        if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
>>>                goto nla_put_failure;
>>>
>>> +     if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN))
>>> +             goto nla_put_failure;
>>> +
>>>        return 0;
>>>
>>>    nla_put_failure:
>>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
>>> index bd24c7dc10a2..198aefa2c513 100644
>>> --- a/include/uapi/linux/if_link.h
>>> +++ b/include/uapi/linux/if_link.h
>>> @@ -842,6 +842,7 @@ enum {
>>>        IFLA_GENEVE_LABEL,
>>>        IFLA_GENEVE_TTL_INHERIT,
>>>        IFLA_GENEVE_DF,
>>> +     IFLA_GENEVE_TUN,
>> geneve is itself called a tunnel, i wonder if a different name for the
>> flag would be more appropriate.
> I tried to find a reference to something similar, so went with something like
> the tun vs. tap distinction. I was also concerned about the possible
> confusion, but it felt clearer than something like L3_INNER_PROTO_MODE.
>
> I'd happily replace it with a better suggestion.

o ok, makes sense. I can't think of anything other than simply 
IFLA_GENEVE_INNER_PROTO

(maybe others have better suggestions)


>
>> what is the use case for this ?. is there a RFC reference ?
> I stumbled upon this configuration when trying to receive packets from an
> AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1].
>
> But to my understanding the RFC allows this so it doesn't seem something
> specific to AWS.

that makes me wonder if we need a knob atall and if this should be 
allowed by default.

wonder how other network vendor standard geneve implementations behave 
by default.


>
> Thanks for the review!
>
> Eyal.
>
> [1] https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fnetworking-and-content-delivery%2Fintegrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer%2F&amp;data=04%7C01%7Croopa%40nvidia.com%7C223c13c616ef430a487f08da06c19610%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637829725767772379%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=ZQTDrFJ8LLn5SdN6h%2B5NECxwlD9PAGV2KzpVUV%2B1chc%3D&amp;reserved=0

Thanks for the details.
Eyal Birger March 16, 2022, 3:50 a.m. UTC | #4
On Wed, Mar 16, 2022 at 2:33 AM Roopa Prabhu <roopa@nvidia.com> wrote:
>
>
> On 3/15/22 13:22, Eyal Birger wrote:
> > Hi Roopa,
> >
> > On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote:
> >>
> >> On 3/15/22 11:56, Eyal Birger wrote:
> >>> This patch adds support for encapsulating IPv4/IPv6 within GENEVE.
> >>>
> >>> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at
> >>> device creation. This property cannot be changed for the time being.
> >>>
> >>> In case IP traffic is received on a non-tun device the drop count is
> >>> increased.
> >>>
> >>> Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
> >>> ---
> >>>    drivers/net/geneve.c         | 79 +++++++++++++++++++++++++++---------
> >>>    include/uapi/linux/if_link.h |  1 +
> >>>    2 files changed, 61 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> >>> index a895ff756093..37305ec26250 100644
> >>> --- a/drivers/net/geneve.c
> >>> +++ b/drivers/net/geneve.c
> >>> @@ -56,6 +56,7 @@ struct geneve_config {
> >>>        bool                    use_udp6_rx_checksums;
> >>>        bool                    ttl_inherit;
> >>>        enum ifla_geneve_df     df;
> >>> +     bool                    tun;
> >>>    };
> >>>
> >>>    /* Pseudo network device */
> >>> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
> >>>                }
> >>>        }
> >>>
> >>> -     skb_reset_mac_header(skb);
> >>> -     skb->protocol = eth_type_trans(skb, geneve->dev);
> >>> -     skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> >>> -
> >>>        if (tun_dst)
> >>>                skb_dst_set(skb, &tun_dst->dst);
> >>>
> >>> -     /* Ignore packet loops (and multicast echo) */
> >>> -     if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
> >>> -             geneve->dev->stats.rx_errors++;
> >>> -             goto drop;
> >>> +     if (gnvh->proto_type == htons(ETH_P_TEB)) {
> >>> +             skb_reset_mac_header(skb);
> >>> +             skb->protocol = eth_type_trans(skb, geneve->dev);
> >>> +             skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> >>> +
> >>> +             /* Ignore packet loops (and multicast echo) */
> >>> +             if (ether_addr_equal(eth_hdr(skb)->h_source,
> >>> +                                  geneve->dev->dev_addr)) {
> >>> +                     geneve->dev->stats.rx_errors++;
> >>> +                     goto drop;
> >>> +             }
> >>> +     } else {
> >>> +             skb_reset_mac_header(skb);
> >>> +             skb->dev = geneve->dev;
> >>> +             skb->pkt_type = PACKET_HOST;
> >>>        }
> >>>
> >>>        oiph = skb_network_header(skb);
> >>> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >>>        struct genevehdr *geneveh;
> >>>        struct geneve_dev *geneve;
> >>>        struct geneve_sock *gs;
> >>> +     __be16 inner_proto;
> >>>        int opts_len;
> >>>
> >>>        /* Need UDP and Geneve header to be present */
> >>> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >>>        if (unlikely(geneveh->ver != GENEVE_VER))
> >>>                goto drop;
> >>>
> >>> -     if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
> >>> +     inner_proto = geneveh->proto_type;
> >>> +
> >>> +     if (unlikely((inner_proto != htons(ETH_P_TEB) &&
> >>> +                   inner_proto != htons(ETH_P_IP) &&
> >>> +                   inner_proto != htons(ETH_P_IPV6)))) {
> >>>                goto drop;
> >>> +     }
> >>>
> >>
> >> unnecessary braces
> > Will fix.
> >
> >>>        gs = rcu_dereference_sk_user_data(sk);
> >>>        if (!gs)
> >>> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >>>        if (!geneve)
> >>>                goto drop;
> >>>
> >>> +     if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) {
> >>> +             geneve->dev->stats.rx_dropped++;
> >>> +             goto drop;
> >>> +     }
> >> Does this change current default behavior ?.
> >>
> >> its unclear to be what the current behavior is for a non ETH_P_TEB packet
> > Currently non ETH_P_TEB packets are silently dropped.
> > I figured that if the driver supported other ethertypes it would make
> > sense to increase the count in such case, to assist in debugging wrong
> > configurations.
> >
> > I can remove this if it's better to keep the current behavior.
>
> yes, agree. counting is better.
>
>
> >>
> >>> +
> >>>        opts_len = geneveh->opt_len * 4;
> >>> -     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
> >>> -                              htons(ETH_P_TEB),
> >>> +     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
> >>>                                 !net_eq(geneve->net, dev_net(geneve->dev)))) {
> >>>                geneve->dev->stats.rx_dropped++;
> >>>                goto drop;
> >>> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev)
> >>>    }
> >>>
> >>>    static void geneve_build_header(struct genevehdr *geneveh,
> >>> -                             const struct ip_tunnel_info *info)
> >>> +                             const struct ip_tunnel_info *info,
> >>> +                             __be16 inner_proto)
> >>>    {
> >>>        geneveh->ver = GENEVE_VER;
> >>>        geneveh->opt_len = info->options_len / 4;
> >>> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh,
> >>>        geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
> >>>        geneveh->rsvd1 = 0;
> >>>        tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
> >>> -     geneveh->proto_type = htons(ETH_P_TEB);
> >>> +     geneveh->proto_type = inner_proto;
> >>>        geneveh->rsvd2 = 0;
> >>>
> >>>        if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
> >>> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh,
> >>>
> >>>    static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
> >>>                            const struct ip_tunnel_info *info,
> >>> -                         bool xnet, int ip_hdr_len)
> >>> +                         bool xnet, int ip_hdr_len, bool tun)
> >>>    {
> >>> +     __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB);
> >>>        bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
> >>>        struct genevehdr *gnvh;
> >>>        int min_headroom;
> >>> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
> >>>                goto free_dst;
> >>>
> >>>        gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
> >>> -     geneve_build_header(gnvh, info);
> >>> -     skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> >>> +     geneve_build_header(gnvh, info, inner_proto);
> >>> +     skb_set_inner_protocol(skb, inner_proto);
> >>>        return 0;
> >>>
> >>>    free_dst:
> >>> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
> >>>                }
> >>>        }
> >>>
> >>> -     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
> >>> +     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
> >>> +                            geneve->cfg.tun);
> >>>        if (unlikely(err))
> >>>                return err;
> >>>
> >>> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
> >>>                        ttl = key->ttl;
> >>>                ttl = ttl ? : ip6_dst_hoplimit(dst);
> >>>        }
> >>> -     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
> >>> +     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
> >>> +                            geneve->cfg.tun);
> >>>        if (unlikely(err))
> >>>                return err;
> >>>
> >>> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
> >>>        dst_cache_reset(&geneve->cfg.info.dst_cache);
> >>>        memcpy(&geneve->cfg, cfg, sizeof(*cfg));
> >>>
> >>> +     if (geneve->cfg.tun) {
> >>> +             dev->header_ops = NULL;
> >>> +             dev->type = ARPHRD_NONE;
> >>> +             dev->hard_header_len = 0;
> >>> +             dev->addr_len = 0;
> >>> +             dev->flags = IFF_NOARP;
> >>> +     }
> >>> +
> >>>        err = register_netdevice(dev);
> >>>        if (err)
> >>>                return err;
> >>> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
> >>>    #endif
> >>>        }
> >>>
> >>> +     if (data[IFLA_GENEVE_TUN]) {
> >>> +             if (changelink) {
> >>> +                     attrtype = IFLA_GENEVE_TUN;
> >>> +                     goto change_notsup;
> >>> +             }
> >>> +             cfg->tun = true;
> >>> +     }
> >>> +
> >>>        return 0;
> >>>    change_notsup:
> >>>        NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
> >>> -                         "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported");
> >>> +                         "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported");
> >>>        return -EOPNOTSUPP;
> >>>    }
> >>>
> >>> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
> >>>        if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
> >>>                goto nla_put_failure;
> >>>
> >>> +     if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN))
> >>> +             goto nla_put_failure;
> >>> +
> >>>        return 0;
> >>>
> >>>    nla_put_failure:
> >>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> >>> index bd24c7dc10a2..198aefa2c513 100644
> >>> --- a/include/uapi/linux/if_link.h
> >>> +++ b/include/uapi/linux/if_link.h
> >>> @@ -842,6 +842,7 @@ enum {
> >>>        IFLA_GENEVE_LABEL,
> >>>        IFLA_GENEVE_TTL_INHERIT,
> >>>        IFLA_GENEVE_DF,
> >>> +     IFLA_GENEVE_TUN,
> >> geneve is itself called a tunnel, i wonder if a different name for the
> >> flag would be more appropriate.
> > I tried to find a reference to something similar, so went with something like
> > the tun vs. tap distinction. I was also concerned about the possible
> > confusion, but it felt clearer than something like L3_INNER_PROTO_MODE.
> >
> > I'd happily replace it with a better suggestion.
>
> o ok, makes sense. I can't think of anything other than simply
> IFLA_GENEVE_INNER_PROTO
>
> (maybe others have better suggestions)

My concern with calling it IFLA_GENEVE_INNER_PROTO is that inner_proto is
used internally to denote the inner proto value.

Would IFLA_GENEVE_INNER_PROTO_INHERIT make sense?

>
>
> >
> >> what is the use case for this ?. is there a RFC reference ?
> > I stumbled upon this configuration when trying to receive packets from an
> > AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1].
> >
> > But to my understanding the RFC allows this so it doesn't seem something
> > specific to AWS.
>
> that makes me wonder if we need a knob atall and if this should be
> allowed by default.

I didn't find a way to make tx work without requiring a flag, so I thought
it'd be better if this mode was enforced in both directions.

>
> wonder how other network vendor standard geneve implementations behave
> by default.
>
>
> >
> > Thanks for the review!
> >
> > Eyal.
> >
> > [1] https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fnetworking-and-content-delivery%2Fintegrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer%2F&amp;data=04%7C01%7Croopa%40nvidia.com%7C223c13c616ef430a487f08da06c19610%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637829725767772379%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=ZQTDrFJ8LLn5SdN6h%2B5NECxwlD9PAGV2KzpVUV%2B1chc%3D&amp;reserved=0
>
> Thanks for the details.
>
>
Roopa Prabhu March 16, 2022, 5:09 a.m. UTC | #5
On 3/15/22 20:50, Eyal Birger wrote:
> On Wed, Mar 16, 2022 at 2:33 AM Roopa Prabhu <roopa@nvidia.com> wrote:
>>
>> On 3/15/22 13:22, Eyal Birger wrote:
>>> Hi Roopa,
>>>
>>> On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote:
>>>> On 3/15/22 11:56, Eyal Birger wrote:
>>>>> This patch adds support for encapsulating IPv4/IPv6 within GENEVE.
>>>>>
>>>>> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at
>>>>> device creation. This property cannot be changed for the time being.
>>>>>
>>>>> In case IP traffic is received on a non-tun device the drop count is
>>>>> increased.
>>>>>
>>>>> Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
>>>>> ---
>>>>>     drivers/net/geneve.c         | 79 +++++++++++++++++++++++++++---------
>>>>>     include/uapi/linux/if_link.h |  1 +
>>>>>     2 files changed, 61 insertions(+), 19 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
>>>>> index a895ff756093..37305ec26250 100644
>>>>> --- a/drivers/net/geneve.c
>>>>> +++ b/drivers/net/geneve.c
>>>>> @@ -56,6 +56,7 @@ struct geneve_config {
>>>>>         bool                    use_udp6_rx_checksums;
>>>>>         bool                    ttl_inherit;
>>>>>         enum ifla_geneve_df     df;
>>>>> +     bool                    tun;
>>>>>     };
>>>>>
>>>>>     /* Pseudo network device */
>>>>> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
>>>>>                 }
>>>>>         }
>>>>>
>>>>> -     skb_reset_mac_header(skb);
>>>>> -     skb->protocol = eth_type_trans(skb, geneve->dev);
>>>>> -     skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
>>>>> -
>>>>>         if (tun_dst)
>>>>>                 skb_dst_set(skb, &tun_dst->dst);
>>>>>
>>>>> -     /* Ignore packet loops (and multicast echo) */
>>>>> -     if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
>>>>> -             geneve->dev->stats.rx_errors++;
>>>>> -             goto drop;
>>>>> +     if (gnvh->proto_type == htons(ETH_P_TEB)) {
>>>>> +             skb_reset_mac_header(skb);
>>>>> +             skb->protocol = eth_type_trans(skb, geneve->dev);
>>>>> +             skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
>>>>> +
>>>>> +             /* Ignore packet loops (and multicast echo) */
>>>>> +             if (ether_addr_equal(eth_hdr(skb)->h_source,
>>>>> +                                  geneve->dev->dev_addr)) {
>>>>> +                     geneve->dev->stats.rx_errors++;
>>>>> +                     goto drop;
>>>>> +             }
>>>>> +     } else {
>>>>> +             skb_reset_mac_header(skb);
>>>>> +             skb->dev = geneve->dev;
>>>>> +             skb->pkt_type = PACKET_HOST;
>>>>>         }
>>>>>
>>>>>         oiph = skb_network_header(skb);
>>>>> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>>>         struct genevehdr *geneveh;
>>>>>         struct geneve_dev *geneve;
>>>>>         struct geneve_sock *gs;
>>>>> +     __be16 inner_proto;
>>>>>         int opts_len;
>>>>>
>>>>>         /* Need UDP and Geneve header to be present */
>>>>> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>>>         if (unlikely(geneveh->ver != GENEVE_VER))
>>>>>                 goto drop;
>>>>>
>>>>> -     if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
>>>>> +     inner_proto = geneveh->proto_type;
>>>>> +
>>>>> +     if (unlikely((inner_proto != htons(ETH_P_TEB) &&
>>>>> +                   inner_proto != htons(ETH_P_IP) &&
>>>>> +                   inner_proto != htons(ETH_P_IPV6)))) {
>>>>>                 goto drop;
>>>>> +     }
>>>>>
>>>> unnecessary braces
>>> Will fix.
>>>
>>>>>         gs = rcu_dereference_sk_user_data(sk);
>>>>>         if (!gs)
>>>>> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>>>>>         if (!geneve)
>>>>>                 goto drop;
>>>>>
>>>>> +     if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) {
>>>>> +             geneve->dev->stats.rx_dropped++;
>>>>> +             goto drop;
>>>>> +     }
>>>> Does this change current default behavior ?.
>>>>
>>>> its unclear to be what the current behavior is for a non ETH_P_TEB packet
>>> Currently non ETH_P_TEB packets are silently dropped.
>>> I figured that if the driver supported other ethertypes it would make
>>> sense to increase the count in such case, to assist in debugging wrong
>>> configurations.
>>>
>>> I can remove this if it's better to keep the current behavior.
>> yes, agree. counting is better.
>>
>>
>>>>> +
>>>>>         opts_len = geneveh->opt_len * 4;
>>>>> -     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
>>>>> -                              htons(ETH_P_TEB),
>>>>> +     if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
>>>>>                                  !net_eq(geneve->net, dev_net(geneve->dev)))) {
>>>>>                 geneve->dev->stats.rx_dropped++;
>>>>>                 goto drop;
>>>>> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev)
>>>>>     }
>>>>>
>>>>>     static void geneve_build_header(struct genevehdr *geneveh,
>>>>> -                             const struct ip_tunnel_info *info)
>>>>> +                             const struct ip_tunnel_info *info,
>>>>> +                             __be16 inner_proto)
>>>>>     {
>>>>>         geneveh->ver = GENEVE_VER;
>>>>>         geneveh->opt_len = info->options_len / 4;
>>>>> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh,
>>>>>         geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
>>>>>         geneveh->rsvd1 = 0;
>>>>>         tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
>>>>> -     geneveh->proto_type = htons(ETH_P_TEB);
>>>>> +     geneveh->proto_type = inner_proto;
>>>>>         geneveh->rsvd2 = 0;
>>>>>
>>>>>         if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
>>>>> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh,
>>>>>
>>>>>     static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
>>>>>                             const struct ip_tunnel_info *info,
>>>>> -                         bool xnet, int ip_hdr_len)
>>>>> +                         bool xnet, int ip_hdr_len, bool tun)
>>>>>     {
>>>>> +     __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB);
>>>>>         bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
>>>>>         struct genevehdr *gnvh;
>>>>>         int min_headroom;
>>>>> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
>>>>>                 goto free_dst;
>>>>>
>>>>>         gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
>>>>> -     geneve_build_header(gnvh, info);
>>>>> -     skb_set_inner_protocol(skb, htons(ETH_P_TEB));
>>>>> +     geneve_build_header(gnvh, info, inner_proto);
>>>>> +     skb_set_inner_protocol(skb, inner_proto);
>>>>>         return 0;
>>>>>
>>>>>     free_dst:
>>>>> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
>>>>>                 }
>>>>>         }
>>>>>
>>>>> -     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
>>>>> +     err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
>>>>> +                            geneve->cfg.tun);
>>>>>         if (unlikely(err))
>>>>>                 return err;
>>>>>
>>>>> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
>>>>>                         ttl = key->ttl;
>>>>>                 ttl = ttl ? : ip6_dst_hoplimit(dst);
>>>>>         }
>>>>> -     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
>>>>> +     err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
>>>>> +                            geneve->cfg.tun);
>>>>>         if (unlikely(err))
>>>>>                 return err;
>>>>>
>>>>> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
>>>>>         dst_cache_reset(&geneve->cfg.info.dst_cache);
>>>>>         memcpy(&geneve->cfg, cfg, sizeof(*cfg));
>>>>>
>>>>> +     if (geneve->cfg.tun) {
>>>>> +             dev->header_ops = NULL;
>>>>> +             dev->type = ARPHRD_NONE;
>>>>> +             dev->hard_header_len = 0;
>>>>> +             dev->addr_len = 0;
>>>>> +             dev->flags = IFF_NOARP;
>>>>> +     }
>>>>> +
>>>>>         err = register_netdevice(dev);
>>>>>         if (err)
>>>>>                 return err;
>>>>> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
>>>>>     #endif
>>>>>         }
>>>>>
>>>>> +     if (data[IFLA_GENEVE_TUN]) {
>>>>> +             if (changelink) {
>>>>> +                     attrtype = IFLA_GENEVE_TUN;
>>>>> +                     goto change_notsup;
>>>>> +             }
>>>>> +             cfg->tun = true;
>>>>> +     }
>>>>> +
>>>>>         return 0;
>>>>>     change_notsup:
>>>>>         NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
>>>>> -                         "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported");
>>>>> +                         "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported");
>>>>>         return -EOPNOTSUPP;
>>>>>     }
>>>>>
>>>>> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
>>>>>         if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
>>>>>                 goto nla_put_failure;
>>>>>
>>>>> +     if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN))
>>>>> +             goto nla_put_failure;
>>>>> +
>>>>>         return 0;
>>>>>
>>>>>     nla_put_failure:
>>>>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
>>>>> index bd24c7dc10a2..198aefa2c513 100644
>>>>> --- a/include/uapi/linux/if_link.h
>>>>> +++ b/include/uapi/linux/if_link.h
>>>>> @@ -842,6 +842,7 @@ enum {
>>>>>         IFLA_GENEVE_LABEL,
>>>>>         IFLA_GENEVE_TTL_INHERIT,
>>>>>         IFLA_GENEVE_DF,
>>>>> +     IFLA_GENEVE_TUN,
>>>> geneve is itself called a tunnel, i wonder if a different name for the
>>>> flag would be more appropriate.
>>> I tried to find a reference to something similar, so went with something like
>>> the tun vs. tap distinction. I was also concerned about the possible
>>> confusion, but it felt clearer than something like L3_INNER_PROTO_MODE.
>>>
>>> I'd happily replace it with a better suggestion.
>> o ok, makes sense. I can't think of anything other than simply
>> IFLA_GENEVE_INNER_PROTO
>>
>> (maybe others have better suggestions)
> My concern with calling it IFLA_GENEVE_INNER_PROTO is that inner_proto is
> used internally to denote the inner proto value.
>
> Would IFLA_GENEVE_INNER_PROTO_INHERIT make sense?

yes, i like that better (and there is precedence to using inherit)


>>
>>>> what is the use case for this ?. is there a RFC reference ?
>>> I stumbled upon this configuration when trying to receive packets from an
>>> AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1].
>>>
>>> But to my understanding the RFC allows this so it doesn't seem something
>>> specific to AWS.
>> that makes me wonder if we need a knob atall and if this should be
>> allowed by default.
> I didn't find a way to make tx work without requiring a flag, so I thought
> it'd be better if this mode was enforced in both directions.

ok, in that case flag is ok.


>
>> wonder how other network vendor standard geneve implementations behave
>> by default.
>>
>>
>>> Thanks for the review!
>>>
>>> Eyal.
>>>
>>> [1] https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fnetworking-and-content-delivery%2Fintegrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer%2F&amp;data=04%7C01%7Croopa%40nvidia.com%7C15818ae5c7f949db55f908da07002ccd%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637829994585887100%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=a4hU8aizVq%2Bp4ETdpn4oNIUAkNT%2FPkDiVtTGr8ksBts%3D&amp;reserved=0
>> Thanks for the details.
>>
>>
diff mbox series

Patch

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index a895ff756093..37305ec26250 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -56,6 +56,7 @@  struct geneve_config {
 	bool			use_udp6_rx_checksums;
 	bool			ttl_inherit;
 	enum ifla_geneve_df	df;
+	bool			tun;
 };
 
 /* Pseudo network device */
@@ -251,17 +252,24 @@  static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
 		}
 	}
 
-	skb_reset_mac_header(skb);
-	skb->protocol = eth_type_trans(skb, geneve->dev);
-	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
-
 	if (tun_dst)
 		skb_dst_set(skb, &tun_dst->dst);
 
-	/* Ignore packet loops (and multicast echo) */
-	if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
-		geneve->dev->stats.rx_errors++;
-		goto drop;
+	if (gnvh->proto_type == htons(ETH_P_TEB)) {
+		skb_reset_mac_header(skb);
+		skb->protocol = eth_type_trans(skb, geneve->dev);
+		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+		/* Ignore packet loops (and multicast echo) */
+		if (ether_addr_equal(eth_hdr(skb)->h_source,
+				     geneve->dev->dev_addr)) {
+			geneve->dev->stats.rx_errors++;
+			goto drop;
+		}
+	} else {
+		skb_reset_mac_header(skb);
+		skb->dev = geneve->dev;
+		skb->pkt_type = PACKET_HOST;
 	}
 
 	oiph = skb_network_header(skb);
@@ -345,6 +353,7 @@  static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	struct genevehdr *geneveh;
 	struct geneve_dev *geneve;
 	struct geneve_sock *gs;
+	__be16 inner_proto;
 	int opts_len;
 
 	/* Need UDP and Geneve header to be present */
@@ -356,8 +365,13 @@  static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(geneveh->ver != GENEVE_VER))
 		goto drop;
 
-	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+	inner_proto = geneveh->proto_type;
+
+	if (unlikely((inner_proto != htons(ETH_P_TEB) &&
+		      inner_proto != htons(ETH_P_IP) &&
+		      inner_proto != htons(ETH_P_IPV6)))) {
 		goto drop;
+	}
 
 	gs = rcu_dereference_sk_user_data(sk);
 	if (!gs)
@@ -367,9 +381,13 @@  static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!geneve)
 		goto drop;
 
+	if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) {
+		geneve->dev->stats.rx_dropped++;
+		goto drop;
+	}
+
 	opts_len = geneveh->opt_len * 4;
-	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
-				 htons(ETH_P_TEB),
+	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
 				 !net_eq(geneve->net, dev_net(geneve->dev)))) {
 		geneve->dev->stats.rx_dropped++;
 		goto drop;
@@ -717,7 +735,8 @@  static int geneve_stop(struct net_device *dev)
 }
 
 static void geneve_build_header(struct genevehdr *geneveh,
-				const struct ip_tunnel_info *info)
+				const struct ip_tunnel_info *info,
+				__be16 inner_proto)
 {
 	geneveh->ver = GENEVE_VER;
 	geneveh->opt_len = info->options_len / 4;
@@ -725,7 +744,7 @@  static void geneve_build_header(struct genevehdr *geneveh,
 	geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
 	geneveh->rsvd1 = 0;
 	tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
-	geneveh->proto_type = htons(ETH_P_TEB);
+	geneveh->proto_type = inner_proto;
 	geneveh->rsvd2 = 0;
 
 	if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
@@ -734,8 +753,9 @@  static void geneve_build_header(struct genevehdr *geneveh,
 
 static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
 			    const struct ip_tunnel_info *info,
-			    bool xnet, int ip_hdr_len)
+			    bool xnet, int ip_hdr_len, bool tun)
 {
+	__be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB);
 	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 	struct genevehdr *gnvh;
 	int min_headroom;
@@ -755,8 +775,8 @@  static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
 		goto free_dst;
 
 	gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
-	geneve_build_header(gnvh, info);
-	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+	geneve_build_header(gnvh, info, inner_proto);
+	skb_set_inner_protocol(skb, inner_proto);
 	return 0;
 
 free_dst:
@@ -959,7 +979,8 @@  static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
+	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
+			       geneve->cfg.tun);
 	if (unlikely(err))
 		return err;
 
@@ -1038,7 +1059,8 @@  static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			ttl = key->ttl;
 		ttl = ttl ? : ip6_dst_hoplimit(dst);
 	}
-	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
+	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
+			       geneve->cfg.tun);
 	if (unlikely(err))
 		return err;
 
@@ -1388,6 +1410,14 @@  static int geneve_configure(struct net *net, struct net_device *dev,
 	dst_cache_reset(&geneve->cfg.info.dst_cache);
 	memcpy(&geneve->cfg, cfg, sizeof(*cfg));
 
+	if (geneve->cfg.tun) {
+		dev->header_ops = NULL;
+		dev->type = ARPHRD_NONE;
+		dev->hard_header_len = 0;
+		dev->addr_len = 0;
+		dev->flags = IFF_NOARP;
+	}
+
 	err = register_netdevice(dev);
 	if (err)
 		return err;
@@ -1561,10 +1591,18 @@  static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 #endif
 	}
 
+	if (data[IFLA_GENEVE_TUN]) {
+		if (changelink) {
+			attrtype = IFLA_GENEVE_TUN;
+			goto change_notsup;
+		}
+		cfg->tun = true;
+	}
+
 	return 0;
 change_notsup:
 	NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
-			    "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported");
+			    "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported");
 	return -EOPNOTSUPP;
 }
 
@@ -1799,6 +1837,9 @@  static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
 		goto nla_put_failure;
 
+	if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bd24c7dc10a2..198aefa2c513 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -842,6 +842,7 @@  enum {
 	IFLA_GENEVE_LABEL,
 	IFLA_GENEVE_TTL_INHERIT,
 	IFLA_GENEVE_DF,
+	IFLA_GENEVE_TUN,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)