Message ID | 20220315185603.160778-1-eyal.birger@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] net: geneve: support IPv4/IPv6 as inner protocol | expand |
On 3/15/22 11:56, Eyal Birger wrote: > This patch adds support for encapsulating IPv4/IPv6 within GENEVE. > > In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at > device creation. This property cannot be changed for the time being. > > In case IP traffic is received on a non-tun device the drop count is > increased. > > Signed-off-by: Eyal Birger <eyal.birger@gmail.com> > --- > drivers/net/geneve.c | 79 +++++++++++++++++++++++++++--------- > include/uapi/linux/if_link.h | 1 + > 2 files changed, 61 insertions(+), 19 deletions(-) > > diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c > index a895ff756093..37305ec26250 100644 > --- a/drivers/net/geneve.c > +++ b/drivers/net/geneve.c > @@ -56,6 +56,7 @@ struct geneve_config { > bool use_udp6_rx_checksums; > bool ttl_inherit; > enum ifla_geneve_df df; > + bool tun; > }; > > /* Pseudo network device */ > @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, > } > } > > - skb_reset_mac_header(skb); > - skb->protocol = eth_type_trans(skb, geneve->dev); > - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > - > if (tun_dst) > skb_dst_set(skb, &tun_dst->dst); > > - /* Ignore packet loops (and multicast echo) */ > - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { > - geneve->dev->stats.rx_errors++; > - goto drop; > + if (gnvh->proto_type == htons(ETH_P_TEB)) { > + skb_reset_mac_header(skb); > + skb->protocol = eth_type_trans(skb, geneve->dev); > + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > + > + /* Ignore packet loops (and multicast echo) */ > + if (ether_addr_equal(eth_hdr(skb)->h_source, > + geneve->dev->dev_addr)) { > + geneve->dev->stats.rx_errors++; > + goto drop; > + } > + } else { > + skb_reset_mac_header(skb); > + skb->dev = geneve->dev; > + skb->pkt_type = PACKET_HOST; > } > > oiph = skb_network_header(skb); > @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > struct genevehdr *geneveh; > struct geneve_dev *geneve; > struct geneve_sock *gs; > + __be16 inner_proto; > int opts_len; > > /* Need UDP and Geneve header to be present */ > @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > if (unlikely(geneveh->ver != GENEVE_VER)) > goto drop; > > - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) > + inner_proto = geneveh->proto_type; > + > + if (unlikely((inner_proto != htons(ETH_P_TEB) && > + inner_proto != htons(ETH_P_IP) && > + inner_proto != htons(ETH_P_IPV6)))) { > goto drop; > + } > unnecessary braces > gs = rcu_dereference_sk_user_data(sk); > if (!gs) > @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > if (!geneve) > goto drop; > > + if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) { > + geneve->dev->stats.rx_dropped++; > + goto drop; > + } Does this change current default behavior ?. its unclear to be what the current behavior is for a non ETH_P_TEB packet > + > opts_len = geneveh->opt_len * 4; > - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, > - htons(ETH_P_TEB), > + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, > !net_eq(geneve->net, dev_net(geneve->dev)))) { > geneve->dev->stats.rx_dropped++; > goto drop; > @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) > } > > static void geneve_build_header(struct genevehdr *geneveh, > - const struct ip_tunnel_info *info) > + const struct ip_tunnel_info *info, > + __be16 inner_proto) > { > geneveh->ver = GENEVE_VER; > geneveh->opt_len = info->options_len / 4; > @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, > geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); > geneveh->rsvd1 = 0; > tunnel_id_to_vni(info->key.tun_id, geneveh->vni); > - geneveh->proto_type = htons(ETH_P_TEB); > + geneveh->proto_type = inner_proto; > geneveh->rsvd2 = 0; > > if (info->key.tun_flags & TUNNEL_GENEVE_OPT) > @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh, > > static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, > const struct ip_tunnel_info *info, > - bool xnet, int ip_hdr_len) > + bool xnet, int ip_hdr_len, bool tun) > { > + __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB); > bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); > struct genevehdr *gnvh; > int min_headroom; > @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, > goto free_dst; > > gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); > - geneve_build_header(gnvh, info); > - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); > + geneve_build_header(gnvh, info, inner_proto); > + skb_set_inner_protocol(skb, inner_proto); > return 0; > > free_dst: > @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, > } > } > > - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); > + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), > + geneve->cfg.tun); > if (unlikely(err)) > return err; > > @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, > ttl = key->ttl; > ttl = ttl ? : ip6_dst_hoplimit(dst); > } > - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); > + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), > + geneve->cfg.tun); > if (unlikely(err)) > return err; > > @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, > dst_cache_reset(&geneve->cfg.info.dst_cache); > memcpy(&geneve->cfg, cfg, sizeof(*cfg)); > > + if (geneve->cfg.tun) { > + dev->header_ops = NULL; > + dev->type = ARPHRD_NONE; > + dev->hard_header_len = 0; > + dev->addr_len = 0; > + dev->flags = IFF_NOARP; > + } > + > err = register_netdevice(dev); > if (err) > return err; > @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], > #endif > } > > + if (data[IFLA_GENEVE_TUN]) { > + if (changelink) { > + attrtype = IFLA_GENEVE_TUN; > + goto change_notsup; > + } > + cfg->tun = true; > + } > + > return 0; > change_notsup: > NL_SET_ERR_MSG_ATTR(extack, data[attrtype], > - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); > + "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported"); > return -EOPNOTSUPP; > } > > @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) > if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) > goto nla_put_failure; > > + if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN)) > + goto nla_put_failure; > + > return 0; > > nla_put_failure: > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index bd24c7dc10a2..198aefa2c513 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -842,6 +842,7 @@ enum { > IFLA_GENEVE_LABEL, > IFLA_GENEVE_TTL_INHERIT, > IFLA_GENEVE_DF, > + IFLA_GENEVE_TUN, geneve is itself called a tunnel, i wonder if a different name for the flag would be more appropriate. what is the use case for this ?. is there a RFC reference ? > __IFLA_GENEVE_MAX > }; > #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
Hi Roopa, On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote: > > > On 3/15/22 11:56, Eyal Birger wrote: > > This patch adds support for encapsulating IPv4/IPv6 within GENEVE. > > > > In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at > > device creation. This property cannot be changed for the time being. > > > > In case IP traffic is received on a non-tun device the drop count is > > increased. > > > > Signed-off-by: Eyal Birger <eyal.birger@gmail.com> > > --- > > drivers/net/geneve.c | 79 +++++++++++++++++++++++++++--------- > > include/uapi/linux/if_link.h | 1 + > > 2 files changed, 61 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c > > index a895ff756093..37305ec26250 100644 > > --- a/drivers/net/geneve.c > > +++ b/drivers/net/geneve.c > > @@ -56,6 +56,7 @@ struct geneve_config { > > bool use_udp6_rx_checksums; > > bool ttl_inherit; > > enum ifla_geneve_df df; > > + bool tun; > > }; > > > > /* Pseudo network device */ > > @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, > > } > > } > > > > - skb_reset_mac_header(skb); > > - skb->protocol = eth_type_trans(skb, geneve->dev); > > - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > > - > > if (tun_dst) > > skb_dst_set(skb, &tun_dst->dst); > > > > - /* Ignore packet loops (and multicast echo) */ > > - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { > > - geneve->dev->stats.rx_errors++; > > - goto drop; > > + if (gnvh->proto_type == htons(ETH_P_TEB)) { > > + skb_reset_mac_header(skb); > > + skb->protocol = eth_type_trans(skb, geneve->dev); > > + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > > + > > + /* Ignore packet loops (and multicast echo) */ > > + if (ether_addr_equal(eth_hdr(skb)->h_source, > > + geneve->dev->dev_addr)) { > > + geneve->dev->stats.rx_errors++; > > + goto drop; > > + } > > + } else { > > + skb_reset_mac_header(skb); > > + skb->dev = geneve->dev; > > + skb->pkt_type = PACKET_HOST; > > } > > > > oiph = skb_network_header(skb); > > @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > > struct genevehdr *geneveh; > > struct geneve_dev *geneve; > > struct geneve_sock *gs; > > + __be16 inner_proto; > > int opts_len; > > > > /* Need UDP and Geneve header to be present */ > > @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > > if (unlikely(geneveh->ver != GENEVE_VER)) > > goto drop; > > > > - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) > > + inner_proto = geneveh->proto_type; > > + > > + if (unlikely((inner_proto != htons(ETH_P_TEB) && > > + inner_proto != htons(ETH_P_IP) && > > + inner_proto != htons(ETH_P_IPV6)))) { > > goto drop; > > + } > > > > > unnecessary braces Will fix. > > > gs = rcu_dereference_sk_user_data(sk); > > if (!gs) > > @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > > if (!geneve) > > goto drop; > > > > + if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) { > > + geneve->dev->stats.rx_dropped++; > > + goto drop; > > + } > > Does this change current default behavior ?. > > its unclear to be what the current behavior is for a non ETH_P_TEB packet Currently non ETH_P_TEB packets are silently dropped. I figured that if the driver supported other ethertypes it would make sense to increase the count in such case, to assist in debugging wrong configurations. I can remove this if it's better to keep the current behavior. > > > > + > > opts_len = geneveh->opt_len * 4; > > - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, > > - htons(ETH_P_TEB), > > + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, > > !net_eq(geneve->net, dev_net(geneve->dev)))) { > > geneve->dev->stats.rx_dropped++; > > goto drop; > > @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) > > } > > > > static void geneve_build_header(struct genevehdr *geneveh, > > - const struct ip_tunnel_info *info) > > + const struct ip_tunnel_info *info, > > + __be16 inner_proto) > > { > > geneveh->ver = GENEVE_VER; > > geneveh->opt_len = info->options_len / 4; > > @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, > > geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); > > geneveh->rsvd1 = 0; > > tunnel_id_to_vni(info->key.tun_id, geneveh->vni); > > - geneveh->proto_type = htons(ETH_P_TEB); > > + geneveh->proto_type = inner_proto; > > geneveh->rsvd2 = 0; > > > > if (info->key.tun_flags & TUNNEL_GENEVE_OPT) > > @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh, > > > > static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, > > const struct ip_tunnel_info *info, > > - bool xnet, int ip_hdr_len) > > + bool xnet, int ip_hdr_len, bool tun) > > { > > + __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB); > > bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); > > struct genevehdr *gnvh; > > int min_headroom; > > @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, > > goto free_dst; > > > > gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); > > - geneve_build_header(gnvh, info); > > - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); > > + geneve_build_header(gnvh, info, inner_proto); > > + skb_set_inner_protocol(skb, inner_proto); > > return 0; > > > > free_dst: > > @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, > > } > > } > > > > - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); > > + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), > > + geneve->cfg.tun); > > if (unlikely(err)) > > return err; > > > > @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, > > ttl = key->ttl; > > ttl = ttl ? : ip6_dst_hoplimit(dst); > > } > > - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); > > + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), > > + geneve->cfg.tun); > > if (unlikely(err)) > > return err; > > > > @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, > > dst_cache_reset(&geneve->cfg.info.dst_cache); > > memcpy(&geneve->cfg, cfg, sizeof(*cfg)); > > > > + if (geneve->cfg.tun) { > > + dev->header_ops = NULL; > > + dev->type = ARPHRD_NONE; > > + dev->hard_header_len = 0; > > + dev->addr_len = 0; > > + dev->flags = IFF_NOARP; > > + } > > + > > err = register_netdevice(dev); > > if (err) > > return err; > > @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], > > #endif > > } > > > > + if (data[IFLA_GENEVE_TUN]) { > > + if (changelink) { > > + attrtype = IFLA_GENEVE_TUN; > > + goto change_notsup; > > + } > > + cfg->tun = true; > > + } > > + > > return 0; > > change_notsup: > > NL_SET_ERR_MSG_ATTR(extack, data[attrtype], > > - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); > > + "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported"); > > return -EOPNOTSUPP; > > } > > > > @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) > > if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) > > goto nla_put_failure; > > > > + if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN)) > > + goto nla_put_failure; > > + > > return 0; > > > > nla_put_failure: > > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > > index bd24c7dc10a2..198aefa2c513 100644 > > --- a/include/uapi/linux/if_link.h > > +++ b/include/uapi/linux/if_link.h > > @@ -842,6 +842,7 @@ enum { > > IFLA_GENEVE_LABEL, > > IFLA_GENEVE_TTL_INHERIT, > > IFLA_GENEVE_DF, > > + IFLA_GENEVE_TUN, > > geneve is itself called a tunnel, i wonder if a different name for the > flag would be more appropriate. I tried to find a reference to something similar, so went with something like the tun vs. tap distinction. I was also concerned about the possible confusion, but it felt clearer than something like L3_INNER_PROTO_MODE. I'd happily replace it with a better suggestion. > > what is the use case for this ?. is there a RFC reference ? I stumbled upon this configuration when trying to receive packets from an AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1]. But to my understanding the RFC allows this so it doesn't seem something specific to AWS. Thanks for the review! Eyal. [1] https://aws.amazon.com/blogs/networking-and-content-delivery/integrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer/ > > > > > __IFLA_GENEVE_MAX > > }; > > #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
On 3/15/22 13:22, Eyal Birger wrote: > Hi Roopa, > > On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote: >> >> On 3/15/22 11:56, Eyal Birger wrote: >>> This patch adds support for encapsulating IPv4/IPv6 within GENEVE. >>> >>> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at >>> device creation. This property cannot be changed for the time being. >>> >>> In case IP traffic is received on a non-tun device the drop count is >>> increased. >>> >>> Signed-off-by: Eyal Birger <eyal.birger@gmail.com> >>> --- >>> drivers/net/geneve.c | 79 +++++++++++++++++++++++++++--------- >>> include/uapi/linux/if_link.h | 1 + >>> 2 files changed, 61 insertions(+), 19 deletions(-) >>> >>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c >>> index a895ff756093..37305ec26250 100644 >>> --- a/drivers/net/geneve.c >>> +++ b/drivers/net/geneve.c >>> @@ -56,6 +56,7 @@ struct geneve_config { >>> bool use_udp6_rx_checksums; >>> bool ttl_inherit; >>> enum ifla_geneve_df df; >>> + bool tun; >>> }; >>> >>> /* Pseudo network device */ >>> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, >>> } >>> } >>> >>> - skb_reset_mac_header(skb); >>> - skb->protocol = eth_type_trans(skb, geneve->dev); >>> - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); >>> - >>> if (tun_dst) >>> skb_dst_set(skb, &tun_dst->dst); >>> >>> - /* Ignore packet loops (and multicast echo) */ >>> - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { >>> - geneve->dev->stats.rx_errors++; >>> - goto drop; >>> + if (gnvh->proto_type == htons(ETH_P_TEB)) { >>> + skb_reset_mac_header(skb); >>> + skb->protocol = eth_type_trans(skb, geneve->dev); >>> + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); >>> + >>> + /* Ignore packet loops (and multicast echo) */ >>> + if (ether_addr_equal(eth_hdr(skb)->h_source, >>> + geneve->dev->dev_addr)) { >>> + geneve->dev->stats.rx_errors++; >>> + goto drop; >>> + } >>> + } else { >>> + skb_reset_mac_header(skb); >>> + skb->dev = geneve->dev; >>> + skb->pkt_type = PACKET_HOST; >>> } >>> >>> oiph = skb_network_header(skb); >>> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) >>> struct genevehdr *geneveh; >>> struct geneve_dev *geneve; >>> struct geneve_sock *gs; >>> + __be16 inner_proto; >>> int opts_len; >>> >>> /* Need UDP and Geneve header to be present */ >>> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) >>> if (unlikely(geneveh->ver != GENEVE_VER)) >>> goto drop; >>> >>> - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) >>> + inner_proto = geneveh->proto_type; >>> + >>> + if (unlikely((inner_proto != htons(ETH_P_TEB) && >>> + inner_proto != htons(ETH_P_IP) && >>> + inner_proto != htons(ETH_P_IPV6)))) { >>> goto drop; >>> + } >>> >> >> unnecessary braces > Will fix. > >>> gs = rcu_dereference_sk_user_data(sk); >>> if (!gs) >>> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) >>> if (!geneve) >>> goto drop; >>> >>> + if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) { >>> + geneve->dev->stats.rx_dropped++; >>> + goto drop; >>> + } >> Does this change current default behavior ?. >> >> its unclear to be what the current behavior is for a non ETH_P_TEB packet > Currently non ETH_P_TEB packets are silently dropped. > I figured that if the driver supported other ethertypes it would make > sense to increase the count in such case, to assist in debugging wrong > configurations. > > I can remove this if it's better to keep the current behavior. yes, agree. counting is better. >> >>> + >>> opts_len = geneveh->opt_len * 4; >>> - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, >>> - htons(ETH_P_TEB), >>> + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, >>> !net_eq(geneve->net, dev_net(geneve->dev)))) { >>> geneve->dev->stats.rx_dropped++; >>> goto drop; >>> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) >>> } >>> >>> static void geneve_build_header(struct genevehdr *geneveh, >>> - const struct ip_tunnel_info *info) >>> + const struct ip_tunnel_info *info, >>> + __be16 inner_proto) >>> { >>> geneveh->ver = GENEVE_VER; >>> geneveh->opt_len = info->options_len / 4; >>> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, >>> geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); >>> geneveh->rsvd1 = 0; >>> tunnel_id_to_vni(info->key.tun_id, geneveh->vni); >>> - geneveh->proto_type = htons(ETH_P_TEB); >>> + geneveh->proto_type = inner_proto; >>> geneveh->rsvd2 = 0; >>> >>> if (info->key.tun_flags & TUNNEL_GENEVE_OPT) >>> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh, >>> >>> static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, >>> const struct ip_tunnel_info *info, >>> - bool xnet, int ip_hdr_len) >>> + bool xnet, int ip_hdr_len, bool tun) >>> { >>> + __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB); >>> bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); >>> struct genevehdr *gnvh; >>> int min_headroom; >>> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, >>> goto free_dst; >>> >>> gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); >>> - geneve_build_header(gnvh, info); >>> - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); >>> + geneve_build_header(gnvh, info, inner_proto); >>> + skb_set_inner_protocol(skb, inner_proto); >>> return 0; >>> >>> free_dst: >>> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, >>> } >>> } >>> >>> - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); >>> + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), >>> + geneve->cfg.tun); >>> if (unlikely(err)) >>> return err; >>> >>> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, >>> ttl = key->ttl; >>> ttl = ttl ? : ip6_dst_hoplimit(dst); >>> } >>> - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); >>> + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), >>> + geneve->cfg.tun); >>> if (unlikely(err)) >>> return err; >>> >>> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, >>> dst_cache_reset(&geneve->cfg.info.dst_cache); >>> memcpy(&geneve->cfg, cfg, sizeof(*cfg)); >>> >>> + if (geneve->cfg.tun) { >>> + dev->header_ops = NULL; >>> + dev->type = ARPHRD_NONE; >>> + dev->hard_header_len = 0; >>> + dev->addr_len = 0; >>> + dev->flags = IFF_NOARP; >>> + } >>> + >>> err = register_netdevice(dev); >>> if (err) >>> return err; >>> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], >>> #endif >>> } >>> >>> + if (data[IFLA_GENEVE_TUN]) { >>> + if (changelink) { >>> + attrtype = IFLA_GENEVE_TUN; >>> + goto change_notsup; >>> + } >>> + cfg->tun = true; >>> + } >>> + >>> return 0; >>> change_notsup: >>> NL_SET_ERR_MSG_ATTR(extack, data[attrtype], >>> - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); >>> + "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported"); >>> return -EOPNOTSUPP; >>> } >>> >>> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) >>> if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) >>> goto nla_put_failure; >>> >>> + if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN)) >>> + goto nla_put_failure; >>> + >>> return 0; >>> >>> nla_put_failure: >>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h >>> index bd24c7dc10a2..198aefa2c513 100644 >>> --- a/include/uapi/linux/if_link.h >>> +++ b/include/uapi/linux/if_link.h >>> @@ -842,6 +842,7 @@ enum { >>> IFLA_GENEVE_LABEL, >>> IFLA_GENEVE_TTL_INHERIT, >>> IFLA_GENEVE_DF, >>> + IFLA_GENEVE_TUN, >> geneve is itself called a tunnel, i wonder if a different name for the >> flag would be more appropriate. > I tried to find a reference to something similar, so went with something like > the tun vs. tap distinction. I was also concerned about the possible > confusion, but it felt clearer than something like L3_INNER_PROTO_MODE. > > I'd happily replace it with a better suggestion. o ok, makes sense. I can't think of anything other than simply IFLA_GENEVE_INNER_PROTO (maybe others have better suggestions) > >> what is the use case for this ?. is there a RFC reference ? > I stumbled upon this configuration when trying to receive packets from an > AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1]. > > But to my understanding the RFC allows this so it doesn't seem something > specific to AWS. that makes me wonder if we need a knob atall and if this should be allowed by default. wonder how other network vendor standard geneve implementations behave by default. > > Thanks for the review! > > Eyal. > > [1] https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fnetworking-and-content-delivery%2Fintegrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer%2F&data=04%7C01%7Croopa%40nvidia.com%7C223c13c616ef430a487f08da06c19610%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637829725767772379%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=ZQTDrFJ8LLn5SdN6h%2B5NECxwlD9PAGV2KzpVUV%2B1chc%3D&reserved=0 Thanks for the details.
On Wed, Mar 16, 2022 at 2:33 AM Roopa Prabhu <roopa@nvidia.com> wrote: > > > On 3/15/22 13:22, Eyal Birger wrote: > > Hi Roopa, > > > > On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote: > >> > >> On 3/15/22 11:56, Eyal Birger wrote: > >>> This patch adds support for encapsulating IPv4/IPv6 within GENEVE. > >>> > >>> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at > >>> device creation. This property cannot be changed for the time being. > >>> > >>> In case IP traffic is received on a non-tun device the drop count is > >>> increased. > >>> > >>> Signed-off-by: Eyal Birger <eyal.birger@gmail.com> > >>> --- > >>> drivers/net/geneve.c | 79 +++++++++++++++++++++++++++--------- > >>> include/uapi/linux/if_link.h | 1 + > >>> 2 files changed, 61 insertions(+), 19 deletions(-) > >>> > >>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c > >>> index a895ff756093..37305ec26250 100644 > >>> --- a/drivers/net/geneve.c > >>> +++ b/drivers/net/geneve.c > >>> @@ -56,6 +56,7 @@ struct geneve_config { > >>> bool use_udp6_rx_checksums; > >>> bool ttl_inherit; > >>> enum ifla_geneve_df df; > >>> + bool tun; > >>> }; > >>> > >>> /* Pseudo network device */ > >>> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, > >>> } > >>> } > >>> > >>> - skb_reset_mac_header(skb); > >>> - skb->protocol = eth_type_trans(skb, geneve->dev); > >>> - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > >>> - > >>> if (tun_dst) > >>> skb_dst_set(skb, &tun_dst->dst); > >>> > >>> - /* Ignore packet loops (and multicast echo) */ > >>> - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { > >>> - geneve->dev->stats.rx_errors++; > >>> - goto drop; > >>> + if (gnvh->proto_type == htons(ETH_P_TEB)) { > >>> + skb_reset_mac_header(skb); > >>> + skb->protocol = eth_type_trans(skb, geneve->dev); > >>> + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > >>> + > >>> + /* Ignore packet loops (and multicast echo) */ > >>> + if (ether_addr_equal(eth_hdr(skb)->h_source, > >>> + geneve->dev->dev_addr)) { > >>> + geneve->dev->stats.rx_errors++; > >>> + goto drop; > >>> + } > >>> + } else { > >>> + skb_reset_mac_header(skb); > >>> + skb->dev = geneve->dev; > >>> + skb->pkt_type = PACKET_HOST; > >>> } > >>> > >>> oiph = skb_network_header(skb); > >>> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > >>> struct genevehdr *geneveh; > >>> struct geneve_dev *geneve; > >>> struct geneve_sock *gs; > >>> + __be16 inner_proto; > >>> int opts_len; > >>> > >>> /* Need UDP and Geneve header to be present */ > >>> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > >>> if (unlikely(geneveh->ver != GENEVE_VER)) > >>> goto drop; > >>> > >>> - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) > >>> + inner_proto = geneveh->proto_type; > >>> + > >>> + if (unlikely((inner_proto != htons(ETH_P_TEB) && > >>> + inner_proto != htons(ETH_P_IP) && > >>> + inner_proto != htons(ETH_P_IPV6)))) { > >>> goto drop; > >>> + } > >>> > >> > >> unnecessary braces > > Will fix. > > > >>> gs = rcu_dereference_sk_user_data(sk); > >>> if (!gs) > >>> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > >>> if (!geneve) > >>> goto drop; > >>> > >>> + if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) { > >>> + geneve->dev->stats.rx_dropped++; > >>> + goto drop; > >>> + } > >> Does this change current default behavior ?. > >> > >> its unclear to be what the current behavior is for a non ETH_P_TEB packet > > Currently non ETH_P_TEB packets are silently dropped. > > I figured that if the driver supported other ethertypes it would make > > sense to increase the count in such case, to assist in debugging wrong > > configurations. > > > > I can remove this if it's better to keep the current behavior. > > yes, agree. counting is better. > > > >> > >>> + > >>> opts_len = geneveh->opt_len * 4; > >>> - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, > >>> - htons(ETH_P_TEB), > >>> + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, > >>> !net_eq(geneve->net, dev_net(geneve->dev)))) { > >>> geneve->dev->stats.rx_dropped++; > >>> goto drop; > >>> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) > >>> } > >>> > >>> static void geneve_build_header(struct genevehdr *geneveh, > >>> - const struct ip_tunnel_info *info) > >>> + const struct ip_tunnel_info *info, > >>> + __be16 inner_proto) > >>> { > >>> geneveh->ver = GENEVE_VER; > >>> geneveh->opt_len = info->options_len / 4; > >>> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, > >>> geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); > >>> geneveh->rsvd1 = 0; > >>> tunnel_id_to_vni(info->key.tun_id, geneveh->vni); > >>> - geneveh->proto_type = htons(ETH_P_TEB); > >>> + geneveh->proto_type = inner_proto; > >>> geneveh->rsvd2 = 0; > >>> > >>> if (info->key.tun_flags & TUNNEL_GENEVE_OPT) > >>> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh, > >>> > >>> static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, > >>> const struct ip_tunnel_info *info, > >>> - bool xnet, int ip_hdr_len) > >>> + bool xnet, int ip_hdr_len, bool tun) > >>> { > >>> + __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB); > >>> bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); > >>> struct genevehdr *gnvh; > >>> int min_headroom; > >>> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, > >>> goto free_dst; > >>> > >>> gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); > >>> - geneve_build_header(gnvh, info); > >>> - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); > >>> + geneve_build_header(gnvh, info, inner_proto); > >>> + skb_set_inner_protocol(skb, inner_proto); > >>> return 0; > >>> > >>> free_dst: > >>> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, > >>> } > >>> } > >>> > >>> - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); > >>> + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), > >>> + geneve->cfg.tun); > >>> if (unlikely(err)) > >>> return err; > >>> > >>> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, > >>> ttl = key->ttl; > >>> ttl = ttl ? : ip6_dst_hoplimit(dst); > >>> } > >>> - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); > >>> + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), > >>> + geneve->cfg.tun); > >>> if (unlikely(err)) > >>> return err; > >>> > >>> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, > >>> dst_cache_reset(&geneve->cfg.info.dst_cache); > >>> memcpy(&geneve->cfg, cfg, sizeof(*cfg)); > >>> > >>> + if (geneve->cfg.tun) { > >>> + dev->header_ops = NULL; > >>> + dev->type = ARPHRD_NONE; > >>> + dev->hard_header_len = 0; > >>> + dev->addr_len = 0; > >>> + dev->flags = IFF_NOARP; > >>> + } > >>> + > >>> err = register_netdevice(dev); > >>> if (err) > >>> return err; > >>> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], > >>> #endif > >>> } > >>> > >>> + if (data[IFLA_GENEVE_TUN]) { > >>> + if (changelink) { > >>> + attrtype = IFLA_GENEVE_TUN; > >>> + goto change_notsup; > >>> + } > >>> + cfg->tun = true; > >>> + } > >>> + > >>> return 0; > >>> change_notsup: > >>> NL_SET_ERR_MSG_ATTR(extack, data[attrtype], > >>> - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); > >>> + "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported"); > >>> return -EOPNOTSUPP; > >>> } > >>> > >>> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) > >>> if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) > >>> goto nla_put_failure; > >>> > >>> + if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN)) > >>> + goto nla_put_failure; > >>> + > >>> return 0; > >>> > >>> nla_put_failure: > >>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > >>> index bd24c7dc10a2..198aefa2c513 100644 > >>> --- a/include/uapi/linux/if_link.h > >>> +++ b/include/uapi/linux/if_link.h > >>> @@ -842,6 +842,7 @@ enum { > >>> IFLA_GENEVE_LABEL, > >>> IFLA_GENEVE_TTL_INHERIT, > >>> IFLA_GENEVE_DF, > >>> + IFLA_GENEVE_TUN, > >> geneve is itself called a tunnel, i wonder if a different name for the > >> flag would be more appropriate. > > I tried to find a reference to something similar, so went with something like > > the tun vs. tap distinction. I was also concerned about the possible > > confusion, but it felt clearer than something like L3_INNER_PROTO_MODE. > > > > I'd happily replace it with a better suggestion. > > o ok, makes sense. I can't think of anything other than simply > IFLA_GENEVE_INNER_PROTO > > (maybe others have better suggestions) My concern with calling it IFLA_GENEVE_INNER_PROTO is that inner_proto is used internally to denote the inner proto value. Would IFLA_GENEVE_INNER_PROTO_INHERIT make sense? > > > > > >> what is the use case for this ?. is there a RFC reference ? > > I stumbled upon this configuration when trying to receive packets from an > > AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1]. > > > > But to my understanding the RFC allows this so it doesn't seem something > > specific to AWS. > > that makes me wonder if we need a knob atall and if this should be > allowed by default. I didn't find a way to make tx work without requiring a flag, so I thought it'd be better if this mode was enforced in both directions. > > wonder how other network vendor standard geneve implementations behave > by default. > > > > > > Thanks for the review! > > > > Eyal. > > > > [1] https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fnetworking-and-content-delivery%2Fintegrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer%2F&data=04%7C01%7Croopa%40nvidia.com%7C223c13c616ef430a487f08da06c19610%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637829725767772379%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=ZQTDrFJ8LLn5SdN6h%2B5NECxwlD9PAGV2KzpVUV%2B1chc%3D&reserved=0 > > Thanks for the details. > >
On 3/15/22 20:50, Eyal Birger wrote: > On Wed, Mar 16, 2022 at 2:33 AM Roopa Prabhu <roopa@nvidia.com> wrote: >> >> On 3/15/22 13:22, Eyal Birger wrote: >>> Hi Roopa, >>> >>> On Tue, Mar 15, 2022 at 9:57 PM Roopa Prabhu <roopa@nvidia.com> wrote: >>>> On 3/15/22 11:56, Eyal Birger wrote: >>>>> This patch adds support for encapsulating IPv4/IPv6 within GENEVE. >>>>> >>>>> In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at >>>>> device creation. This property cannot be changed for the time being. >>>>> >>>>> In case IP traffic is received on a non-tun device the drop count is >>>>> increased. >>>>> >>>>> Signed-off-by: Eyal Birger <eyal.birger@gmail.com> >>>>> --- >>>>> drivers/net/geneve.c | 79 +++++++++++++++++++++++++++--------- >>>>> include/uapi/linux/if_link.h | 1 + >>>>> 2 files changed, 61 insertions(+), 19 deletions(-) >>>>> >>>>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c >>>>> index a895ff756093..37305ec26250 100644 >>>>> --- a/drivers/net/geneve.c >>>>> +++ b/drivers/net/geneve.c >>>>> @@ -56,6 +56,7 @@ struct geneve_config { >>>>> bool use_udp6_rx_checksums; >>>>> bool ttl_inherit; >>>>> enum ifla_geneve_df df; >>>>> + bool tun; >>>>> }; >>>>> >>>>> /* Pseudo network device */ >>>>> @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, >>>>> } >>>>> } >>>>> >>>>> - skb_reset_mac_header(skb); >>>>> - skb->protocol = eth_type_trans(skb, geneve->dev); >>>>> - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); >>>>> - >>>>> if (tun_dst) >>>>> skb_dst_set(skb, &tun_dst->dst); >>>>> >>>>> - /* Ignore packet loops (and multicast echo) */ >>>>> - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { >>>>> - geneve->dev->stats.rx_errors++; >>>>> - goto drop; >>>>> + if (gnvh->proto_type == htons(ETH_P_TEB)) { >>>>> + skb_reset_mac_header(skb); >>>>> + skb->protocol = eth_type_trans(skb, geneve->dev); >>>>> + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); >>>>> + >>>>> + /* Ignore packet loops (and multicast echo) */ >>>>> + if (ether_addr_equal(eth_hdr(skb)->h_source, >>>>> + geneve->dev->dev_addr)) { >>>>> + geneve->dev->stats.rx_errors++; >>>>> + goto drop; >>>>> + } >>>>> + } else { >>>>> + skb_reset_mac_header(skb); >>>>> + skb->dev = geneve->dev; >>>>> + skb->pkt_type = PACKET_HOST; >>>>> } >>>>> >>>>> oiph = skb_network_header(skb); >>>>> @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) >>>>> struct genevehdr *geneveh; >>>>> struct geneve_dev *geneve; >>>>> struct geneve_sock *gs; >>>>> + __be16 inner_proto; >>>>> int opts_len; >>>>> >>>>> /* Need UDP and Geneve header to be present */ >>>>> @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) >>>>> if (unlikely(geneveh->ver != GENEVE_VER)) >>>>> goto drop; >>>>> >>>>> - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) >>>>> + inner_proto = geneveh->proto_type; >>>>> + >>>>> + if (unlikely((inner_proto != htons(ETH_P_TEB) && >>>>> + inner_proto != htons(ETH_P_IP) && >>>>> + inner_proto != htons(ETH_P_IPV6)))) { >>>>> goto drop; >>>>> + } >>>>> >>>> unnecessary braces >>> Will fix. >>> >>>>> gs = rcu_dereference_sk_user_data(sk); >>>>> if (!gs) >>>>> @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) >>>>> if (!geneve) >>>>> goto drop; >>>>> >>>>> + if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) { >>>>> + geneve->dev->stats.rx_dropped++; >>>>> + goto drop; >>>>> + } >>>> Does this change current default behavior ?. >>>> >>>> its unclear to be what the current behavior is for a non ETH_P_TEB packet >>> Currently non ETH_P_TEB packets are silently dropped. >>> I figured that if the driver supported other ethertypes it would make >>> sense to increase the count in such case, to assist in debugging wrong >>> configurations. >>> >>> I can remove this if it's better to keep the current behavior. >> yes, agree. counting is better. >> >> >>>>> + >>>>> opts_len = geneveh->opt_len * 4; >>>>> - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, >>>>> - htons(ETH_P_TEB), >>>>> + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, >>>>> !net_eq(geneve->net, dev_net(geneve->dev)))) { >>>>> geneve->dev->stats.rx_dropped++; >>>>> goto drop; >>>>> @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) >>>>> } >>>>> >>>>> static void geneve_build_header(struct genevehdr *geneveh, >>>>> - const struct ip_tunnel_info *info) >>>>> + const struct ip_tunnel_info *info, >>>>> + __be16 inner_proto) >>>>> { >>>>> geneveh->ver = GENEVE_VER; >>>>> geneveh->opt_len = info->options_len / 4; >>>>> @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, >>>>> geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); >>>>> geneveh->rsvd1 = 0; >>>>> tunnel_id_to_vni(info->key.tun_id, geneveh->vni); >>>>> - geneveh->proto_type = htons(ETH_P_TEB); >>>>> + geneveh->proto_type = inner_proto; >>>>> geneveh->rsvd2 = 0; >>>>> >>>>> if (info->key.tun_flags & TUNNEL_GENEVE_OPT) >>>>> @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh, >>>>> >>>>> static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, >>>>> const struct ip_tunnel_info *info, >>>>> - bool xnet, int ip_hdr_len) >>>>> + bool xnet, int ip_hdr_len, bool tun) >>>>> { >>>>> + __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB); >>>>> bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); >>>>> struct genevehdr *gnvh; >>>>> int min_headroom; >>>>> @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, >>>>> goto free_dst; >>>>> >>>>> gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); >>>>> - geneve_build_header(gnvh, info); >>>>> - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); >>>>> + geneve_build_header(gnvh, info, inner_proto); >>>>> + skb_set_inner_protocol(skb, inner_proto); >>>>> return 0; >>>>> >>>>> free_dst: >>>>> @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, >>>>> } >>>>> } >>>>> >>>>> - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); >>>>> + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), >>>>> + geneve->cfg.tun); >>>>> if (unlikely(err)) >>>>> return err; >>>>> >>>>> @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, >>>>> ttl = key->ttl; >>>>> ttl = ttl ? : ip6_dst_hoplimit(dst); >>>>> } >>>>> - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); >>>>> + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), >>>>> + geneve->cfg.tun); >>>>> if (unlikely(err)) >>>>> return err; >>>>> >>>>> @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, >>>>> dst_cache_reset(&geneve->cfg.info.dst_cache); >>>>> memcpy(&geneve->cfg, cfg, sizeof(*cfg)); >>>>> >>>>> + if (geneve->cfg.tun) { >>>>> + dev->header_ops = NULL; >>>>> + dev->type = ARPHRD_NONE; >>>>> + dev->hard_header_len = 0; >>>>> + dev->addr_len = 0; >>>>> + dev->flags = IFF_NOARP; >>>>> + } >>>>> + >>>>> err = register_netdevice(dev); >>>>> if (err) >>>>> return err; >>>>> @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], >>>>> #endif >>>>> } >>>>> >>>>> + if (data[IFLA_GENEVE_TUN]) { >>>>> + if (changelink) { >>>>> + attrtype = IFLA_GENEVE_TUN; >>>>> + goto change_notsup; >>>>> + } >>>>> + cfg->tun = true; >>>>> + } >>>>> + >>>>> return 0; >>>>> change_notsup: >>>>> NL_SET_ERR_MSG_ATTR(extack, data[attrtype], >>>>> - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); >>>>> + "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported"); >>>>> return -EOPNOTSUPP; >>>>> } >>>>> >>>>> @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) >>>>> if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) >>>>> goto nla_put_failure; >>>>> >>>>> + if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN)) >>>>> + goto nla_put_failure; >>>>> + >>>>> return 0; >>>>> >>>>> nla_put_failure: >>>>> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h >>>>> index bd24c7dc10a2..198aefa2c513 100644 >>>>> --- a/include/uapi/linux/if_link.h >>>>> +++ b/include/uapi/linux/if_link.h >>>>> @@ -842,6 +842,7 @@ enum { >>>>> IFLA_GENEVE_LABEL, >>>>> IFLA_GENEVE_TTL_INHERIT, >>>>> IFLA_GENEVE_DF, >>>>> + IFLA_GENEVE_TUN, >>>> geneve is itself called a tunnel, i wonder if a different name for the >>>> flag would be more appropriate. >>> I tried to find a reference to something similar, so went with something like >>> the tun vs. tap distinction. I was also concerned about the possible >>> confusion, but it felt clearer than something like L3_INNER_PROTO_MODE. >>> >>> I'd happily replace it with a better suggestion. >> o ok, makes sense. I can't think of anything other than simply >> IFLA_GENEVE_INNER_PROTO >> >> (maybe others have better suggestions) > My concern with calling it IFLA_GENEVE_INNER_PROTO is that inner_proto is > used internally to denote the inner proto value. > > Would IFLA_GENEVE_INNER_PROTO_INHERIT make sense? yes, i like that better (and there is precedence to using inherit) >> >>>> what is the use case for this ?. is there a RFC reference ? >>> I stumbled upon this configuration when trying to receive packets from an >>> AWS "Gateway Load Balancer" which sends IP packets encapsulated in GENEVE [1]. >>> >>> But to my understanding the RFC allows this so it doesn't seem something >>> specific to AWS. >> that makes me wonder if we need a knob atall and if this should be >> allowed by default. > I didn't find a way to make tx work without requiring a flag, so I thought > it'd be better if this mode was enforced in both directions. ok, in that case flag is ok. > >> wonder how other network vendor standard geneve implementations behave >> by default. >> >> >>> Thanks for the review! >>> >>> Eyal. >>> >>> [1] https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fnetworking-and-content-delivery%2Fintegrate-your-custom-logic-or-appliance-with-aws-gateway-load-balancer%2F&data=04%7C01%7Croopa%40nvidia.com%7C15818ae5c7f949db55f908da07002ccd%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637829994585887100%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=a4hU8aizVq%2Bp4ETdpn4oNIUAkNT%2FPkDiVtTGr8ksBts%3D&reserved=0 >> Thanks for the details. >> >>
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index a895ff756093..37305ec26250 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -56,6 +56,7 @@ struct geneve_config { bool use_udp6_rx_checksums; bool ttl_inherit; enum ifla_geneve_df df; + bool tun; }; /* Pseudo network device */ @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } } - skb_reset_mac_header(skb); - skb->protocol = eth_type_trans(skb, geneve->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - if (tun_dst) skb_dst_set(skb, &tun_dst->dst); - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { - geneve->dev->stats.rx_errors++; - goto drop; + if (gnvh->proto_type == htons(ETH_P_TEB)) { + skb_reset_mac_header(skb); + skb->protocol = eth_type_trans(skb, geneve->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, + geneve->dev->dev_addr)) { + geneve->dev->stats.rx_errors++; + goto drop; + } + } else { + skb_reset_mac_header(skb); + skb->dev = geneve->dev; + skb->pkt_type = PACKET_HOST; } oiph = skb_network_header(skb); @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct genevehdr *geneveh; struct geneve_dev *geneve; struct geneve_sock *gs; + __be16 inner_proto; int opts_len; /* Need UDP and Geneve header to be present */ @@ -356,8 +365,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(geneveh->ver != GENEVE_VER)) goto drop; - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + inner_proto = geneveh->proto_type; + + if (unlikely((inner_proto != htons(ETH_P_TEB) && + inner_proto != htons(ETH_P_IP) && + inner_proto != htons(ETH_P_IPV6)))) { goto drop; + } gs = rcu_dereference_sk_user_data(sk); if (!gs) @@ -367,9 +381,13 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!geneve) goto drop; + if (unlikely((!geneve->cfg.tun && inner_proto != htons(ETH_P_TEB)))) { + geneve->dev->stats.rx_dropped++; + goto drop; + } + opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB), + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { geneve->dev->stats.rx_dropped++; goto drop; @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) } static void geneve_build_header(struct genevehdr *geneveh, - const struct ip_tunnel_info *info) + const struct ip_tunnel_info *info, + __be16 inner_proto) { geneveh->ver = GENEVE_VER; geneveh->opt_len = info->options_len / 4; @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); geneveh->rsvd1 = 0; tunnel_id_to_vni(info->key.tun_id, geneveh->vni); - geneveh->proto_type = htons(ETH_P_TEB); + geneveh->proto_type = inner_proto; geneveh->rsvd2 = 0; if (info->key.tun_flags & TUNNEL_GENEVE_OPT) @@ -734,8 +753,9 @@ static void geneve_build_header(struct genevehdr *geneveh, static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, const struct ip_tunnel_info *info, - bool xnet, int ip_hdr_len) + bool xnet, int ip_hdr_len, bool tun) { + __be16 inner_proto = tun ? skb->protocol : htons(ETH_P_TEB); bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); struct genevehdr *gnvh; int min_headroom; @@ -755,8 +775,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, goto free_dst; gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); - geneve_build_header(gnvh, info); - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + geneve_build_header(gnvh, info, inner_proto); + skb_set_inner_protocol(skb, inner_proto); return 0; free_dst: @@ -959,7 +979,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } } - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), + geneve->cfg.tun); if (unlikely(err)) return err; @@ -1038,7 +1059,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; ttl = ttl ? : ip6_dst_hoplimit(dst); } - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), + geneve->cfg.tun); if (unlikely(err)) return err; @@ -1388,6 +1410,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, dst_cache_reset(&geneve->cfg.info.dst_cache); memcpy(&geneve->cfg, cfg, sizeof(*cfg)); + if (geneve->cfg.tun) { + dev->header_ops = NULL; + dev->type = ARPHRD_NONE; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->flags = IFF_NOARP; + } + err = register_netdevice(dev); if (err) return err; @@ -1561,10 +1591,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], #endif } + if (data[IFLA_GENEVE_TUN]) { + if (changelink) { + attrtype = IFLA_GENEVE_TUN; + goto change_notsup; + } + cfg->tun = true; + } + return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); + "Changing VNI, Port, endpoint IP address family, external, tun, and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } @@ -1799,6 +1837,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) goto nla_put_failure; + if (geneve->cfg.tun && nla_put_flag(skb, IFLA_GENEVE_TUN)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bd24c7dc10a2..198aefa2c513 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -842,6 +842,7 @@ enum { IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, + IFLA_GENEVE_TUN, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
This patch adds support for encapsulating IPv4/IPv6 within GENEVE. In order use this, a new IFLA_GENEVE_TUN flag needs to be provided at device creation. This property cannot be changed for the time being. In case IP traffic is received on a non-tun device the drop count is increased. Signed-off-by: Eyal Birger <eyal.birger@gmail.com> --- drivers/net/geneve.c | 79 +++++++++++++++++++++++++++--------- include/uapi/linux/if_link.h | 1 + 2 files changed, 61 insertions(+), 19 deletions(-)