diff mbox series

[net,3/9] netfilter: nf_nat: undo erroneous tcp edemux lookup

Message ID 20210306121223.28711-4-pablo@netfilter.org (mailing list archive)
State Accepted
Commit 03a3ca37e4c6478e3a84f04c8429dd5889e107fd
Delegated to: Netdev Maintainers
Headers show
Series [net,1/9] uapi: nfnetlink_cthelper.h: fix userspace compilation error | expand

Checks

Context Check Description
netdev/cover_letter success Pull request
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net
netdev/subject_prefix success Link
netdev/cc_maintainers warning 3 maintainers not CCed: fw@strlen.de coreteam@netfilter.org kadlec@netfilter.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 49 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Pablo Neira Ayuso March 6, 2021, 12:12 p.m. UTC
From: Florian Westphal <fw@strlen.de>

Under extremely rare conditions TCP early demux will retrieve the wrong
socket.

1. local machine establishes a connection to a remote server, S, on port
   p.

   This gives:
   laddr:lport -> S:p
   ... both in tcp and conntrack.

2. local machine establishes a connection to host H, on port p2.
   2a. TCP stack choses same laddr:lport, so we have
   laddr:lport -> H:p2 from TCP point of view.
   2b). There is a destination NAT rewrite in place, translating
        H:p2 to S:p.  This results in following conntrack entries:

   I)  laddr:lport -> S:p  (origin)  S:p -> laddr:lport (reply)
   II) laddr:lport -> H:p2 (origin)  S:p -> laddr:lport2 (reply)

   NAT engine has rewritten laddr:lport to laddr:lport2 to map
   the reply packet to the correct origin.

   When server sends SYN/ACK to laddr:lport2, the PREROUTING hook
   will undo-the SNAT transformation, rewriting IP header to
   S:p -> laddr:lport

   This causes TCP early demux to associate the skb with the TCP socket
   of the first connection.

   The INPUT hook will then reverse the DNAT transformation, rewriting
   the IP header to H:p2 -> laddr:lport.

Because packet ends up with the wrong socket, the new connection
never completes: originator stays in SYN_SENT and conntrack entry
remains in SYN_RECV until timeout, and responder retransmits SYN/ACK
until it gives up.

To resolve this, orphan the skb after the input rewrite:
Because the source IP address changed, the socket must be incorrect.
We can't move the DNAT undo to prerouting due to backwards
compatibility, doing so will make iptables/nftables rules to no longer
match the way they did.

After orphan, the packet will be handed to the next protocol layer
(tcp, udp, ...) and that will repeat the socket lookup just like as if
early demux was disabled.

Fixes: 41063e9dd1195 ("ipv4: Early TCP socket demux.")
Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1427
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_proto.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

Comments

Mika Penttilä March 6, 2021, 2:49 p.m. UTC | #1
On 6.3.2021 14.12, Pablo Neira Ayuso wrote:
> From: Florian Westphal <fw@strlen.de>
>
> Under extremely rare conditions TCP early demux will retrieve the wrong
> socket.
>
> 1. local machine establishes a connection to a remote server, S, on port
>     p.
>
>     This gives:
>     laddr:lport -> S:p
>     ... both in tcp and conntrack.
>
> 2. local machine establishes a connection to host H, on port p2.
>     2a. TCP stack choses same laddr:lport, so we have
>     laddr:lport -> H:p2 from TCP point of view.
>     2b). There is a destination NAT rewrite in place, translating
>          H:p2 to S:p.  This results in following conntrack entries:
>
>     I)  laddr:lport -> S:p  (origin)  S:p -> laddr:lport (reply)
>     II) laddr:lport -> H:p2 (origin)  S:p -> laddr:lport2 (reply)
>
>     NAT engine has rewritten laddr:lport to laddr:lport2 to map
>     the reply packet to the correct origin.
Could you eloborate where and how linux nat engine is doing the

laddr:lport to laddr:lport2

rewrite? There's only DST nat and there should be conflict (for reply) 
in tuple establishment afaik....


>
>     When server sends SYN/ACK to laddr:lport2, the PREROUTING hook
>     will undo-the SNAT transformation, rewriting IP header to
>     S:p -> laddr:lport
>
>     This causes TCP early demux to associate the skb with the TCP socket
>     of the first connection.
>
>     The INPUT hook will then reverse the DNAT transformation, rewriting
>     the IP header to H:p2 -> laddr:lport.
>
> Because packet ends up with the wrong socket, the new connection
> never completes: originator stays in SYN_SENT and conntrack entry
> remains in SYN_RECV until timeout, and responder retransmits SYN/ACK
> until it gives up.
>
> To resolve this, orphan the skb after the input rewrite:
> Because the source IP address changed, the socket must be incorrect.
> We can't move the DNAT undo to prerouting due to backwards
> compatibility, doing so will make iptables/nftables rules to no longer
> match the way they did.
>
> After orphan, the packet will be handed to the next protocol layer
> (tcp, udp, ...) and that will repeat the socket lookup just like as if
> early demux was disabled.
>
> Fixes: 41063e9dd1195 ("ipv4: Early TCP socket demux.")
> Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1427
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>   net/netfilter/nf_nat_proto.c | 25 +++++++++++++++++++++----
>   1 file changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
> index e87b6bd6b3cd..4731d21fc3ad 100644
> --- a/net/netfilter/nf_nat_proto.c
> +++ b/net/netfilter/nf_nat_proto.c
> @@ -646,8 +646,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
>   }
>   
>   static unsigned int
> -nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
> -	       const struct nf_hook_state *state)
> +nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
> +			const struct nf_hook_state *state)
>   {
>   	unsigned int ret;
>   	__be32 daddr = ip_hdr(skb)->daddr;
> @@ -659,6 +659,23 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
>   	return ret;
>   }
>   
> +static unsigned int
> +nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
> +		     const struct nf_hook_state *state)
> +{
> +	__be32 saddr = ip_hdr(skb)->saddr;
> +	struct sock *sk = skb->sk;
> +	unsigned int ret;
> +
> +	ret = nf_nat_ipv4_fn(priv, skb, state);
> +
> +	if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr &&
> +	    !inet_sk_transparent(sk))
> +		skb_orphan(skb); /* TCP edemux obtained wrong socket */
> +
> +	return ret;
> +}
> +
>   static unsigned int
>   nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
>   		const struct nf_hook_state *state)
> @@ -736,7 +753,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
>   static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
>   	/* Before packet filtering, change destination */
>   	{
> -		.hook		= nf_nat_ipv4_in,
> +		.hook		= nf_nat_ipv4_pre_routing,
>   		.pf		= NFPROTO_IPV4,
>   		.hooknum	= NF_INET_PRE_ROUTING,
>   		.priority	= NF_IP_PRI_NAT_DST,
> @@ -757,7 +774,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
>   	},
>   	/* After packet filtering, change source */
>   	{
> -		.hook		= nf_nat_ipv4_fn,
> +		.hook		= nf_nat_ipv4_local_in,
>   		.pf		= NFPROTO_IPV4,
>   		.hooknum	= NF_INET_LOCAL_IN,
>   		.priority	= NF_IP_PRI_NAT_SRC,
Mika Penttilä March 6, 2021, 4:10 p.m. UTC | #2
On 6.3.2021 16.49, Mika Penttilä wrote:
>
>
> On 6.3.2021 14.12, Pablo Neira Ayuso wrote:
>> From: Florian Westphal <fw@strlen.de>
>>
>> Under extremely rare conditions TCP early demux will retrieve the wrong
>> socket.
>>
>> 1. local machine establishes a connection to a remote server, S, on port
>>     p.
>>
>>     This gives:
>>     laddr:lport -> S:p
>>     ... both in tcp and conntrack.
>>
>> 2. local machine establishes a connection to host H, on port p2.
>>     2a. TCP stack choses same laddr:lport, so we have
>>     laddr:lport -> H:p2 from TCP point of view.
>>     2b). There is a destination NAT rewrite in place, translating
>>          H:p2 to S:p.  This results in following conntrack entries:
>>
>>     I)  laddr:lport -> S:p  (origin)  S:p -> laddr:lport (reply)
>>     II) laddr:lport -> H:p2 (origin)  S:p -> laddr:lport2 (reply)
>>
>>     NAT engine has rewritten laddr:lport to laddr:lport2 to map
>>     the reply packet to the correct origin.
> Could you eloborate where and how linux nat engine is doing the
>
> laddr:lport to laddr:lport2
>
> rewrite? There's only DST nat and there should be conflict (for reply) 
> in tuple establishment afaik....

Ah I see it is the nat null binding for src to make it unique

>
>
>>
>>     When server sends SYN/ACK to laddr:lport2, the PREROUTING hook
>>     will undo-the SNAT transformation, rewriting IP header to
>>     S:p -> laddr:lport
>>
>>     This causes TCP early demux to associate the skb with the TCP socket
>>     of the first connection.
>>
>>     The INPUT hook will then reverse the DNAT transformation, rewriting
>>     the IP header to H:p2 -> laddr:lport.
>>
>> Because packet ends up with the wrong socket, the new connection
>> never completes: originator stays in SYN_SENT and conntrack entry
>> remains in SYN_RECV until timeout, and responder retransmits SYN/ACK
>> until it gives up.
>>
>> To resolve this, orphan the skb after the input rewrite:
>> Because the source IP address changed, the socket must be incorrect.
>> We can't move the DNAT undo to prerouting due to backwards
>> compatibility, doing so will make iptables/nftables rules to no longer
>> match the way they did.
>>
>> After orphan, the packet will be handed to the next protocol layer
>> (tcp, udp, ...) and that will repeat the socket lookup just like as if
>> early demux was disabled.
>>
>> Fixes: 41063e9dd1195 ("ipv4: Early TCP socket demux.")
>> Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1427
>> Signed-off-by: Florian Westphal <fw@strlen.de>
>> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>> ---
>>   net/netfilter/nf_nat_proto.c | 25 +++++++++++++++++++++----
>>   1 file changed, 21 insertions(+), 4 deletions(-)
>>
>> diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
>> index e87b6bd6b3cd..4731d21fc3ad 100644
>> --- a/net/netfilter/nf_nat_proto.c
>> +++ b/net/netfilter/nf_nat_proto.c
>> @@ -646,8 +646,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
>>   }
>>     static unsigned int
>> -nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
>> -           const struct nf_hook_state *state)
>> +nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
>> +            const struct nf_hook_state *state)
>>   {
>>       unsigned int ret;
>>       __be32 daddr = ip_hdr(skb)->daddr;
>> @@ -659,6 +659,23 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
>>       return ret;
>>   }
>>   +static unsigned int
>> +nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
>> +             const struct nf_hook_state *state)
>> +{
>> +    __be32 saddr = ip_hdr(skb)->saddr;
>> +    struct sock *sk = skb->sk;
>> +    unsigned int ret;
>> +
>> +    ret = nf_nat_ipv4_fn(priv, skb, state);
>> +
>> +    if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr &&
>> +        !inet_sk_transparent(sk))
>> +        skb_orphan(skb); /* TCP edemux obtained wrong socket */
>> +
>> +    return ret;
>> +}
>> +
>>   static unsigned int
>>   nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
>>           const struct nf_hook_state *state)
>> @@ -736,7 +753,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff 
>> *skb,
>>   static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
>>       /* Before packet filtering, change destination */
>>       {
>> -        .hook        = nf_nat_ipv4_in,
>> +        .hook        = nf_nat_ipv4_pre_routing,
>>           .pf        = NFPROTO_IPV4,
>>           .hooknum    = NF_INET_PRE_ROUTING,
>>           .priority    = NF_IP_PRI_NAT_DST,
>> @@ -757,7 +774,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] 
>> = {
>>       },
>>       /* After packet filtering, change source */
>>       {
>> -        .hook        = nf_nat_ipv4_fn,
>> +        .hook        = nf_nat_ipv4_local_in,
>>           .pf        = NFPROTO_IPV4,
>>           .hooknum    = NF_INET_LOCAL_IN,
>>           .priority    = NF_IP_PRI_NAT_SRC,
>
diff mbox series

Patch

diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index e87b6bd6b3cd..4731d21fc3ad 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -646,8 +646,8 @@  nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 }
 
 static unsigned int
-nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state)
+nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
+			const struct nf_hook_state *state)
 {
 	unsigned int ret;
 	__be32 daddr = ip_hdr(skb)->daddr;
@@ -659,6 +659,23 @@  nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
 	return ret;
 }
 
+static unsigned int
+nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
+		     const struct nf_hook_state *state)
+{
+	__be32 saddr = ip_hdr(skb)->saddr;
+	struct sock *sk = skb->sk;
+	unsigned int ret;
+
+	ret = nf_nat_ipv4_fn(priv, skb, state);
+
+	if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr &&
+	    !inet_sk_transparent(sk))
+		skb_orphan(skb); /* TCP edemux obtained wrong socket */
+
+	return ret;
+}
+
 static unsigned int
 nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 		const struct nf_hook_state *state)
@@ -736,7 +753,7 @@  nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
 static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	/* Before packet filtering, change destination */
 	{
-		.hook		= nf_nat_ipv4_in,
+		.hook		= nf_nat_ipv4_pre_routing,
 		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP_PRI_NAT_DST,
@@ -757,7 +774,7 @@  static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	},
 	/* After packet filtering, change source */
 	{
-		.hook		= nf_nat_ipv4_fn,
+		.hook		= nf_nat_ipv4_local_in,
 		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_LOCAL_IN,
 		.priority	= NF_IP_PRI_NAT_SRC,