diff mbox series

[net,v2] netfilter: nf_flow_table: fix teardown flow timeout

Message ID 20220512182803.6353-1-ozsh@nvidia.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series [net,v2] netfilter: nf_flow_table: fix teardown flow timeout | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5854 this patch: 5854
netdev/cc_maintainers warning 6 maintainers not CCed: coreteam@netfilter.org edumazet@google.com kadlec@netfilter.org pabeni@redhat.com kuba@kernel.org davem@davemloft.net
netdev/build_clang success Errors and warnings before: 1152 this patch: 1152
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 5993 this patch: 5993
netdev/checkpatch warning WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Oz Shlomo May 12, 2022, 6:28 p.m. UTC
Connections leaving the established state (due to RST / FIN TCP packets)
set the flow table teardown flag. The packet path continues to set lower
timeout value as per the new TCP state but the offload flag remains set.
Hence, the conntrack garbage collector may race to undo the timeout
adjustment of the packet path, leaving the conntrack entry in place with
the internal offload timeout (one day).

Avoid ct gc timeout overwrite by flagging torn-down flowtable
connections.

On the nftables side we only need to allow established TCP connections to
create a flow offload entry. Since we can not guarantee that
flow_offload_teardown is called by a TCP FIN packet, we also need to make
sure that flow_offload_fixup_ct is called in flow_offload_del
and only fixes up established TCP connections.

Fixes: 1e5b2471bcc4 ("netfilter: nf_flow_table: teardown flow timeout race")
Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

------------

v1 -> v2 changes
- Add flow_teardown flag
- Add nftables handling
- Fixup timeout according to the current ct state
---
 include/uapi/linux/netfilter/nf_conntrack_common.h |  6 +++-
 net/netfilter/nf_conntrack_core.c                  |  3 +-
 net/netfilter/nf_flow_table_core.c                 | 42 +++++++++-------------
 net/netfilter/nft_flow_offload.c                   |  2 ++
 4 files changed, 25 insertions(+), 28 deletions(-)

Comments

Pablo Neira Ayuso May 16, 2022, 10:56 a.m. UTC | #1
On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> Connections leaving the established state (due to RST / FIN TCP packets)
> set the flow table teardown flag. The packet path continues to set lower
> timeout value as per the new TCP state but the offload flag remains set.
>
> Hence, the conntrack garbage collector may race to undo the timeout
> adjustment of the packet path, leaving the conntrack entry in place with
> the internal offload timeout (one day).
>
> Avoid ct gc timeout overwrite by flagging teared down flowtable
> connections.
>
> On the nftables side we only need to allow established TCP connections to
> create a flow offload entry. Since we can not guaruantee that
> flow_offload_teardown is called by a TCP FIN packet we also need to make
> sure that flow_offload_fixup_ct is also called in flow_offload_del
> and only fixes up established TCP connections.
[...]
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 0164e5f522e8..324fdb62c08b 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
>  			tmp = nf_ct_tuplehash_to_ctrack(h);
>  
>  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> -				nf_ct_offload_timeout(tmp);

Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
path that triggers the race, ie. nf_ct_is_expired()

The flowtable ct fixup races with conntrack gc collector.

Clearing IPS_OFFLOAD might result in offloading the entry again for
the closing packets.

Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
in a TCP state that represent closure?

  		if (unlikely(!tcph || tcph->fin || tcph->rst))
  			goto out;

this is already the intention in the existing code.

If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
i.e. not in uapi? Define it at include/net/netfilter/nf_conntrack.h and
add a comment regarding this to avoid an overlap in the future.

> +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> +					nf_ct_offload_timeout(tmp);
>  				continue;
>  			}
>  
> diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> index 3db256da919b..aaed1a244013 100644
> --- a/net/netfilter/nf_flow_table_core.c
> +++ b/net/netfilter/nf_flow_table_core.c
> @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
>  }
>  EXPORT_SYMBOL_GPL(flow_offload_route_init);
>  
> -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> -{
> -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> -	tcp->seen[0].td_maxwin = 0;
> -	tcp->seen[1].td_maxwin = 0;
> -}
>  
> -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> +static void flow_offload_fixup_ct(struct nf_conn *ct)
>  {
>  	struct net *net = nf_ct_net(ct);
>  	int l4num = nf_ct_protonum(ct);
> @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
>  
>  	if (l4num == IPPROTO_TCP) {
>  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> +
> +		tcp->seen[0].td_maxwin = 0;
> +		tcp->seen[1].td_maxwin = 0;
>  
> -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> +		timeout = tn->timeouts[ct->proto.tcp.state];
>  		timeout -= tn->offload_timeout;
>  	} else if (l4num == IPPROTO_UDP) {
>  		struct nf_udp_net *tn = nf_udp_pernet(net);
> @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
>  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
>  }
>  
> -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> -{
> -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> -		flow_offload_fixup_tcp(&ct->proto.tcp);
> -}
> -
> -static void flow_offload_fixup_ct(struct nf_conn *ct)
> -{
> -	flow_offload_fixup_ct_state(ct);
> -	flow_offload_fixup_ct_timeout(ct);
> -}
> -
>  static void flow_offload_route_release(struct flow_offload *flow)
>  {
>  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
>  static void flow_offload_del(struct nf_flowtable *flow_table,
>  			     struct flow_offload *flow)
>  {
> +	struct nf_conn *ct = flow->ct;
> +
> +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> +
>  	rhashtable_remove_fast(&flow_table->rhashtable,
>  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
>  			       nf_flow_offload_rhash_params);
> @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
>  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
>  			       nf_flow_offload_rhash_params);
>  
> -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> -
>  	if (nf_flow_has_expired(flow))
> -		flow_offload_fixup_ct(flow->ct);
> -	else
> -		flow_offload_fixup_ct_timeout(flow->ct);
> +		flow_offload_fixup_ct(ct);

Very unlikely, but race might still happen between fixup and
clear IPS_OFFLOAD_BIT with gc below?

Without checking from the packet path, the conntrack gc might race to
refresh the timeout, I don't see a 100% race free solution.

Probably update the nf_ct_offload_timeout to a shorter value than a
day would mitigate this issue too.

> +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
>  
>  	flow_offload_free(flow);
>  }
> @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
>  void flow_offload_teardown(struct flow_offload *flow)
>  {
>  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
>  
> -	flow_offload_fixup_ct_state(flow->ct);
> +	flow_offload_fixup_ct(flow->ct);
>  }
>  EXPORT_SYMBOL_GPL(flow_offload_teardown);
>  
> diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> index 900d48c810a1..9cc3ea08eb3a 100644
> --- a/net/netfilter/nft_flow_offload.c
> +++ b/net/netfilter/nft_flow_offload.c
> @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
>  					  sizeof(_tcph), &_tcph);
>  		if (unlikely(!tcph || tcph->fin || tcph->rst))
>  			goto out;
> +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> +			goto out;

This chunk is not required, from ruleset users can do

        ... ct status assured ...

instead.

>  		break;
>  	case IPPROTO_UDP:
>  		break;
> -- 
> 1.8.3.1
>
Sven Auhagen May 16, 2022, 11:18 a.m. UTC | #2
On Mon, May 16, 2022 at 12:56:38PM +0200, Pablo Neira Ayuso wrote:
> On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > Connections leaving the established state (due to RST / FIN TCP packets)
> > set the flow table teardown flag. The packet path continues to set lower
> > timeout value as per the new TCP state but the offload flag remains set.
> >
> > Hence, the conntrack garbage collector may race to undo the timeout
> > adjustment of the packet path, leaving the conntrack entry in place with
> > the internal offload timeout (one day).
> >
> > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > connections.
> >
> > On the nftables side we only need to allow established TCP connections to
> > create a flow offload entry. Since we can not guaruantee that
> > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > and only fixes up established TCP connections.
> [...]
> > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > index 0164e5f522e8..324fdb62c08b 100644
> > --- a/net/netfilter/nf_conntrack_core.c
> > +++ b/net/netfilter/nf_conntrack_core.c
> > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> >  
> >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > -				nf_ct_offload_timeout(tmp);
> 
> Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> path that triggers the race, ie. nf_ct_is_expired()
> 
> The flowtable ct fixup races with conntrack gc collector.
> 
> Clearing IPS_OFFLOAD might result in offloading the entry again for
> the closing packets.
> 
> Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> in a TCP state that represent closure?
> 
>   		if (unlikely(!tcph || tcph->fin || tcph->rst))
>   			goto out;
> 
> this is already the intention in the existing code.
> 
> If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
> ie. no in uapi? Define it at include/net/netfilter/nf_conntrack.h and
> add a comment regarding this to avoid an overlap in the future.
> 
> > +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> > +					nf_ct_offload_timeout(tmp);
> >  				continue;
> >  			}
> >  
> > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > index 3db256da919b..aaed1a244013 100644
> > --- a/net/netfilter/nf_flow_table_core.c
> > +++ b/net/netfilter/nf_flow_table_core.c
> > @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
> >  }
> >  EXPORT_SYMBOL_GPL(flow_offload_route_init);
> >  
> > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > -{
> > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > -	tcp->seen[0].td_maxwin = 0;
> > -	tcp->seen[1].td_maxwin = 0;
> > -}
> >  
> > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> >  {
> >  	struct net *net = nf_ct_net(ct);
> >  	int l4num = nf_ct_protonum(ct);
> > @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> >  
> >  	if (l4num == IPPROTO_TCP) {
> >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> > +
> > +		tcp->seen[0].td_maxwin = 0;
> > +		tcp->seen[1].td_maxwin = 0;
> >  
> > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > +		timeout = tn->timeouts[ct->proto.tcp.state];
> >  		timeout -= tn->offload_timeout;
> >  	} else if (l4num == IPPROTO_UDP) {
> >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> >  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> >  }
> >  
> > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > -{
> > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > -}
> > -
> > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > -{
> > -	flow_offload_fixup_ct_state(ct);
> > -	flow_offload_fixup_ct_timeout(ct);
> > -}
> > -
> >  static void flow_offload_route_release(struct flow_offload *flow)
> >  {
> >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
> >  static void flow_offload_del(struct nf_flowtable *flow_table,
> >  			     struct flow_offload *flow)
> >  {
> > +	struct nf_conn *ct = flow->ct;
> > +
> > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > +
> >  	rhashtable_remove_fast(&flow_table->rhashtable,
> >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
> >  			       nf_flow_offload_rhash_params);
> > @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> >  			       nf_flow_offload_rhash_params);
> >  
> > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > -
> >  	if (nf_flow_has_expired(flow))
> > -		flow_offload_fixup_ct(flow->ct);
> > -	else
> > -		flow_offload_fixup_ct_timeout(flow->ct);
> > +		flow_offload_fixup_ct(ct);
> 
> Very unlikely, but race might still happen between fixup and
> clear IPS_OFFLOAD_BIT with gc below?
> 
> Without checking from the packet path, the conntrack gc might race to
> refresh the timeout, I don't see a 100% race free solution.
> 
> Probably update the nf_ct_offload_timeout to a shorter value than a
> day would mitigate this issue too.

This section of the code is now protected by IPS_OFFLOAD_TEARDOWN_BIT
which will prevent the update via nf_ct_offload_timeout.
We set it at the beginning of flow_offload_del and flow_offload_teardown.

Since flow_offload_teardown is only called on TCP packets
we also need to set it at flow_offload_del to prevent the race.

This should prevent the race at this point.

> 
> > +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> > +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
> >  
> >  	flow_offload_free(flow);
> >  }
> > @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >  void flow_offload_teardown(struct flow_offload *flow)
> >  {
> >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> >  
> > -	flow_offload_fixup_ct_state(flow->ct);
> > +	flow_offload_fixup_ct(flow->ct);
> >  }
> >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> >  
> > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > index 900d48c810a1..9cc3ea08eb3a 100644
> > --- a/net/netfilter/nft_flow_offload.c
> > +++ b/net/netfilter/nft_flow_offload.c
> > @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> >  					  sizeof(_tcph), &_tcph);
> >  		if (unlikely(!tcph || tcph->fin || tcph->rst))
> >  			goto out;
> > +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> > +			goto out;
> 
> This chunk is not required, from ruleset users can do
> 
>         ... ct status assured ...
> 
> instead.

Maybe this should be mentioned in the manual or wiki if
it is not necessary in the flow offload code.

> 
> >  		break;
> >  	case IPPROTO_UDP:
> >  		break;
> > -- 
> > 1.8.3.1
> >
Pablo Neira Ayuso May 16, 2022, 11:37 a.m. UTC | #3
On Mon, May 16, 2022 at 01:18:17PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 12:56:38PM +0200, Pablo Neira Ayuso wrote:
> > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > set the flow table teardown flag. The packet path continues to set lower
> > > timeout value as per the new TCP state but the offload flag remains set.
> > >
> > > Hence, the conntrack garbage collector may race to undo the timeout
> > > adjustment of the packet path, leaving the conntrack entry in place with
> > > the internal offload timeout (one day).
> > >
> > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > connections.
> > >
> > > On the nftables side we only need to allow established TCP connections to
> > > create a flow offload entry. Since we can not guaruantee that
> > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > and only fixes up established TCP connections.
> > [...]
> > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > index 0164e5f522e8..324fdb62c08b 100644
> > > --- a/net/netfilter/nf_conntrack_core.c
> > > +++ b/net/netfilter/nf_conntrack_core.c
> > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > >  
> > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > -				nf_ct_offload_timeout(tmp);
> > 
> > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > path that triggers the race, ie. nf_ct_is_expired()
> > 
> > The flowtable ct fixup races with conntrack gc collector.
> > 
> > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > the closing packets.
> > 
> > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > in a TCP state that represent closure?
>
> >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> >   			goto out;
> > 
> > this is already the intention in the existing code.
> > 
> > If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
> > ie. no in uapi? Define it at include/net/netfilter/nf_conntrack.h and
> > add a comment regarding this to avoid an overlap in the future.
> > 
> > > +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> > > +					nf_ct_offload_timeout(tmp);
> > >  				continue;
> > >  			}
> > >  
> > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > > index 3db256da919b..aaed1a244013 100644
> > > --- a/net/netfilter/nf_flow_table_core.c
> > > +++ b/net/netfilter/nf_flow_table_core.c
> > > @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
> > >  }
> > >  EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > >  
> > > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > > -{
> > > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > > -	tcp->seen[0].td_maxwin = 0;
> > > -	tcp->seen[1].td_maxwin = 0;
> > > -}
> > >  
> > > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> > >  {
> > >  	struct net *net = nf_ct_net(ct);
> > >  	int l4num = nf_ct_protonum(ct);
> > > @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > >  
> > >  	if (l4num == IPPROTO_TCP) {
> > >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > > +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> > > +
> > > +		tcp->seen[0].td_maxwin = 0;
> > > +		tcp->seen[1].td_maxwin = 0;
> > >  
> > > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > > +		timeout = tn->timeouts[ct->proto.tcp.state];
> > >  		timeout -= tn->offload_timeout;
> > >  	} else if (l4num == IPPROTO_UDP) {
> > >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > > @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > >  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> > >  }
> > >  
> > > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > > -{
> > > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > > -}
> > > -
> > > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > -{
> > > -	flow_offload_fixup_ct_state(ct);
> > > -	flow_offload_fixup_ct_timeout(ct);
> > > -}
> > > -
> > >  static void flow_offload_route_release(struct flow_offload *flow)
> > >  {
> > >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > > @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
> > >  static void flow_offload_del(struct nf_flowtable *flow_table,
> > >  			     struct flow_offload *flow)
> > >  {
> > > +	struct nf_conn *ct = flow->ct;
> > > +
> > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > +
> > >  	rhashtable_remove_fast(&flow_table->rhashtable,
> > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
> > >  			       nf_flow_offload_rhash_params);
> > > @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> > >  			       nf_flow_offload_rhash_params);
> > >  
> > > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > > -
> > >  	if (nf_flow_has_expired(flow))
> > > -		flow_offload_fixup_ct(flow->ct);
> > > -	else
> > > -		flow_offload_fixup_ct_timeout(flow->ct);
> > > +		flow_offload_fixup_ct(ct);
> > 
> > Very unlikely, but race might still happen between fixup and
> > clear IPS_OFFLOAD_BIT with gc below?
> > 
> > Without checking from the packet path, the conntrack gc might race to
> > refresh the timeout, I don't see a 100% race free solution.
> > 
> > Probably update the nf_ct_offload_timeout to a shorter value than a
> > day would mitigate this issue too.
> 
> This section of the code is now protected by IPS_OFFLOAD_TEARDOWN_BIT
> which will prevent the update via nf_ct_offload_timeout.
> We set it at the beginning of flow_offload_del and flow_offload_teardown.
> 
> Since flow_offload_teardown is only called on TCP packets
> we also need to set it at flow_offload_del to prevent the race.
> 
> This should prevent the race at this point.

OK.

> > > +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> > > +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
> > >  
> > >  	flow_offload_free(flow);
> > >  }
> > > @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > >  void flow_offload_teardown(struct flow_offload *flow)
> > >  {
> > >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > >  
> > > -	flow_offload_fixup_ct_state(flow->ct);
> > > +	flow_offload_fixup_ct(flow->ct);
> > >  }
> > >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > >  
> > > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > > index 900d48c810a1..9cc3ea08eb3a 100644
> > > --- a/net/netfilter/nft_flow_offload.c
> > > +++ b/net/netfilter/nft_flow_offload.c
> > > @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> > >  					  sizeof(_tcph), &_tcph);
> > >  		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > >  			goto out;
> > > +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> > > +			goto out;
> > 
> > This chunk is not required, from ruleset users can do
> > 
> >         ... ct status assured ...
> > 
> > instead.
> 
> Maybe this should be mentioned in the manual or wiki if
> it is not necessary in the flow offload code.

Yes, documentation and wiki can be updated.

Users might want to offload the flow at a later stage in the TCP
connection.
Pablo Neira Ayuso May 16, 2022, 12:06 p.m. UTC | #4
On Mon, May 16, 2022 at 01:37:40PM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 01:18:17PM +0200, Sven Auhagen wrote:
> > On Mon, May 16, 2022 at 12:56:38PM +0200, Pablo Neira Ayuso wrote:
> > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > > set the flow table teardown flag. The packet path continues to set lower
> > > > timeout value as per the new TCP state but the offload flag remains set.
> > > >
> > > > Hence, the conntrack garbage collector may race to undo the timeout
> > > > adjustment of the packet path, leaving the conntrack entry in place with
> > > > the internal offload timeout (one day).
> > > >
> > > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > > connections.
> > > >
> > > > On the nftables side we only need to allow established TCP connections to
> > > > create a flow offload entry. Since we can not guaruantee that
> > > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > > and only fixes up established TCP connections.
> > > [...]
> > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > >  
> > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > -				nf_ct_offload_timeout(tmp);
> > > 
> > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > path that triggers the race, ie. nf_ct_is_expired()
> > > 
> > > The flowtable ct fixup races with conntrack gc collector.
> > > 
> > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > the closing packets.
> > > 
> > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > in a TCP state that represent closure?
> >
> > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > >   			goto out;
> > > 
> > > this is already the intention in the existing code.
> > > 
> > > If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
> > > ie. no in uapi? Define it at include/net/netfilter/nf_conntrack.h and
> > > add a comment regarding this to avoid an overlap in the future.
> > > 
> > > > +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> > > > +					nf_ct_offload_timeout(tmp);
> > > >  				continue;
> > > >  			}
> > > >  
> > > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > > > index 3db256da919b..aaed1a244013 100644
> > > > --- a/net/netfilter/nf_flow_table_core.c
> > > > +++ b/net/netfilter/nf_flow_table_core.c
> > > > @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > > >  
> > > > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > > > -{
> > > > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > > > -	tcp->seen[0].td_maxwin = 0;
> > > > -	tcp->seen[1].td_maxwin = 0;
> > > > -}
> > > >  
> > > > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > >  {
> > > >  	struct net *net = nf_ct_net(ct);
> > > >  	int l4num = nf_ct_protonum(ct);
> > > > @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > >  
> > > >  	if (l4num == IPPROTO_TCP) {
> > > >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > > > +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> > > > +
> > > > +		tcp->seen[0].td_maxwin = 0;
> > > > +		tcp->seen[1].td_maxwin = 0;
> > > >  
> > > > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > > > +		timeout = tn->timeouts[ct->proto.tcp.state];
> > > >  		timeout -= tn->offload_timeout;
> > > >  	} else if (l4num == IPPROTO_UDP) {
> > > >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > > > @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > >  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> > > >  }
> > > >  
> > > > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > > > -{
> > > > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > > > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > > > -}
> > > > -
> > > > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > -{
> > > > -	flow_offload_fixup_ct_state(ct);
> > > > -	flow_offload_fixup_ct_timeout(ct);
> > > > -}
> > > > -
> > > >  static void flow_offload_route_release(struct flow_offload *flow)
> > > >  {
> > > >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > > > @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
> > > >  static void flow_offload_del(struct nf_flowtable *flow_table,
> > > >  			     struct flow_offload *flow)
> > > >  {
> > > > +	struct nf_conn *ct = flow->ct;
> > > > +
> > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > +
> > > >  	rhashtable_remove_fast(&flow_table->rhashtable,
> > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
> > > >  			       nf_flow_offload_rhash_params);
> > > > @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> > > >  			       nf_flow_offload_rhash_params);
> > > >  
> > > > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > > > -
> > > >  	if (nf_flow_has_expired(flow))
> > > > -		flow_offload_fixup_ct(flow->ct);
> > > > -	else
> > > > -		flow_offload_fixup_ct_timeout(flow->ct);
> > > > +		flow_offload_fixup_ct(ct);
> > > 
> > > Very unlikely, but race might still happen between fixup and
> > > clear IPS_OFFLOAD_BIT with gc below?
> > > 
> > > Without checking from the packet path, the conntrack gc might race to
> > > refresh the timeout, I don't see a 100% race free solution.
> > > 
> > > Probably update the nf_ct_offload_timeout to a shorter value than a
> > > day would mitigate this issue too.
> > 
> > This section of the code is now protected by IPS_OFFLOAD_TEARDOWN_BIT
> > which will prevent the update via nf_ct_offload_timeout.
> > We set it at the beginning of flow_offload_del and flow_offload_teardown.
> > 
> > Since flow_offload_teardown is only called on TCP packets
> > we also need to set it at flow_offload_del to prevent the race.
> > 
> > This should prevent the race at this point.
> 
> OK.
> 
> > > > +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> > > > +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
> > > >  
> > > >  	flow_offload_free(flow);
> > > >  }
> > > > @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > >  void flow_offload_teardown(struct flow_offload *flow)
> > > >  {
> > > >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > >  
> > > > -	flow_offload_fixup_ct_state(flow->ct);
> > > > +	flow_offload_fixup_ct(flow->ct);
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > > >  
> > > > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > > > index 900d48c810a1..9cc3ea08eb3a 100644
> > > > --- a/net/netfilter/nft_flow_offload.c
> > > > +++ b/net/netfilter/nft_flow_offload.c
> > > > @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> > > >  					  sizeof(_tcph), &_tcph);
> > > >  		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > >  			goto out;
> > > > +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> > > > +			goto out;
> > > 
> > > This chunk is not required, from ruleset users can do
> > > 
> > >         ... ct status assured ...
> > > 
> > > instead.
> > 
> > Maybe this should be mentioned in the manual or wiki if
> > it is not necessary in the flow offload code.
> 
> Yes, documentation and wiki can be updated.
> 
> Users might want to offload the flow at a later stage in the TCP
> connection.

Well, actually there is no later stage than established, anything
after established are TCP teardown states.

What's the issue with allowing to offload from SYN_RECV state?
Pablo Neira Ayuso May 16, 2022, 12:13 p.m. UTC | #5
On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > Connections leaving the established state (due to RST / FIN TCP packets)
> > set the flow table teardown flag. The packet path continues to set lower
> > timeout value as per the new TCP state but the offload flag remains set.
> >
> > Hence, the conntrack garbage collector may race to undo the timeout
> > adjustment of the packet path, leaving the conntrack entry in place with
> > the internal offload timeout (one day).
> >
> > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > connections.
> >
> > On the nftables side we only need to allow established TCP connections to
> > create a flow offload entry. Since we can not guaruantee that
> > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > and only fixes up established TCP connections.
> [...]
> > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > index 0164e5f522e8..324fdb62c08b 100644
> > --- a/net/netfilter/nf_conntrack_core.c
> > +++ b/net/netfilter/nf_conntrack_core.c
> > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> >  
> >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > -				nf_ct_offload_timeout(tmp);
> 
> Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> path that triggers the race, ie. nf_ct_is_expired()
> 
> The flowtable ct fixup races with conntrack gc collector.
> 
> Clearing IPS_OFFLOAD might result in offloading the entry again for
> the closing packets.
> 
> Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> in a TCP state that represent closure?
> 
>   		if (unlikely(!tcph || tcph->fin || tcph->rst))
>   			goto out;
> 
> this is already the intention in the existing code.

I'm attaching an incomplete sketch patch. My goal is to avoid the
extra IPS_ bit.
Sven Auhagen May 16, 2022, 12:17 p.m. UTC | #6
On Mon, May 16, 2022 at 02:06:28PM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 01:37:40PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 01:18:17PM +0200, Sven Auhagen wrote:
> > > On Mon, May 16, 2022 at 12:56:38PM +0200, Pablo Neira Ayuso wrote:
> > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > > > set the flow table teardown flag. The packet path continues to set lower
> > > > > timeout value as per the new TCP state but the offload flag remains set.
> > > > >
> > > > > Hence, the conntrack garbage collector may race to undo the timeout
> > > > > adjustment of the packet path, leaving the conntrack entry in place with
> > > > > the internal offload timeout (one day).
> > > > >
> > > > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > > > connections.
> > > > >
> > > > > On the nftables side we only need to allow established TCP connections to
> > > > > create a flow offload entry. Since we can not guaruantee that
> > > > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > > > and only fixes up established TCP connections.
> > > > [...]
> > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > >  
> > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > -				nf_ct_offload_timeout(tmp);
> > > > 
> > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > 
> > > > The flowtable ct fixup races with conntrack gc collector.
> > > > 
> > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > the closing packets.
> > > > 
> > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > in a TCP state that represent closure?
> > >
> > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > >   			goto out;
> > > > 
> > > > this is already the intention in the existing code.
> > > > 
> > > > If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
> > > > ie. no in uapi? Define it at include/net/netfilter/nf_conntrack.h and
> > > > add a comment regarding this to avoid an overlap in the future.
> > > > 
> > > > > +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> > > > > +					nf_ct_offload_timeout(tmp);
> > > > >  				continue;
> > > > >  			}
> > > > >  
> > > > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > > > > index 3db256da919b..aaed1a244013 100644
> > > > > --- a/net/netfilter/nf_flow_table_core.c
> > > > > +++ b/net/netfilter/nf_flow_table_core.c
> > > > > @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
> > > > >  }
> > > > >  EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > > > >  
> > > > > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > > > > -{
> > > > > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > > > > -	tcp->seen[0].td_maxwin = 0;
> > > > > -	tcp->seen[1].td_maxwin = 0;
> > > > > -}
> > > > >  
> > > > > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > >  {
> > > > >  	struct net *net = nf_ct_net(ct);
> > > > >  	int l4num = nf_ct_protonum(ct);
> > > > > @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > >  
> > > > >  	if (l4num == IPPROTO_TCP) {
> > > > >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > > > > +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> > > > > +
> > > > > +		tcp->seen[0].td_maxwin = 0;
> > > > > +		tcp->seen[1].td_maxwin = 0;
> > > > >  
> > > > > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > > > > +		timeout = tn->timeouts[ct->proto.tcp.state];
> > > > >  		timeout -= tn->offload_timeout;
> > > > >  	} else if (l4num == IPPROTO_UDP) {
> > > > >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > > > > @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > >  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> > > > >  }
> > > > >  
> > > > > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > > > > -{
> > > > > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > > > > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > > > > -}
> > > > > -
> > > > > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > > -{
> > > > > -	flow_offload_fixup_ct_state(ct);
> > > > > -	flow_offload_fixup_ct_timeout(ct);
> > > > > -}
> > > > > -
> > > > >  static void flow_offload_route_release(struct flow_offload *flow)
> > > > >  {
> > > > >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > > > > @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
> > > > >  static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > >  			     struct flow_offload *flow)
> > > > >  {
> > > > > +	struct nf_conn *ct = flow->ct;
> > > > > +
> > > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > > +
> > > > >  	rhashtable_remove_fast(&flow_table->rhashtable,
> > > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
> > > > >  			       nf_flow_offload_rhash_params);
> > > > > @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> > > > >  			       nf_flow_offload_rhash_params);
> > > > >  
> > > > > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > > > > -
> > > > >  	if (nf_flow_has_expired(flow))
> > > > > -		flow_offload_fixup_ct(flow->ct);
> > > > > -	else
> > > > > -		flow_offload_fixup_ct_timeout(flow->ct);
> > > > > +		flow_offload_fixup_ct(ct);
> > > > 
> > > > Very unlikely, but race might still happen between fixup and
> > > > clear IPS_OFFLOAD_BIT with gc below?
> > > > 
> > > > Without checking from the packet path, the conntrack gc might race to
> > > > refresh the timeout, I don't see a 100% race free solution.
> > > > 
> > > > Probably update the nf_ct_offload_timeout to a shorter value than a
> > > > day would mitigate this issue too.
> > > 
> > > This section of the code is now protected by IPS_OFFLOAD_TEARDOWN_BIT
> > > which will prevent the update via nf_ct_offload_timeout.
> > > We set it at the beginning of flow_offload_del and flow_offload_teardown.
> > > 
> > > Since flow_offload_teardown is only called on TCP packets
> > > we also need to set it at flow_offload_del to prevent the race.
> > > 
> > > This should prevent the race at this point.
> > 
> > OK.
> > 
> > > > > +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> > > > > +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
> > > > >  
> > > > >  	flow_offload_free(flow);
> > > > >  }
> > > > > @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > >  void flow_offload_teardown(struct flow_offload *flow)
> > > > >  {
> > > > >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > >  
> > > > > -	flow_offload_fixup_ct_state(flow->ct);
> > > > > +	flow_offload_fixup_ct(flow->ct);
> > > > >  }
> > > > >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > > > >  
> > > > > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > > > > index 900d48c810a1..9cc3ea08eb3a 100644
> > > > > --- a/net/netfilter/nft_flow_offload.c
> > > > > +++ b/net/netfilter/nft_flow_offload.c
> > > > > @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> > > > >  					  sizeof(_tcph), &_tcph);
> > > > >  		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > >  			goto out;
> > > > > +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> > > > > +			goto out;
> > > > 
> > > > This chunk is not required, from ruleset users can do
> > > > 
> > > >         ... ct status assured ...
> > > > 
> > > > instead.
> > > 
> > > Maybe this should be mentioned in the manual or wiki if
> > > it is not necessary in the flow offload code.
> > 
> > Yes, documentation and wiki can be updated.
> > 
> > Users might want to offload the flow at a later stage in the TCP
> > connection.
> 
> Well, actually there is not later stage than established, anything
> after established are TCP teardown states.
> 
> What's the issue with allowing to offload from SYN_RECV state?

There were multiple problem in general with the code.
flow_offload_fixup_tcp always moves a TCP connection
to established even if it is in FIN or CLOSE.

The flow_offload_del function was always setting the TCP timeout
to ESTABLISHED timeout even when the state was in CLOSE and therefore
creating a very long lasting dead state.

Since we might miss or bump packets to slow path, we do not know
what will happen there when we are still in SYN_RECV.

We will have better knowledge of the TCP state when we are in 
established first and we know that we are either still in it or
we have moved past it to a closing state.
Sven Auhagen May 16, 2022, 12:23 p.m. UTC | #7
On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > set the flow table teardown flag. The packet path continues to set lower
> > > timeout value as per the new TCP state but the offload flag remains set.
> > >
> > > Hence, the conntrack garbage collector may race to undo the timeout
> > > adjustment of the packet path, leaving the conntrack entry in place with
> > > the internal offload timeout (one day).
> > >
> > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > connections.
> > >
> > > On the nftables side we only need to allow established TCP connections to
> > > create a flow offload entry. Since we can not guaruantee that
> > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > and only fixes up established TCP connections.
> > [...]
> > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > index 0164e5f522e8..324fdb62c08b 100644
> > > --- a/net/netfilter/nf_conntrack_core.c
> > > +++ b/net/netfilter/nf_conntrack_core.c
> > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > >  
> > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > -				nf_ct_offload_timeout(tmp);
> > 
> > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > path that triggers the race, ie. nf_ct_is_expired()
> > 
> > The flowtable ct fixup races with conntrack gc collector.
> > 
> > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > the closing packets.
> > 
> > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > in a TCP state that represent closure?
> > 
> >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> >   			goto out;
> > 
> > this is already the intention in the existing code.
> 
> I'm attaching an incomplete sketch patch. My goal is to avoid the
> extra IPS_ bit.

You might create a race with ct gc that will remove the ct
if it is in close or end of close and before flow offload teardown is running
so flow offload teardown might access memory that was freed.
It is not a very likely scenario but nevertheless it might happen now
since the IPS_OFFLOAD_BIT is not set and the state might just time out.

If someone sets a very small TCP CLOSE timeout it gets more likely.

So Oz and I were debating about three possible cases/problems:

1. ct gc sets timeout even though the state is in CLOSE/FIN because the
IPS_OFFLOAD is still set but the flow is in teardown
2. ct gc removes the ct because the IPS_OFFLOAD is not set and
the CLOSE timeout is reached before the flow offload del
3. tcp ct is always set to ESTABLISHED with a very long timeout
in flow offload teardown/delete even though the state is already
CLOSED.

Also as a remark we can not assume that the FIN or RST packet is hitting
flow table teardown as the packet might get bumped to the slow path in
nftables.

Best
Sven
Pablo Neira Ayuso May 16, 2022, 12:43 p.m. UTC | #8
On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > > set the flow table teardown flag. The packet path continues to set lower
> > > > timeout value as per the new TCP state but the offload flag remains set.
> > > >
> > > > Hence, the conntrack garbage collector may race to undo the timeout
> > > > adjustment of the packet path, leaving the conntrack entry in place with
> > > > the internal offload timeout (one day).
> > > >
> > > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > > connections.
> > > >
> > > > On the nftables side we only need to allow established TCP connections to
> > > > create a flow offload entry. Since we can not guaruantee that
> > > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > > and only fixes up established TCP connections.
> > > [...]
> > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > >  
> > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > -				nf_ct_offload_timeout(tmp);
> > > 
> > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > path that triggers the race, ie. nf_ct_is_expired()
> > > 
> > > The flowtable ct fixup races with conntrack gc collector.
> > > 
> > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > the closing packets.
> > > 
> > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > in a TCP state that represent closure?
> > > 
> > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > >   			goto out;
> > > 
> > > this is already the intention in the existing code.
> > 
> > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > extra IPS_ bit.
> 
> You might create a race with ct gc that will remove the ct
> if it is in close or end of close and before flow offload teardown is running
> so flow offload teardown might access memory that was freed.

flow object holds a reference to the ct object until it is released,
no use-after-free can happen.

> It is not a very likely scenario but never the less it might happen now
> since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> 
> If someone sets a very small TCP CLOSE timeout it gets more likely.
> 
> So Oz and myself were debatting about three possible cases/problems:
> 
> 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> IPS_OFFLOAD is still set but the flow is in teardown
> 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> the CLOSE timeout is reached before the flow offload del

OK.

> 3. tcp ct is always set to ESTABLISHED with a very long timeout
> in flow offload teardown/delete even though the state is already
> CLOSED.
>
> Also as a remark we can not assume that the FIN or RST packet is hitting
> flow table teardown as the packet might get bumped to the slow path in
> nftables.

I assume this remark is related to 3.?

if IPS_OFFLOAD is unset, then conntrack would update the state
according to this FIN or RST.

Thanks for the summary.
Sven Auhagen May 16, 2022, 1:02 p.m. UTC | #9
On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > > > set the flow table teardown flag. The packet path continues to set lower
> > > > > timeout value as per the new TCP state but the offload flag remains set.
> > > > >
> > > > > Hence, the conntrack garbage collector may race to undo the timeout
> > > > > adjustment of the packet path, leaving the conntrack entry in place with
> > > > > the internal offload timeout (one day).
> > > > >
> > > > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > > > connections.
> > > > >
> > > > > On the nftables side we only need to allow established TCP connections to
> > > > > create a flow offload entry. Since we can not guaruantee that
> > > > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > > > and only fixes up established TCP connections.
> > > > [...]
> > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > >  
> > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > -				nf_ct_offload_timeout(tmp);
> > > > 
> > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > 
> > > > The flowtable ct fixup races with conntrack gc collector.
> > > > 
> > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > the closing packets.
> > > > 
> > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > in a TCP state that represent closure?
> > > > 
> > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > >   			goto out;
> > > > 
> > > > this is already the intention in the existing code.
> > > 
> > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > extra IPS_ bit.
> > 
> > You might create a race with ct gc that will remove the ct
> > if it is in close or end of close and before flow offload teardown is running
> > so flow offload teardown might access memory that was freed.
> 
> flow object holds a reference to the ct object until it is released,
> no use-after-free can happen.
> 

Also if nf_ct_delete is called before flowtable delete?
Can you let me know why?

> > It is not a very likely scenario but never the less it might happen now
> > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > 
> > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > 
> > So Oz and myself were debatting about three possible cases/problems:
> > 
> > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > IPS_OFFLOAD is still set but the flow is in teardown
> > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > the CLOSE timeout is reached before the flow offload del
> 
> OK.
> 
> > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > in flow offload teardown/delete even though the state is already
> > CLOSED.
> >
> > Also as a remark we can not assume that the FIN or RST packet is hitting
> > flow table teardown as the packet might get bumped to the slow path in
> > nftables.
> 
> I assume this remark is related to 3.?

Yes, exactly.

> 
> if IPS_OFFLOAD is unset, then conntrack would update the state
> according to this FIN or RST.
> 

It will move to a different TCP state anyway; only the ct status
will still have IPS_OFFLOAD_BIT set, preventing it from being garbage collected.
The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
even though TCP might already be CLOSED.

> Thanks for the summary.
Pablo Neira Ayuso May 16, 2022, 5:50 p.m. UTC | #10
On Mon, May 16, 2022 at 03:02:13PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
[...]
> > > > > [...]
> > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > >  
> > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > 
> > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > 
> > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > 
> > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > the closing packets.
> > > > > 
> > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > in a TCP state that represent closure?
> > > > > 
> > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > >   			goto out;
> > > > > 
> > > > > this is already the intention in the existing code.
> > > > 
> > > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > > extra IPS_ bit.
> > > 
> > > You might create a race with ct gc that will remove the ct
> > > if it is in close or end of close and before flow offload teardown is running
> > > so flow offload teardown might access memory that was freed.
> > 
> > flow object holds a reference to the ct object until it is released,
> > no use-after-free can happen.
> > 
> 
> Also if nf_ct_delete is called before flowtable delete?
> Can you let me know why?

nf_ct_delete() removes the conntrack object from lists and it
decrements the reference counter by one.

flow_offload_free() also calls nf_ct_put(). flow_offload_alloc() bumps
the reference count on the conntrack object before creating the flow.

> > > It is not a very likely scenario but never the less it might happen now
> > > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > > 
> > > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > > 
> > > So Oz and myself were debatting about three possible cases/problems:
> > > 
> > > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > > IPS_OFFLOAD is still set but the flow is in teardown
> > > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > > the CLOSE timeout is reached before the flow offload del
> > 
> > OK.
> > 
> > > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > > in flow offload teardown/delete even though the state is already
> > > CLOSED.
> > >
> > > Also as a remark we can not assume that the FIN or RST packet is hitting
> > > flow table teardown as the packet might get bumped to the slow path in
> > > nftables.
> > 
> > I assume this remark is related to 3.?
> 
> Yes, exactly.
> 
> > if IPS_OFFLOAD is unset, then conntrack would update the state
> > according to this FIN or RST.
> 
> It will move to a different TCP state anyways only the ct state
> will be at IPS_OFFLOAD_BIT and prevent it from beeing garbage collected.
> The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
> even though TCP might already be CLOSED.

If teardown fixes the ct state and timeout to established, and IPS_OFFLOAD is
unset, then the packet is passed up in a consistent state.

I made a patch, it is based on yours, it's attached:

- If flow timeout expires or rst/fin is seen, ct state and timeout is
  fixed up (to established state) and IPS_OFFLOAD is unset.

- If rst/fin packet is seen, ct state and timeout is fixed up (to
  established state) and IPS_OFFLOAD is unset. The packet continues
  its travel up to the classic path, so conntrack triggers the
  transition from established to one of the close states.

For the case 1., IPS_OFFLOAD is not set anymore, so conntrack gc
cannot race to reset the ct timeout anymore.

For the case 2., if gc conntrack ever removes the ct entry, then the
IPS_DYING bit is set, which implicitly triggers the teardown state
from the flowtable gc. The flowtable still holds a reference to the
ct object, so no UAF can happen.

For the case 3. the conntrack is set to ESTABLISHED with a long
timeout, yes. This is to deal with the two possible cases:

a) flowtable timeout expired, so conntrack recovers control on the
   flow.
b) tcp rst/fin will take back the packet to slow path. The ct has been
>    fixed up to established state so it will transition to one of the
   close states.

Am I missing anything?
Pablo Neira Ayuso May 16, 2022, 5:54 p.m. UTC | #11
On Mon, May 16, 2022 at 02:17:00PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 02:06:28PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 01:37:40PM +0200, Pablo Neira Ayuso wrote:
> > > On Mon, May 16, 2022 at 01:18:17PM +0200, Sven Auhagen wrote:
> > > > On Mon, May 16, 2022 at 12:56:38PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > > > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > > > > set the flow table teardown flag. The packet path continues to set lower
> > > > > > timeout value as per the new TCP state but the offload flag remains set.
> > > > > >
> > > > > > Hence, the conntrack garbage collector may race to undo the timeout
> > > > > > adjustment of the packet path, leaving the conntrack entry in place with
> > > > > > the internal offload timeout (one day).
> > > > > >
> > > > > > Avoid ct gc timeout overwrite by flagging torn-down flowtable
> > > > > > connections.
> > > > > >
> > > > > > On the nftables side we only need to allow established TCP connections to
> > > > > > create a flow offload entry. Since we cannot guarantee that
> > > > > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > > > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > > > > and only fixes up established TCP connections.
> > > > > [...]
> > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > >  
> > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > 
> > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > 
> > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > 
> > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > the closing packets.
> > > > > 
> > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > in a TCP state that represent closure?
> > > >
> > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > >   			goto out;
> > > > > 
> > > > > this is already the intention in the existing code.
> > > > > 
> > > > > If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
> > > > > ie. no in uapi? Define it at include/net/netfilter/nf_conntrack.h and
> > > > > add a comment regarding this to avoid an overlap in the future.
> > > > > 
> > > > > > +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> > > > > > +					nf_ct_offload_timeout(tmp);
> > > > > >  				continue;
> > > > > >  			}
> > > > > >  
> > > > > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > > > > > index 3db256da919b..aaed1a244013 100644
> > > > > > --- a/net/netfilter/nf_flow_table_core.c
> > > > > > +++ b/net/netfilter/nf_flow_table_core.c
> > > > > > @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > > > > >  
> > > > > > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > > > > > -{
> > > > > > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > > > > > -	tcp->seen[0].td_maxwin = 0;
> > > > > > -	tcp->seen[1].td_maxwin = 0;
> > > > > > -}
> > > > > >  
> > > > > > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > > >  {
> > > > > >  	struct net *net = nf_ct_net(ct);
> > > > > >  	int l4num = nf_ct_protonum(ct);
> > > > > > @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > >  
> > > > > >  	if (l4num == IPPROTO_TCP) {
> > > > > >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > > > > > +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> > > > > > +
> > > > > > +		tcp->seen[0].td_maxwin = 0;
> > > > > > +		tcp->seen[1].td_maxwin = 0;
> > > > > >  
> > > > > > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > > > > > +		timeout = tn->timeouts[ct->proto.tcp.state];
> > > > > >  		timeout -= tn->offload_timeout;
> > > > > >  	} else if (l4num == IPPROTO_UDP) {
> > > > > >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > > > > > @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > >  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> > > > > >  }
> > > > > >  
> > > > > > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > > > > > -{
> > > > > > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > > > > > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > > > > > -}
> > > > > > -
> > > > > > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > > > -{
> > > > > > -	flow_offload_fixup_ct_state(ct);
> > > > > > -	flow_offload_fixup_ct_timeout(ct);
> > > > > > -}
> > > > > > -
> > > > > >  static void flow_offload_route_release(struct flow_offload *flow)
> > > > > >  {
> > > > > >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > > > > > @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
> > > > > >  static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > > >  			     struct flow_offload *flow)
> > > > > >  {
> > > > > > +	struct nf_conn *ct = flow->ct;
> > > > > > +
> > > > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > > > +
> > > > > >  	rhashtable_remove_fast(&flow_table->rhashtable,
> > > > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
> > > > > >  			       nf_flow_offload_rhash_params);
> > > > > > @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> > > > > >  			       nf_flow_offload_rhash_params);
> > > > > >  
> > > > > > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > > > > > -
> > > > > >  	if (nf_flow_has_expired(flow))
> > > > > > -		flow_offload_fixup_ct(flow->ct);
> > > > > > -	else
> > > > > > -		flow_offload_fixup_ct_timeout(flow->ct);
> > > > > > +		flow_offload_fixup_ct(ct);
> > > > > 
> > > > > Very unlikely, but race might still happen between fixup and
> > > > > clear IPS_OFFLOAD_BIT with gc below?
> > > > > 
> > > > > Without checking from the packet path, the conntrack gc might race to
> > > > > refresh the timeout, I don't see a 100% race free solution.
> > > > > 
> > > > > Probably update the nf_ct_offload_timeout to a shorter value than a
> > > > > day would mitigate this issue too.
> > > > 
> > > > This section of the code is now protected by IPS_OFFLOAD_TEARDOWN_BIT
> > > > which will prevent the update via nf_ct_offload_timeout.
> > > > We set it at the beginning of flow_offload_del and flow_offload_teardown.
> > > > 
> > > > Since flow_offload_teardown is only called on TCP packets
> > > > we also need to set it at flow_offload_del to prevent the race.
> > > > 
> > > > This should prevent the race at this point.
> > > 
> > > OK.
> > > 
> > > > > > +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> > > > > > +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
> > > > > >  
> > > > > >  	flow_offload_free(flow);
> > > > > >  }
> > > > > > @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > > >  void flow_offload_teardown(struct flow_offload *flow)
> > > > > >  {
> > > > > >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > > > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > > >  
> > > > > > -	flow_offload_fixup_ct_state(flow->ct);
> > > > > > +	flow_offload_fixup_ct(flow->ct);
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > > > > >  
> > > > > > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > > > > > index 900d48c810a1..9cc3ea08eb3a 100644
> > > > > > --- a/net/netfilter/nft_flow_offload.c
> > > > > > +++ b/net/netfilter/nft_flow_offload.c
> > > > > > @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> > > > > >  					  sizeof(_tcph), &_tcph);
> > > > > >  		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > > >  			goto out;
> > > > > > +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> > > > > > +			goto out;
> > > > > 
> > > > > This chunk is not required, from ruleset users can do
> > > > > 
> > > > >         ... ct status assured ...
> > > > > 
> > > > > instead.
> > > > 
> > > > Maybe this should be mentioned in the manual or wiki if
> > > > it is not necessary in the flow offload code.
> > > 
> > > Yes, documentation and wiki can be updated.
> > > 
> > > Users might want to offload the flow at a later stage in the TCP
> > > connection.
> > 
> > Well, actually there is not later stage than established, anything
> > after established are TCP teardown states.
> > 
> > What's the issue with allowing to offload from SYN_RECV state?
> 
> There were multiple problem in general with the code.
> flow_offload_fixup_tcp always moves a TCP connection
> to established even if it is in FIN or CLOSE.
> 
> The flowoffload_del function was always setting the TCP timeout
> to ESTABLISHED timeout even when the state was in CLOSE and therefore
> creating a very long lasting dead state.
> 
> Since we might miss or bump packets to slow path, we do not know
> what will happen there when we are still in SYN_RECV.

Right.

> We will have a better knowledge of the TCP state when we are in 
> established first and we know that we are either still in it or
> we have moved past it to a closing state.

It makes sense to restrict this to TCP established only.

Oz and Paul already do this for tc-ct.

Thanks for explaining.
Sven Auhagen May 16, 2022, 6:23 p.m. UTC | #12
On Mon, May 16, 2022 at 07:50:09PM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 03:02:13PM +0200, Sven Auhagen wrote:
> > On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> > > On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > > > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> [...]
> > > > > > [...]
> > > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > > >  
> > > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > > 
> > > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > > 
> > > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > > 
> > > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > > the closing packets.
> > > > > > 
> > > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > > in a TCP state that represent closure?
> > > > > > 
> > > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > > >   			goto out;
> > > > > > 
> > > > > > this is already the intention in the existing code.
> > > > > 
> > > > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > > > extra IPS_ bit.
> > > > 
> > > > You might create a race with ct gc that will remove the ct
> > > > if it is in close or end of close and before flow offload teardown is running
> > > > so flow offload teardown might access memory that was freed.
> > > 
> > > flow object holds a reference to the ct object until it is released,
> > > no use-after-free can happen.
> > > 
> > 
> > Also if nf_ct_delete is called before flowtable delete?
> > Can you let me know why?
> 
> nf_ct_delete() removes the conntrack object from lists and it
> decrements the reference counter by one.
> 
> flow_offload_free() also calls nf_ct_put(). flow_offload_alloc() bumps
> the reference count on the conntrack object before creating the flow.
> 
> > > > It is not a very likely scenario but never the less it might happen now
> > > > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > > > 
> > > > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > > > 
> > > > So Oz and I were debating about three possible cases/problems:
> > > > 
> > > > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > > > IPS_OFFLOAD is still set but the flow is in teardown
> > > > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > > > the CLOSE timeout is reached before the flow offload del
> > > 
> > > OK.
> > > 
> > > > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > > > in flow offload teardown/delete even though the state is already
> > > > CLOSED.
> > > >
> > > > Also as a remark we can not assume that the FIN or RST packet is hitting
> > > > flow table teardown as the packet might get bumped to the slow path in
> > > > nftables.
> > > 
> > > I assume this remark is related to 3.?
> > 
> > Yes, exactly.
> > 
> > > if IPS_OFFLOAD is unset, then conntrack would update the state
> > > according to this FIN or RST.
> > 
> > It will move to a different TCP state anyways only the ct state
> > will be at IPS_OFFLOAD_BIT and prevent it from being garbage collected.
> > The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
> > even though TCP might already be CLOSED.

I see what you are trying to do here, I have some remarks:

> 
> If teardown fixes the ct state and timeout to established, and IPS_OFFLOAD is
> unset, then the packet is passed up in a consistent state.
> 
> I made a patch, it is based on yours, it's attached:
> 
> - If flow timeout expires or rst/fin is seen, ct state and timeout is
>   fixed up (to established state) and IPS_OFFLOAD is unset.
> 
> - If rst/fin packet is seen, ct state and timeout is fixed up (to
>   established state) and IPS_OFFLOAD is unset. The packet continues
>   its travel up to the classic path, so conntrack triggers the
>   transition from established to one of the close states.
> 
> For the case 1., IPS_OFFLOAD is not set anymore, so conntrack gc
> cannot race to reset the ct timeout anymore.
> 
> For the case 2., if gc conntrack ever removes the ct entry, then the
> IPS_DYING bit is set, which implicitly triggers the teardown state
> from the flowtable gc. The flowtable still holds a reference to the
> ct object, so no UAF can happen.
> 
> For the case 3. the conntrack is set to ESTABLISHED with a long
> timeout, yes. This is to deal with the two possible cases:
> 
> a) flowtable timeout expired, so conntrack recovers control on the
>    flow.
> b) tcp rst/fin will take back the packet to slow path. The ct has been
>    fixed up to established state so it will transition to one of the
>    close states.
> 
> Am I missing anything?

You should not fixup the tcp state back to established.
If flow_offload_teardown is not called because a packet got bumped up to the slow path
and you call flow_offload_teardown from nf_flow_offload_gc_step, the tcp state might already
be in CLOSE state and you just moved it back to established.
The entire function flow_offload_fixup_tcp can go away if we only allow established tcp states
in the flowtable.

Same goes for the timeout. The timeout should really be set to the current tcp state
ct->proto.tcp->state which might not be established anymore.

For me the question remains, why can the ct gc not remove the ct when nf_ct_delete
is called before flow_offload_del is called?

Also you probably want to move the IPS_OFFLOAD_BIT to the beginning of
flow_offload_teardown just to make sure that the ct gc is not bumping up the ct timeout
while it is changed in flow_offload_fixup_ct.
Pablo Neira Ayuso May 17, 2022, 8:32 a.m. UTC | #13
On Mon, May 16, 2022 at 08:23:10PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 07:50:09PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 03:02:13PM +0200, Sven Auhagen wrote:
> > > On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> > > > On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > > > > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > > > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > [...]
> > > > > > > [...]
> > > > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > > > >
> > > > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > > >
> > > > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > > >
> > > > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > > >
> > > > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > > > the closing packets.
> > > > > > >
> > > > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > > > in a TCP state that represent closure?
> > > > > > >
> > > > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > > > >   			goto out;
> > > > > > >
> > > > > > > this is already the intention in the existing code.
> > > > > >
> > > > > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > > > > extra IPS_ bit.
> > > > >
> > > > > You might create a race with ct gc that will remove the ct
> > > > > if it is in close or end of close and before flow offload teardown is running
> > > > > so flow offload teardown might access memory that was freed.
> > > >
> > > > flow object holds a reference to the ct object until it is released,
> > > > no use-after-free can happen.
> > > >
> > >
> > > Also if nf_ct_delete is called before flowtable delete?
> > > Can you let me know why?
> >
> > nf_ct_delete() removes the conntrack object from lists and it
> > decrements the reference counter by one.
> >
> > flow_offload_free() also calls nf_ct_put(). flow_offload_alloc() bumps
> > the reference count on the conntrack object before creating the flow.
> >
> > > > > It is not a very likely scenario but never the less it might happen now
> > > > > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > > > >
> > > > > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > > > >
> > > > > So Oz and I were debating about three possible cases/problems:
> > > > >
> > > > > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > > > > IPS_OFFLOAD is still set but the flow is in teardown
> > > > > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > > > > the CLOSE timeout is reached before the flow offload del
> > > >
> > > > OK.
> > > >
> > > > > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > > > > in flow offload teardown/delete even though the state is already
> > > > > CLOSED.
> > > > >
> > > > > Also as a remark we can not assume that the FIN or RST packet is hitting
> > > > > flow table teardown as the packet might get bumped to the slow path in
> > > > > nftables.
> > > >
> > > > I assume this remark is related to 3.?
> > >
> > > Yes, exactly.
> > >
> > > > if IPS_OFFLOAD is unset, then conntrack would update the state
> > > > according to this FIN or RST.
> > >
> > > It will move to a different TCP state anyways only the ct state
> > > will be at IPS_OFFLOAD_BIT and prevent it from being garbage collected.
> > > The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
> > > even though TCP might already be CLOSED.
>
> I see what you are trying to do here, I have some remarks:
>
> >
> > If teardown fixes the ct state and timeout to established, and IPS_OFFLOAD is
> > unset, then the packet is passed up in a consistent state.
> >
> > I made a patch, it is based on yours, it's attached:
> >
> > - If flow timeout expires or rst/fin is seen, ct state and timeout is
> >   fixed up (to established state) and IPS_OFFLOAD is unset.
> >
> > - If rst/fin packet is seen, ct state and timeout is fixed up (to
> >   established state) and IPS_OFFLOAD is unset. The packet continues
> >   its travel up to the classic path, so conntrack triggers the
> >   transition from established to one of the close states.
> >
> > For the case 1., IPS_OFFLOAD is not set anymore, so conntrack gc
> > cannot race to reset the ct timeout anymore.
> >
> > For the case 2., if gc conntrack ever removes the ct entry, then the
> > IPS_DYING bit is set, which implicitly triggers the teardown state
> > from the flowtable gc. The flowtable still holds a reference to the
> > ct object, so no UAF can happen.
> >
> > For the case 3. the conntrack is set to ESTABLISHED with a long
> > timeout, yes. This is to deal with the two possible cases:
> >
> > a) flowtable timeout expired, so conntrack recovers control on the
> >    flow.
> > b) tcp rst/fin will take back the packet to slow path. The ct has been
> >    fixed up to established state so it will transition to one of the
> >    close states.
> >
> > Am I missing anything?
>
> You should not fixup the tcp state back to established.
> If flow_offload_teardown is not called because a packet got bumped up to the slow path
> and you call flow_offload_teardown from nf_flow_offload_gc_step, the tcp state might already
> be in CLOSE state and you just moved it back to established.

OK.

> The entire function flow_offload_fixup_tcp can go away if we only allow established tcp states
> in the flowtable.

I'm keeping it, but I remove the reset of the tcp state.

> Same goes for the timeout. The timeout should really be set to the current tcp state
> ct->proto.tcp->state which might not be established anymore.

OK.

> For me the question remains, why can the ct gc not remove the ct when nf_ct_delete
> is called before flow_offload_del is called?

nf_ct_delete() removes indeed the entry from the conntrack table, then
it calls nf_ct_put() which decrements the refcnt. Given that the
flowtable holds a reference to the conntrack object...

 struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
 {
        struct flow_offload *flow;

        if (unlikely(nf_ct_is_dying(ct) ||
            !refcount_inc_not_zero(&ct->ct_general.use)))
                return NULL;

... use-after-free cannot happen. Note that flow_offload_free() calls
nf_ct_put(flow->ct), so at this point the ct object is released.

Is this your concern?

> Also you probably want to move the IPS_OFFLOAD_BIT to the beginning of
> flow_offload_teardown just to make sure that the ct gc is not bumping up the ct timeout
> while it is changed in flow_offload_fixup_ct.

Done.

See patch attached.
>
>
Sven Auhagen May 17, 2022, 8:36 a.m. UTC | #14
On Tue, May 17, 2022 at 10:32:03AM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 08:23:10PM +0200, Sven Auhagen wrote:
> > On Mon, May 16, 2022 at 07:50:09PM +0200, Pablo Neira Ayuso wrote:
> > > On Mon, May 16, 2022 at 03:02:13PM +0200, Sven Auhagen wrote:
> > > > On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > > > > > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > > > > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > [...]
> > > > > > > > [...]
> > > > > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > > > > >
> > > > > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > > > >
> > > > > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > > > >
> > > > > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > > > >
> > > > > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > > > > the closing packets.
> > > > > > > >
> > > > > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > > > > in a TCP state that represent closure?
> > > > > > > >
> > > > > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > > > > >   			goto out;
> > > > > > > >
> > > > > > > > this is already the intention in the existing code.
> > > > > > >
> > > > > > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > > > > > extra IPS_ bit.
> > > > > >
> > > > > > You might create a race with ct gc that will remove the ct
> > > > > > if it is in close or end of close and before flow offload teardown is running
> > > > > > so flow offload teardown might access memory that was freed.
> > > > >
> > > > > flow object holds a reference to the ct object until it is released,
> > > > > no use-after-free can happen.
> > > > >
> > > >
> > > > Also if nf_ct_delete is called before flowtable delete?
> > > > Can you let me know why?
> > >
> > > nf_ct_delete() removes the conntrack object from lists and it
> > > decrements the reference counter by one.
> > >
> > > flow_offload_free() also calls nf_ct_put(). flow_offload_alloc() bumps
> > > the reference count on the conntrack object before creating the flow.
> > >
> > > > > > It is not a very likely scenario but never the less it might happen now
> > > > > > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > > > > >
> > > > > > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > > > > >
> > > > > > So Oz and I were debating about three possible cases/problems:
> > > > > >
> > > > > > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > > > > > IPS_OFFLOAD is still set but the flow is in teardown
> > > > > > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > > > > > the CLOSE timeout is reached before the flow offload del
> > > > >
> > > > > OK.
> > > > >
> > > > > > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > > > > > in flow offload teardown/delete even though the state is already
> > > > > > CLOSED.
> > > > > >
> > > > > > Also as a remark we can not assume that the FIN or RST packet is hitting
> > > > > > flow table teardown as the packet might get bumped to the slow path in
> > > > > > nftables.
> > > > >
> > > > > I assume this remark is related to 3.?
> > > >
> > > > Yes, exactly.
> > > >
> > > > > if IPS_OFFLOAD is unset, then conntrack would update the state
> > > > > according to this FIN or RST.
> > > >
> > > > It will move to a different TCP state anyways only the ct state
> > > > will be at IPS_OFFLOAD_BIT and prevent it from being garbage collected.
> > > > The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
> > > > even though TCP might already be CLOSED.
> >
> > I see what you are trying to do here, I have some remarks:
> >
> > >
> > > If teardown fixes the ct state and timeout to established, and IPS_OFFLOAD is
> > > unset, then the packet is passed up in a consistent state.
> > >
> > > I made a patch, it is based on yours, it's attached:
> > >
> > > - If flow timeout expires or rst/fin is seen, ct state and timeout is
> > >   fixed up (to established state) and IPS_OFFLOAD is unset.
> > >
> > > - If rst/fin packet is seen, ct state and timeout is fixed up (to
> > >   established state) and IPS_OFFLOAD is unset. The packet continues
> > >   its travel up to the classic path, so conntrack triggers the
> > >   transition from established to one of the close states.
> > >
> > > For the case 1., IPS_OFFLOAD is not set anymore, so conntrack gc
> > > cannot race to reset the ct timeout anymore.
> > >
> > > For the case 2., if gc conntrack ever removes the ct entry, then the
> > > IPS_DYING bit is set, which implicitly triggers the teardown state
> > > from the flowtable gc. The flowtable still holds a reference to the
> > > ct object, so no UAF can happen.
> > >
> > > For the case 3. the conntrack is set to ESTABLISHED with a long
> > > timeout, yes. This is to deal with the two possible cases:
> > >
> > > a) flowtable timeout expired, so conntrack recovers control on the
> > >    flow.
> > > b) tcp rst/fin will take back the packet to slow path. The ct has been
> > >    fixed up to established state so it will transition to one of the
> > >    close states.
> > >
> > > Am I missing anything?
> >
> > You should not fixup the tcp state back to established.
> > If flow_offload_teardown is not called because a packet got bumped up to the slow path
> > and you call flow_offload_teardown from nf_flow_offload_gc_step, the tcp state might already
> > be in CLOSE state and you just moved it back to established.
> 
> OK.
> 
> > The entire function flow_offload_fixup_tcp can go away if we only allow established tcp states
> > in the flowtable.
> 
> I'm keeping it, but I remove the reset of the tcp state.
> 
> > Same goes for the timeout. The timeout should really be set to the current tcp state
> > ct->proto.tcp->state which might not be established anymore.
> 
> OK.
> 
> > For me the question remains, why can the ct gc not remove the ct when nf_ct_delete
> > is called before flow_offload_del is called?
> 
> nf_ct_delete() removes indeed the entry from the conntrack table, then
> it calls nf_ct_put() which decrements the refcnt. Given that the
> flowtable holds a reference to the conntrack object...
> 
>  struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
>  {
>         struct flow_offload *flow;
> 
>         if (unlikely(nf_ct_is_dying(ct) ||
>             !refcount_inc_not_zero(&ct->ct_general.use)))
>                 return NULL;
> 
> ... use-after-free cannot happen. Note that flow_offload_free() calls
> nf_ct_put(flow->ct), so at this point the ct object is released.
> 
> Is this your concern?

Ah yes, thank you.
I did not catch the refcount_inc_not_zero call.

> 
> > Also you probably want to move the IPS_OFFLOAD_BIT to the beginning of
> > flow_offload_teardown just to make sure that the ct gc is not bumping up the ct timeout
> > while it is changed in flow_offload_fixup_ct.
> 
> Done.
> 
> See patch attached.
> >
> >

The patch looks good to me, one remark.

This has to be

-		if (unlikely(!tcph || tcph->fin || tcph->rst))
+		if (unlikely(!tcph || tcph->fin || tcph->rst ||
+			     !nf_conntrack_tcp_established(&ct->proto.tcp)))
 			goto out;

You are currently go to out if the tcp state is established but you
want the opposite, not established.

I think this will cover all cases.

Best
Sven
diff mbox series

Patch

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 26071021e986..bb06202a4965 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -118,6 +118,10 @@  enum ip_conntrack_status {
 	IPS_HW_OFFLOAD_BIT = 15,
 	IPS_HW_OFFLOAD = (1 << IPS_HW_OFFLOAD_BIT),
 
+	/* offloaded conntrack entry is marked for deletion. */
+	IPS_OFFLOAD_TEARDOWN_BIT = 16,
+	IPS_OFFLOAD_TEARDOWN = (1 << IPS_OFFLOAD_TEARDOWN_BIT),
+
 	/* Be careful here, modifying these bits can make things messy,
 	 * so don't let users modify them directly.
 	 */
@@ -126,7 +130,7 @@  enum ip_conntrack_status {
 				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED |
 				 IPS_OFFLOAD | IPS_HW_OFFLOAD),
 
-	__IPS_MAX_BIT = 16,
+	__IPS_MAX_BIT = 17,
 };
 
 /* Connection tracking event types */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0164e5f522e8..324fdb62c08b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1477,7 +1477,8 @@  static void gc_worker(struct work_struct *work)
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
 			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
-				nf_ct_offload_timeout(tmp);
+				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
+					nf_ct_offload_timeout(tmp);
 				continue;
 			}
 
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 3db256da919b..aaed1a244013 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -177,14 +177,8 @@  int flow_offload_route_init(struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(flow_offload_route_init);
 
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
-{
-	tcp->state = TCP_CONNTRACK_ESTABLISHED;
-	tcp->seen[0].td_maxwin = 0;
-	tcp->seen[1].td_maxwin = 0;
-}
 
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	int l4num = nf_ct_protonum(ct);
@@ -192,8 +186,12 @@  static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 
 	if (l4num == IPPROTO_TCP) {
 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
+		struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+		tcp->seen[0].td_maxwin = 0;
+		tcp->seen[1].td_maxwin = 0;
 
-		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+		timeout = tn->timeouts[ct->proto.tcp.state];
 		timeout -= tn->offload_timeout;
 	} else if (l4num == IPPROTO_UDP) {
 		struct nf_udp_net *tn = nf_udp_pernet(net);
@@ -211,18 +209,6 @@  static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
 }
 
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
-	if (nf_ct_protonum(ct) == IPPROTO_TCP)
-		flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
-	flow_offload_fixup_ct_state(ct);
-	flow_offload_fixup_ct_timeout(ct);
-}
-
 static void flow_offload_route_release(struct flow_offload *flow)
 {
 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -353,6 +339,10 @@  static inline bool nf_flow_has_expired(const struct flow_offload *flow)
 static void flow_offload_del(struct nf_flowtable *flow_table,
 			     struct flow_offload *flow)
 {
+	struct nf_conn *ct = flow->ct;
+
+	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
+
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
 			       nf_flow_offload_rhash_params);
@@ -360,12 +350,11 @@  static void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       nf_flow_offload_rhash_params);
 
-	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
-
 	if (nf_flow_has_expired(flow))
-		flow_offload_fixup_ct(flow->ct);
-	else
-		flow_offload_fixup_ct_timeout(flow->ct);
+		flow_offload_fixup_ct(ct);
+
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
 
 	flow_offload_free(flow);
 }
@@ -373,8 +362,9 @@  static void flow_offload_del(struct nf_flowtable *flow_table,
 void flow_offload_teardown(struct flow_offload *flow)
 {
 	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
+	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
 
-	flow_offload_fixup_ct_state(flow->ct);
+	flow_offload_fixup_ct(flow->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 900d48c810a1..9cc3ea08eb3a 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -295,6 +295,8 @@  static void nft_flow_offload_eval(const struct nft_expr *expr,
 					  sizeof(_tcph), &_tcph);
 		if (unlikely(!tcph || tcph->fin || tcph->rst))
 			goto out;
+		if (unlikely(!nf_conntrack_tcp_established(ct)))
+			goto out;
 		break;
 	case IPPROTO_UDP:
 		break;