
[ipsec-next,v5,09/17] xfrm: iptfs: add user packet (tunnel ingress) handling

Message ID 20240714202246.1573817-10-chopps@chopps.org (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Series: Add IP-TFS mode to xfrm

Checks

Context: netdev/tree_selection
Check: success
Description: Guessing tree name failed - patch did not apply

Commit Message

Christian Hopps July 14, 2024, 8:22 p.m. UTC
From: Christian Hopps <chopps@labn.net>

Add tunnel packet output functionality. This code handles
the ingress to the tunnel.

Signed-off-by: Christian Hopps <chopps@labn.net>
---
 net/xfrm/xfrm_iptfs.c | 535 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 532 insertions(+), 3 deletions(-)

Comments

Simon Horman July 15, 2024, 12:55 p.m. UTC | #1
On Sun, Jul 14, 2024 at 04:22:37PM -0400, Christian Hopps wrote:
> From: Christian Hopps <chopps@labn.net>
> 
> Add tunnel packet output functionality. This code handles
> the ingress to the tunnel.
> 
> Signed-off-by: Christian Hopps <chopps@labn.net>
> ---
>  net/xfrm/xfrm_iptfs.c | 535 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 532 insertions(+), 3 deletions(-)
> 
> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c

...

> +static int iptfs_get_cur_pmtu(struct xfrm_state *x,
> +			      struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
> +{
> +	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
> +	u32 payload_mtu = xtfs->payload_mtu;
> +	u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);

Hi Christian,

Please consider arranging local variable declarations in Networking
code in reverse xmas tree order - longest line to shortest.
I think that in this case that would involve separating the
declaration and assignment of pmtu.
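
For illustration, that reordering might look like this (a sketch;
longest line first, with pmtu assigned separately):

	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
	u32 payload_mtu = xtfs->payload_mtu;
	u32 pmtu;

	pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);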

Edward Cree's tool can be helpful here:
https://github.com/ecree-solarflare/xmastree

> +
> +	if (payload_mtu && payload_mtu < pmtu)
> +		pmtu = payload_mtu;
> +
> +	return pmtu;
> +}

...

> +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
> + * (i.e., aggregating or fragmenting as appropriate).
> + * This is set in dst->output for an SA.
> + */
> +static int iptfs_output_collect(struct net *net, struct sock *sk,
> +				struct sk_buff *skb)
> +{
> +	struct dst_entry *dst = skb_dst(skb);
> +	struct xfrm_state *x = dst->xfrm;
> +	struct xfrm_iptfs_data *xtfs = x->mode_data;
> +	struct sk_buff *segs, *nskb;
> +	u32 pmtu = 0;
> +	bool ok = true;
> +	bool was_gso;
> +
> +	/* We have hooked into dst_entry->output which means we have skipped the
> +	 * protocol specific netfilter (see xfrm4_output, xfrm6_output).
> +	 * When our timer runs we will end up calling xfrm_output directly on
> +	 * the encapsulated traffic.
> +	 *
> +	 * For both cases this is the NF_INET_POST_ROUTING hook which allows
> +	 * changing the skb->dst entry which then may not be xfrm based anymore
> +	 * in which case a REROUTED flag is set, and dst_output is called.
> +	 *
> +	 * For IPv6 we are also skipping fragmentation handling for local
> +	 * sockets, which may or may not be good depending on our tunnel DF
> +	 * setting. Normally with fragmentation supported we want to skip this
> +	 * fragmentation.
> +	 */
> +
> +	BUG_ON(!xtfs);
> +
> +	pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
> +
> +	/* Break apart GSO skbs. If the queue is nearing full then we want the
> +	 * accounting and queuing to be based on the individual packets not on the
> +	 * aggregate GSO buffer.
> +	 */
> +	was_gso = skb_is_gso(skb);
> +	if (!was_gso) {
> +		segs = skb;
> +	} else {
> +		segs = skb_gso_segment(skb, 0);
> +		if (IS_ERR_OR_NULL(segs)) {
> +			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
> +			kfree_skb(skb);
> +			return PTR_ERR(segs);

This will return 0 if skb_gso_segment returns NULL,
which occurs if skb doesn't require segmentation.
Is that intentional?

If so, I wonder if it would be slightly nicer
to use PTR_ERR_OR_ZERO() instead of PTR_ERR().
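
For example, a sketch of that alternative (assuming the 0 return for
NULL is indeed intended):

	segs = skb_gso_segment(skb, 0);
	if (IS_ERR_OR_NULL(segs)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
		kfree_skb(skb);
		/* PTR_ERR_OR_ZERO() gives the errno for an ERR_PTR, 0 for NULL */
		return PTR_ERR_OR_ZERO(segs);
	}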

Flagged by Smatch (suggestion is my own).
> +		}
> +		consume_skb(skb);
> +		skb = NULL;
> +	}
> +
> +	/* We can be running on multiple cores and from the network softirq or
> +	 * from user context depending on where the packet is coming from.
> +	 */
> +	spin_lock_bh(&x->lock);
> +
> +	skb_list_walk_safe(segs, skb, nskb)
> +	{
> +		skb_mark_not_on_list(skb);
> +
> +		/* Once we drop due to no queue space we continue to drop the
> +		 * rest of the packets from that GRO.
> +		 */
> +		if (!ok) {
> +nospace:
> +			if (skb->dev)
> +				XFRM_INC_STATS(dev_net(skb->dev),
> +					       LINUX_MIB_XFRMOUTNOQSPACE);
> +			kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
> +			continue;
> +		}
> +
> +		/* Fragmenting handled in following commits. */
> +		if (iptfs_is_too_big(sk, skb, pmtu)) {
> +			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
> +			continue;
> +		}
> +
> +		/* Enqueue to send in tunnel */
> +		ok = iptfs_enqueue(xtfs, skb);
> +		if (!ok)
> +			goto nospace;
> +	}
> +
> +	/* Start a delay timer if we don't have one yet */
> +	if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
> +		hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns,
> +			      IPTFS_HRTIMER_MODE);
> +		xtfs->iptfs_settime = ktime_get_raw_fast_ns();
> +	}
> +
> +	spin_unlock_bh(&x->lock);
> +	return 0;
> +}

...

> +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
> +{
> +	struct sk_buff_head list;
> +	struct xfrm_iptfs_data *xtfs;
> +	struct xfrm_state *x;
> +	time64_t settime;
> +
> +	xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
> +	x = xtfs->x;
> +
> +	/* Process all the queued packets
> +	 *
> +	 * softirq execution order: timer > tasklet > hrtimer
> +	 *
> +	 * Network rx will have run before us giving one last chance to queue
> +	 * ingress packets for us to process and transmit.
> +	 */
> +
> +	spin_lock(&x->lock);
> +	__skb_queue_head_init(&list);
> +	skb_queue_splice_init(&xtfs->queue, &list);
> +	xtfs->queue_size = 0;
> +	settime = xtfs->iptfs_settime;

nit: settime is set but otherwise unused in this function.

     Flagged by W=1 x86_64 allmodconfig builds with gcc-14 and clang-18.

> +	spin_unlock(&x->lock);
> +
> +	/* After the above unlock, packets can begin queuing again, and the
> +	 * timer can be set again, from another CPU either in softirq or user
> +	 * context (not from this one since we are running at softirq level
> +	 * already).
> +	 */
> +
> +	iptfs_output_queued(x, &list);
> +
> +	return HRTIMER_NORESTART;
> +}

...

> @@ -98,10 +607,23 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
>  {
>  	struct xfrm_iptfs_data *xtfs = x->mode_data;
>  	struct xfrm_iptfs_config *xc = &xtfs->cfg;
> -	int ret = 0;
> +	int ret;
> +	u64 q;
> +
> +	if (x->dir == XFRM_SA_DIR_OUT) {
> +		q = xtfs->init_delay_ns;
> +		(void)do_div(q, NSECS_IN_USEC);
> +		ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
> +		if (ret)
> +			return ret;
> +
> +		ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE,
> +				  xc->max_queue_size);
> +		if (ret)
> +			return ret;
>  
> -	if (x->dir == XFRM_SA_DIR_OUT)
>  		ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
> +	}

ret will be used uninitialised here unless the if condition above is true.

Flagged by W=1 x86_64 allmodconfig build with clang-18, and Smatch.
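
One way to avoid the uninitialised read is an early return for the
non-output direction; a sketch only, not necessarily the fix the
author will choose:

	if (x->dir != XFRM_SA_DIR_OUT)
		return 0;

	q = xtfs->init_delay_ns;
	(void)do_div(q, NSECS_IN_USEC);
	ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
	if (ret)
		return ret;

	ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size);
	if (ret)
		return ret;

	return nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);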

>  
>  	return ret;
>  }
...
kernel test robot July 16, 2024, 2 p.m. UTC | #2
Hi Christian,

kernel test robot noticed the following build errors:

[auto build test ERROR on klassert-ipsec-next/master]
[also build test ERROR on next-20240716]
[cannot apply to klassert-ipsec/master netfilter-nf/main linus/master nf-next/master v6.10]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Christian-Hopps/xfrm-config-add-CONFIG_XFRM_IPTFS/20240715-042948
base:   https://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master
patch link:    https://lore.kernel.org/r/20240714202246.1573817-10-chopps%40chopps.org
patch subject: [PATCH ipsec-next v5 09/17] xfrm: iptfs: add user packet (tunnel ingress) handling
config: i386-randconfig-061-20240716 (https://download.01.org/0day-ci/archive/20240716/202407162110.8Xuwy6GR-lkp@intel.com/config)
compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240716/202407162110.8Xuwy6GR-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407162110.8Xuwy6GR-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: net/xfrm/xfrm_iptfs.o: in function `iptfs_user_init':
>> net/xfrm/xfrm_iptfs.c:586:(.text+0x2a0): undefined reference to `__udivdi3'


vim +586 net/xfrm/xfrm_iptfs.c

   547	
   548	/**
   549	 * iptfs_user_init() - initialize the SA with IPTFS options from netlink.
   550	 * @net: the net data
   551	 * @x: xfrm state
   552	 * @attrs: netlink attributes
   553	 * @extack: extack return data
   554	 */
   555	static int iptfs_user_init(struct net *net, struct xfrm_state *x,
   556				   struct nlattr **attrs,
   557				   struct netlink_ext_ack *extack)
   558	{
   559		struct xfrm_iptfs_data *xtfs = x->mode_data;
   560		struct xfrm_iptfs_config *xc;
   561	
   562		xc = &xtfs->cfg;
   563		xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize;
   564		xtfs->init_delay_ns =
   565			(u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC;
   566	
   567		if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
   568			xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
   569			if (!xc->pkt_size) {
   570				xtfs->payload_mtu = 0;
   571			} else if (xc->pkt_size > x->props.header_len) {
   572				xtfs->payload_mtu = xc->pkt_size - x->props.header_len;
   573			} else {
   574				NL_SET_ERR_MSG(extack,
   575					       "Packet size must be 0 or greater than IPTFS/ESP header length");
   576				return -EINVAL;
   577			}
   578		}
   579		if (attrs[XFRMA_IPTFS_MAX_QSIZE])
   580			xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
   581		if (attrs[XFRMA_IPTFS_INIT_DELAY])
   582			xtfs->init_delay_ns =
   583				(u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) *
   584				NSECS_IN_USEC;
   585	
 > 586		xtfs->ecn_queue_size = (u64)xc->max_queue_size * 95 / 100;
   587	
   588		return 0;
   589	}
   590
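
The undefined __udivdi3 is the libgcc helper gcc emits for a 64-bit
division on 32-bit x86; the kernel does not link libgcc, so 64-bit
divides must go through the kernel's division helpers instead. A
sketch of the usual idiom for line 586 (not necessarily the fix taken
in the next revision):

	u64 q = (u64)xc->max_queue_size * 95;

	do_div(q, 100);	/* 64-bit divide without emitting __udivdi3 */
	xtfs->ecn_queue_size = q;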
Christian Hopps July 18, 2024, 5:35 a.m. UTC | #3
Simon Horman via Devel <devel@linux-ipsec.org> writes:

> On Sun, Jul 14, 2024 at 04:22:37PM -0400, Christian Hopps wrote:
>> From: Christian Hopps <chopps@labn.net>
>>
>> Add tunnel packet output functionality. This code handles
>> the ingress to the tunnel.
>>
>> Signed-off-by: Christian Hopps <chopps@labn.net>
>> ---
>>  net/xfrm/xfrm_iptfs.c | 535 +++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 532 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
>
> ...
>
>> +static int iptfs_get_cur_pmtu(struct xfrm_state *x,
>> +			      struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
>> +{
>> +	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
>> +	u32 payload_mtu = xtfs->payload_mtu;
>> +	u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
>
> Hi Christian,
>
> Please consider arranging local variable declarations in Networking
> code in reverse xmas tree order - longest line to shortest.
> I think that in this case that would involve separating the
> declaration and assignment of pmtu.
>
> Edward Cree's tool can be helpful here:
> https://github.com/ecree-solarflare/xmastree

This does not appear to be a style that is required by the net/xfrm code. I verified this by running the above tool on the other files in net/xfrm/*.c. In this case I'd prefer to not increase the number of lines in the function in order to satisfy the optional style guideline.

>> +
>> +	if (payload_mtu && payload_mtu < pmtu)
>> +		pmtu = payload_mtu;
>> +
>> +	return pmtu;
>> +}
>
> ...
>
>> +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
>> + * (i.e., aggregating or fragmenting as appropriate).
>> + * This is set in dst->output for an SA.
>> + */
>> +static int iptfs_output_collect(struct net *net, struct sock *sk,
>> +				struct sk_buff *skb)
>> +{
>> +	struct dst_entry *dst = skb_dst(skb);
>> +	struct xfrm_state *x = dst->xfrm;
>> +	struct xfrm_iptfs_data *xtfs = x->mode_data;
>> +	struct sk_buff *segs, *nskb;
>> +	u32 pmtu = 0;
>> +	bool ok = true;
>> +	bool was_gso;
>> +
>> +	/* We have hooked into dst_entry->output which means we have skipped the
>> +	 * protocol specific netfilter (see xfrm4_output, xfrm6_output).
>> +	 * When our timer runs we will end up calling xfrm_output directly on
>> +	 * the encapsulated traffic.
>> +	 *
>> +	 * For both cases this is the NF_INET_POST_ROUTING hook which allows
>> +	 * changing the skb->dst entry which then may not be xfrm based anymore
>> +	 * in which case a REROUTED flag is set, and dst_output is called.
>> +	 *
>> +	 * For IPv6 we are also skipping fragmentation handling for local
>> +	 * sockets, which may or may not be good depending on our tunnel DF
>> +	 * setting. Normally with fragmentation supported we want to skip this
>> +	 * fragmentation.
>> +	 */
>> +
>> +	BUG_ON(!xtfs);
>> +
>> +	pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
>> +
>> +	/* Break apart GSO skbs. If the queue is nearing full then we want the
>> +	 * accounting and queuing to be based on the individual packets not on the
>> +	 * aggregate GSO buffer.
>> +	 */
>> +	was_gso = skb_is_gso(skb);
>> +	if (!was_gso) {
>> +		segs = skb;
>> +	} else {
>> +		segs = skb_gso_segment(skb, 0);
>> +		if (IS_ERR_OR_NULL(segs)) {
>> +			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
>> +			kfree_skb(skb);
>> +			return PTR_ERR(segs);
>
> This will return 0 if skb_gso_segment returns NULL,
> which occurs if skb doesn't require segmentation.
> Is that intentional?
>
> If so, I wonder if it would be slightly nicer
> to use PTR_ERR_OR_ZERO() instead of PTR_ERR().
>
> Flagged by Smatch (suggestion is my own).

Thanks! Actually in the case of NULL I've changed the code to return -EINVAL as xfrm_output_gso() does.
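
Presumably something along these lines, mirroring the NULL handling in
xfrm_output_gso() (a sketch of the described change, not the posted
v6 code):

	segs = skb_gso_segment(skb, 0);
	if (IS_ERR_OR_NULL(segs)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
		kfree_skb(skb);
		if (IS_ERR(segs))
			return PTR_ERR(segs);
		return -EINVAL;
	}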

>> +		}
>> +		consume_skb(skb);
>> +		skb = NULL;
>> +	}
>> +
>> +	/* We can be running on multiple cores and from the network softirq or
>> +	 * from user context depending on where the packet is coming from.
>> +	 */
>> +	spin_lock_bh(&x->lock);
>> +
>> +	skb_list_walk_safe(segs, skb, nskb)
>> +	{
>> +		skb_mark_not_on_list(skb);
>> +
>> +		/* Once we drop due to no queue space we continue to drop the
>> +		 * rest of the packets from that GRO.
>> +		 */
>> +		if (!ok) {
>> +nospace:
>> +			if (skb->dev)
>> +				XFRM_INC_STATS(dev_net(skb->dev),
>> +					       LINUX_MIB_XFRMOUTNOQSPACE);
>> +			kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
>> +			continue;
>> +		}
>> +
>> +		/* Fragmenting handled in following commits. */
>> +		if (iptfs_is_too_big(sk, skb, pmtu)) {
>> +			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
>> +			continue;
>> +		}
>> +
>> +		/* Enqueue to send in tunnel */
>> +		ok = iptfs_enqueue(xtfs, skb);
>> +		if (!ok)
>> +			goto nospace;
>> +	}
>> +
>> +	/* Start a delay timer if we don't have one yet */
>> +	if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
>> +		hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns,
>> +			      IPTFS_HRTIMER_MODE);
>> +		xtfs->iptfs_settime = ktime_get_raw_fast_ns();
>> +	}
>> +
>> +	spin_unlock_bh(&x->lock);
>> +	return 0;
>> +}
>
> ...
>
>> +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
>> +{
>> +	struct sk_buff_head list;
>> +	struct xfrm_iptfs_data *xtfs;
>> +	struct xfrm_state *x;
>> +	time64_t settime;
>> +
>> +	xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
>> +	x = xtfs->x;
>> +
>> +	/* Process all the queued packets
>> +	 *
>> +	 * softirq execution order: timer > tasklet > hrtimer
>> +	 *
>> +	 * Network rx will have run before us giving one last chance to queue
>> +	 * ingress packets for us to process and transmit.
>> +	 */
>> +
>> +	spin_lock(&x->lock);
>> +	__skb_queue_head_init(&list);
>> +	skb_queue_splice_init(&xtfs->queue, &list);
>> +	xtfs->queue_size = 0;
>> +	settime = xtfs->iptfs_settime;
>
> nit: settime is set but otherwise unused in this function.
>
>      Flagged by W=1 x86_64 allmodconfig builds with gcc-14 and clang-18.
>

Hmm, this value is in fact used inside a tracepoint function call in this function.
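
For context, a sketch of that use; the tracepoint lands in a later
commit of this series, and its name and signature here are assumed
for illustration:

	/* assumed tracepoint: report how long the queued packets waited */
	trace_iptfs_timer_expire(xtfs,
				 (unsigned long long)(ktime_get_raw_fast_ns() -
						      settime));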

>> +	spin_unlock(&x->lock);
>> +
>> +	/* After the above unlock, packets can begin queuing again, and the
>> +	 * timer can be set again, from another CPU either in softirq or user
>> +	 * context (not from this one since we are running at softirq level
>> +	 * already).
>> +	 */
>> +
>> +	iptfs_output_queued(x, &list);
>> +
>> +	return HRTIMER_NORESTART;
>> +}
>
> ...
>
>> @@ -98,10 +607,23 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
>>  {
>>  	struct xfrm_iptfs_data *xtfs = x->mode_data;
>>  	struct xfrm_iptfs_config *xc = &xtfs->cfg;
>> -	int ret = 0;
>> +	int ret;
>> +	u64 q;
>> +
>> +	if (x->dir == XFRM_SA_DIR_OUT) {
>> +		q = xtfs->init_delay_ns;
>> +		(void)do_div(q, NSECS_IN_USEC);
>> +		ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
>> +		if (ret)
>> +			return ret;
>> +
>> +		ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE,
>> +				  xc->max_queue_size);
>> +		if (ret)
>> +			return ret;
>>
>> -	if (x->dir == XFRM_SA_DIR_OUT)
>>  		ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
>> +	}
>
> ret will be used uninitialised here unless the if condition above is true.
>
> Flagged by W=1 x86_64 allmodconfig build with clang-18, and Smatch.

Sigh, this is an artifact of splitting up the new file into multiple functionality commits. The final resulting code does not leave the value uninitialized. In any case I will fix it in this middle commit too.

Thanks!
Chris.

>
>>
>>  	return ret;
>>  }
> ...
Christian Hopps July 18, 2024, 6:32 a.m. UTC | #4
Christian Hopps <chopps@chopps.org> writes:

> [[PGP Signed Part:Good signature from 2E1D830ED7B83025 Christian Hopps <chopps@gmail.com> (trust ultimate) created at 2024-07-17T22:55:46-0700 using RSA]]
>
> Simon Horman via Devel <devel@linux-ipsec.org> writes:
>
>> On Sun, Jul 14, 2024 at 04:22:37PM -0400, Christian Hopps wrote:
>>> From: Christian Hopps <chopps@labn.net>
>>>
>>> Add tunnel packet output functionality. This code handles
>>> the ingress to the tunnel.
>>>
>>> Signed-off-by: Christian Hopps <chopps@labn.net>
>>> ---
>>>  net/xfrm/xfrm_iptfs.c | 535 +++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 532 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
>>
>> ...
>>
>>> +static int iptfs_get_cur_pmtu(struct xfrm_state *x,
>>> +			      struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
>>> +{
>>> +	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
>>> +	u32 payload_mtu = xtfs->payload_mtu;
>>> +	u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
>>
>> Hi Christian,
>>
>> Please consider arranging local variable declarations in Networking
>> code in reverse xmas tree order - longest line to shortest.
>> I think that in this case that would involve separating the
>> declaration and assignment of pmtu.
>>
>> Edward Cree's tool can be helpful here:
>> https://github.com/ecree-solarflare/xmastree
>
> This does not appear to be a style that is required by the net/xfrm code. I
> verified this by running the above tool on the other files in net/xfrm/*.c. In
> this case I'd prefer to not increase the number of lines in the function in
> order to satisfy the optional style guideline.
>
>>> +
>>> +	if (payload_mtu && payload_mtu < pmtu)
>>> +		pmtu = payload_mtu;
>>> +
>>> +	return pmtu;
>>> +}
>>
>> ...
>>
>>> +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
>>> + * (i.e., aggregating or fragmenting as appropriate).
>>> + * This is set in dst->output for an SA.
>>> + */
>>> +static int iptfs_output_collect(struct net *net, struct sock *sk,
>>> +				struct sk_buff *skb)
>>> +{
>>> +	struct dst_entry *dst = skb_dst(skb);
>>> +	struct xfrm_state *x = dst->xfrm;
>>> +	struct xfrm_iptfs_data *xtfs = x->mode_data;
>>> +	struct sk_buff *segs, *nskb;
>>> +	u32 pmtu = 0;
>>> +	bool ok = true;
>>> +	bool was_gso;
>>> +
>>> +	/* We have hooked into dst_entry->output which means we have skipped the
>>> +	 * protocol specific netfilter (see xfrm4_output, xfrm6_output).
>>> +	 * When our timer runs we will end up calling xfrm_output directly on
>>> +	 * the encapsulated traffic.
>>> +	 *
>>> +	 * For both cases this is the NF_INET_POST_ROUTING hook which allows
>>> +	 * changing the skb->dst entry which then may not be xfrm based anymore
>>> +	 * in which case a REROUTED flag is set, and dst_output is called.
>>> +	 *
>>> +	 * For IPv6 we are also skipping fragmentation handling for local
>>> +	 * sockets, which may or may not be good depending on our tunnel DF
>>> +	 * setting. Normally with fragmentation supported we want to skip this
>>> +	 * fragmentation.
>>> +	 */
>>> +
>>> +	BUG_ON(!xtfs);
>>> +
>>> +	pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
>>> +
>>> +	/* Break apart GSO skbs. If the queue is nearing full then we want the
>>> +	 * accounting and queuing to be based on the individual packets not on the
>>> +	 * aggregate GSO buffer.
>>> +	 */
>>> +	was_gso = skb_is_gso(skb);
>>> +	if (!was_gso) {
>>> +		segs = skb;
>>> +	} else {
>>> +		segs = skb_gso_segment(skb, 0);
>>> +		if (IS_ERR_OR_NULL(segs)) {
>>> +			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
>>> +			kfree_skb(skb);
>>> +			return PTR_ERR(segs);
>>
>> This will return 0 if skb_gso_segment returns NULL,
>> which occurs if skb doesn't require segmentation.
>> Is that intentional?
>>
>> If so, I wonder if it would be slightly nicer
>> to use PTR_ERR_OR_ZERO() instead of PTR_ERR().
>>
>> Flagged by Smatch (suggestion is my own).
>
> Thanks! Actually in the case of NULL I've changed the code to return -EINVAL as xfrm_output_gso() does.
>
>>> +		}
>>> +		consume_skb(skb);
>>> +		skb = NULL;
>>> +	}
>>> +
>>> +	/* We can be running on multiple cores and from the network softirq or
>>> +	 * from user context depending on where the packet is coming from.
>>> +	 */
>>> +	spin_lock_bh(&x->lock);
>>> +
>>> +	skb_list_walk_safe(segs, skb, nskb)
>>> +	{
>>> +		skb_mark_not_on_list(skb);
>>> +
>>> +		/* Once we drop due to no queue space we continue to drop the
>>> +		 * rest of the packets from that GRO.
>>> +		 */
>>> +		if (!ok) {
>>> +nospace:
>>> +			if (skb->dev)
>>> +				XFRM_INC_STATS(dev_net(skb->dev),
>>> +					       LINUX_MIB_XFRMOUTNOQSPACE);
>>> +			kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
>>> +			continue;
>>> +		}
>>> +
>>> +		/* Fragmenting handled in following commits. */
>>> +		if (iptfs_is_too_big(sk, skb, pmtu)) {
>>> +			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
>>> +			continue;
>>> +		}
>>> +
>>> +		/* Enqueue to send in tunnel */
>>> +		ok = iptfs_enqueue(xtfs, skb);
>>> +		if (!ok)
>>> +			goto nospace;
>>> +	}
>>> +
>>> +	/* Start a delay timer if we don't have one yet */
>>> +	if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
>>> +		hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns,
>>> +			      IPTFS_HRTIMER_MODE);
>>> +		xtfs->iptfs_settime = ktime_get_raw_fast_ns();
>>> +	}
>>> +
>>> +	spin_unlock_bh(&x->lock);
>>> +	return 0;
>>> +}
>>
>> ...
>>
>>> +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
>>> +{
>>> +	struct sk_buff_head list;
>>> +	struct xfrm_iptfs_data *xtfs;
>>> +	struct xfrm_state *x;
>>> +	time64_t settime;
>>> +
>>> +	xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
>>> +	x = xtfs->x;
>>> +
>>> +	/* Process all the queued packets
>>> +	 *
>>> +	 * softirq execution order: timer > tasklet > hrtimer
>>> +	 *
>>> +	 * Network rx will have run before us giving one last chance to queue
>>> +	 * ingress packets for us to process and transmit.
>>> +	 */
>>> +
>>> +	spin_lock(&x->lock);
>>> +	__skb_queue_head_init(&list);
>>> +	skb_queue_splice_init(&xtfs->queue, &list);
>>> +	xtfs->queue_size = 0;
>>> +	settime = xtfs->iptfs_settime;
>>
>> nit: settime is set but otherwise unused in this function.
>>
>>      Flagged by W=1 x86_64 allmodconfig builds with gcc-14 and clang-18.
>>
>
> Hmm, this value is in fact used inside a tracepoint function call in this function.


Realized this is another artifact of splitting the commit; I've moved settime to the later tracepoint commit now.

Thanks,
Chris.

>
>>> +	spin_unlock(&x->lock);
>>> +
>>> +	/* After the above unlock, packets can begin queuing again, and the
>>> +	 * timer can be set again, from another CPU either in softirq or user
>>> +	 * context (not from this one since we are running at softirq level
>>> +	 * already).
>>> +	 */
>>> +
>>> +	iptfs_output_queued(x, &list);
>>> +
>>> +	return HRTIMER_NORESTART;
>>> +}
>>
>> ...
>>
>>> @@ -98,10 +607,23 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
>>>  {
>>>  	struct xfrm_iptfs_data *xtfs = x->mode_data;
>>>  	struct xfrm_iptfs_config *xc = &xtfs->cfg;
>>> -	int ret = 0;
>>> +	int ret;
>>> +	u64 q;
>>> +
>>> +	if (x->dir == XFRM_SA_DIR_OUT) {
>>> +		q = xtfs->init_delay_ns;
>>> +		(void)do_div(q, NSECS_IN_USEC);
>>> +		ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
>>> +		if (ret)
>>> +			return ret;
>>> +
>>> +		ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE,
>>> +				  xc->max_queue_size);
>>> +		if (ret)
>>> +			return ret;
>>>
>>> -	if (x->dir == XFRM_SA_DIR_OUT)
>>>  		ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
>>> +	}
>>
>> ret will be used uninitialised here unless the if condition above is true.
>>
>> Flagged by W=1 x86_64 allmodconfig build with clang-18, and Smatch.
>
> Sigh, this is an artifact of splitting up the new file into multiple
> functionality commits. The final resulting code does not leave the value
> uninitialized. In any case I will fix it in this middle commit too.
>
> Thanks!
> Chris.
>
>>
>>>
>>>  	return ret;
>>>  }
>> ...
>
> [[End of PGP Signed Part]]

Patch

diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
index 414035a7a208..82c9027b85f1 100644
--- a/net/xfrm/xfrm_iptfs.c
+++ b/net/xfrm/xfrm_iptfs.c
@@ -19,8 +19,13 @@ 
 
 #include "xfrm_inout.h"
 
+#define NSECS_IN_USEC 1000
+
+#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
+
 struct xfrm_iptfs_config {
 	u32 pkt_size;	    /* outer_packet_size or 0 */
+	u32 max_queue_size; /* octets */
 };
 
 struct xfrm_iptfs_data {
@@ -28,9 +33,498 @@  struct xfrm_iptfs_data {
 
 	/* Ingress User Input */
 	struct xfrm_state *x;	    /* owning state */
+	struct sk_buff_head queue;  /* output queue */
+	u32 queue_size;		    /* octets */
+	u32 ecn_queue_size;	    /* octets above which ECN mark */
+	u64 init_delay_ns;	    /* nanoseconds */
+	struct hrtimer iptfs_timer; /* output timer */
+	time64_t iptfs_settime;	    /* time timer was set */
 	u32 payload_mtu;	    /* max payload size */
 };
 
+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me);
+
+/* ================================= */
+/* IPTFS Sending (ingress) Functions */
+/* ================================= */
+
+/* ------------------------- */
+/* Enqueue to send functions */
+/* ------------------------- */
+
+/**
+ * iptfs_enqueue() - enqueue packet if ok to send.
+ * @xtfs: xtfs state
+ * @skb: the packet
+ *
+ * Return: true if packet enqueued.
+ */
+static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
+{
+	u64 newsz = xtfs->queue_size + skb->len;
+	struct iphdr *iph;
+
+	assert_spin_locked(&xtfs->x->lock);
+
+	if (newsz > xtfs->cfg.max_queue_size)
+		return false;
+
+	/* Set ECN CE if we are above our ECN queue threshold */
+	if (newsz > xtfs->ecn_queue_size) {
+		iph = ip_hdr(skb);
+		if (iph->version == 4)
+			IP_ECN_set_ce(iph);
+		else if (iph->version == 6)
+			IP6_ECN_set_ce(skb, ipv6_hdr(skb));
+	}
+
+	__skb_queue_tail(&xtfs->queue, skb);
+	xtfs->queue_size += skb->len;
+	return true;
+}
+
+static int iptfs_get_cur_pmtu(struct xfrm_state *x,
+			      struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
+	u32 payload_mtu = xtfs->payload_mtu;
+	u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
+
+	if (payload_mtu && payload_mtu < pmtu)
+		pmtu = payload_mtu;
+
+	return pmtu;
+}
+
+static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu)
+{
+	if (skb->len <= pmtu)
+		return 0;
+
+	/* We only send ICMP too big if the user has configured us as
+	 * dont-fragment.
+	 */
+	if (skb->dev)
+		XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR);
+
+	if (sk) {
+		xfrm_local_error(skb, pmtu);
+	} else if (ip_hdr(skb)->version == 4) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(pmtu));
+	} else {
+		WARN_ON_ONCE(ip_hdr(skb)->version != 6);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
+	}
+	return 1;
+}
+
+/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
+ * (i.e., aggregating or fragmenting as appropriate).
+ * This is set in dst->output for an SA.
+ */
+static int iptfs_output_collect(struct net *net, struct sock *sk,
+				struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *segs, *nskb;
+	u32 pmtu = 0;
+	bool ok = true;
+	bool was_gso;
+
+	/* We have hooked into dst_entry->output which means we have skipped the
+	 * protocol specific netfilter (see xfrm4_output, xfrm6_output).
+	 * When our timer runs we will end up calling xfrm_output directly on
+	 * the encapsulated traffic.
+	 *
+	 * For both cases this is the NF_INET_POST_ROUTING hook which allows
+	 * changing the skb->dst entry which then may not be xfrm based anymore
+	 * in which case a REROUTED flag is set, and dst_output is called.
+	 *
+	 * For IPv6 we are also skipping fragmentation handling for local
+	 * sockets, which may or may not be good depending on our tunnel DF
+	 * setting. Normally with fragmentation supported we want to skip this
+	 * fragmentation.
+	 */
+
+	BUG_ON(!xtfs);
+
+	pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+
+	/* Break apart GSO skbs. If the queue is nearing full then we want the
+	 * accounting and queuing to be based on the individual packets not on the
+	 * aggregate GSO buffer.
+	 */
+	was_gso = skb_is_gso(skb);
+	if (!was_gso) {
+		segs = skb;
+	} else {
+		segs = skb_gso_segment(skb, 0);
+		if (IS_ERR_OR_NULL(segs)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			kfree_skb(skb);
+			return PTR_ERR(segs);
+		}
+		consume_skb(skb);
+		skb = NULL;
+	}
+
+	/* We can be running on multiple cores and from the network softirq or
+	 * from user context depending on where the packet is coming from.
+	 */
+	spin_lock_bh(&x->lock);
+
+	skb_list_walk_safe(segs, skb, nskb)
+	{
+		skb_mark_not_on_list(skb);
+
+		/* Once we drop due to no queue space we continue to drop the
+		 * rest of the packets from that GRO.
+		 */
+		if (!ok) {
+nospace:
+			if (skb->dev)
+				XFRM_INC_STATS(dev_net(skb->dev),
+					       LINUX_MIB_XFRMOUTNOQSPACE);
+			kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
+			continue;
+		}
+
+		/* Fragmenting handled in following commits. */
+		if (iptfs_is_too_big(sk, skb, pmtu)) {
+			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+			continue;
+		}
+
+		/* Enqueue to send in tunnel */
+		ok = iptfs_enqueue(xtfs, skb);
+		if (!ok)
+			goto nospace;
+	}
+
+	/* Start a delay timer if we don't have one yet */
+	if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
+		hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns,
+			      IPTFS_HRTIMER_MODE);
+		xtfs->iptfs_settime = ktime_get_raw_fast_ns();
+	}
+
+	spin_unlock_bh(&x->lock);
+	return 0;
+}
+
+/* -------------------------- */
+/* Dequeue and send functions */
+/* -------------------------- */
+
+static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff)
+{
+	struct ip_iptfs_hdr *h;
+	size_t hsz = sizeof(*h);
+
+	/* now reset values to be pointing at the rest of the packets */
+	h = skb_push(skb, hsz);
+	memset(h, 0, hsz);
+	if (blkoff)
+		h->block_offset = htons(blkoff);
+
+	/* network_header current points at the inner IP packet
+	 * move it to the iptfs header
+	 */
+	skb->transport_header = skb->network_header;
+	skb->network_header -= hsz;
+
+	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+}
+
+static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp,
+					      struct sk_buff *child)
+{
+	u32 fllen = 0;
+
+	/* It might be possible to account for a frag list in addition to page
+	 * fragment if it's a valid state to be in. The page fragments size
+	 * should be kept as data_len so only the frag_list size is removed,
+	 * this must be done above as well.
+	 */
+	BUG_ON(skb_shinfo(child)->nr_frags);
+	*nextp = skb_shinfo(child)->frag_list;
+	while (*nextp) {
+		fllen += (*nextp)->len;
+		nextp = &(*nextp)->next;
+	}
+	skb_frag_list_init(child);
+	BUG_ON(fllen > child->data_len);
+	child->len -= fllen;
+	child->data_len -= fllen;
+
+	return nextp;
+}
+
+static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *skb, *skb2, **nextp;
+	struct skb_shared_info *shi;
+
+	while ((skb = __skb_dequeue(list))) {
+		u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+		int remaining;
+
+		/* protocol comes to us cleared sometimes */
+		skb->protocol = x->outer_mode.family == AF_INET ?
+					      htons(ETH_P_IP) :
+					      htons(ETH_P_IPV6);
+
+		if (skb->len > mtu) {
+			/* We handle this case before enqueueing so we are only
+			 * here b/c MTU changed after we enqueued before we
+			 * dequeued, just drop these.
+			 */
+			if (skb->dev)
+				XFRM_INC_STATS(dev_net(skb->dev),
+					       LINUX_MIB_XFRMOUTERROR);
+
+			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+			continue;
+		}
+
+		/* If we don't have a cksum in the packet we need to add one
+		 * before encapsulation.
+		 */
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			if (skb_checksum_help(skb)) {
+				XFRM_INC_STATS(dev_net(skb_dst(skb)->dev),
+					       LINUX_MIB_XFRMOUTERROR);
+				kfree_skb(skb);
+				continue;
+			}
+		}
+
+		/* Convert first inner packet into an outer IPTFS packet */
+		iptfs_output_prepare_skb(skb, 0);
+
+		/* The space remaining to send more inner packet data is `mtu` -
+		 * (skb->len - sizeof iptfs header). This is b/c the `mtu` value
+		 * has the basic IPTFS header len accounted for, and we added
+		 * that header to the skb so it is a part of skb->len, thus we
+		 * subtract it from the skb length.
+		 */
+		remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
+
+		/* Re-home (un-nest) nested fragment lists. We need to do this
+		 * b/c we will simply be appending any following aggregated
+		 * inner packets to the frag list.
+		 */
+		shi = skb_shinfo(skb);
+		nextp = &shi->frag_list;
+		while (*nextp) {
+			if (skb_has_frag_list(*nextp))
+				nextp = iptfs_rehome_fraglist(&(*nextp)->next,
+							      *nextp);
+			else
+				nextp = &(*nextp)->next;
+		}
+
+		/* See if we have enough space to simply append.
+		 *
+		 * NOTE: Maybe do not append if we will be mis-aligned,
+		 * SW-based endpoints will probably have to copy in this
+		 * case.
+		 */
+		while ((skb2 = skb_peek(list))) {
+			if (skb2->len > remaining)
+				break;
+
+			__skb_unlink(skb2, list);
+
+			/* If we don't have a cksum in the packet we need to add
+			 * one before encapsulation.
+			 */
+			if (skb2->ip_summed == CHECKSUM_PARTIAL) {
+				if (skb_checksum_help(skb2)) {
+					XFRM_INC_STATS(
+						dev_net(skb_dst(skb2)->dev),
+						LINUX_MIB_XFRMOUTERROR);
+					kfree_skb(skb2);
+					continue;
+				}
+			}
+
+			/* Do accounting */
+			skb->data_len += skb2->len;
+			skb->len += skb2->len;
+			remaining -= skb2->len;
+
+			/* Append to the frag_list */
+			*nextp = skb2;
+			nextp = &skb2->next;
+			BUG_ON(*nextp);
+			if (skb_has_frag_list(skb2))
+				nextp = iptfs_rehome_fraglist(nextp, skb2);
+			skb->truesize += skb2->truesize;
+		}
+
+		xfrm_output(NULL, skb);
+	}
+}
+
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
+{
+	struct sk_buff_head list;
+	struct xfrm_iptfs_data *xtfs;
+	struct xfrm_state *x;
+	time64_t settime;
+
+	xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
+	x = xtfs->x;
+
+	/* Process all the queued packets
+	 *
+	 * softirq execution order: timer > tasklet > hrtimer
+	 *
+	 * Network rx will have run before us giving one last chance to queue
+	 * ingress packets for us to process and transmit.
+	 */
+
+	spin_lock(&x->lock);
+	__skb_queue_head_init(&list);
+	skb_queue_splice_init(&xtfs->queue, &list);
+	xtfs->queue_size = 0;
+	settime = xtfs->iptfs_settime;
+	spin_unlock(&x->lock);
+
+	/* After the above unlock, packets can begin queuing again, and the
+	 * timer can be set again, from another CPU either in softirq or user
+	 * context (not from this one since we are running at softirq level
+	 * already).
+	 */
+
+	iptfs_output_queued(x, &list);
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * iptfs_encap_add_ipv4() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do
+ * anything with inner headers as they aren't pointing into a normal IP
+ * singleton inner packet.
+ */
+static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct iphdr *top_iph;
+
+	skb_reset_inner_network_header(skb);
+	skb_reset_inner_transport_header(skb);
+
+	skb_set_network_header(skb,
+			       -(x->props.header_len - x->props.enc_hdr_len));
+	skb->mac_header =
+		skb->network_header + offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+	top_iph = ip_hdr(skb);
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+	top_iph->protocol = IPPROTO_AGGFRAG;
+
+	/* As we have 0, fractional, 1 or N inner packets there's no obviously
+	 * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+	 * 3.1.
+	 */
+	top_iph->tos = 0;
+
+	top_iph->frag_off = htons(IP_DF);
+	top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+	return 0;
+}
+
+/**
+ * iptfs_encap_add_ipv6() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the flow label and TOS/DSCP bits. It also sets the protocol to a different
+ * value and doesn't do anything with inner headers as they aren't pointing into
+ * a normal IP singleton inner packet.
+ */
+static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *top_iph;
+	int dsfield;
+
+	skb_reset_inner_network_header(skb);
+	skb_reset_inner_transport_header(skb);
+
+	skb_set_network_header(skb,
+			       -x->props.header_len + x->props.enc_hdr_len);
+	skb->mac_header =
+		skb->network_header + offsetof(struct ipv6hdr, nexthdr);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+	top_iph = ipv6_hdr(skb);
+	top_iph->version = 6;
+	top_iph->priority = 0;
+	memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl));
+	top_iph->nexthdr = IPPROTO_AGGFRAG;
+
+	/* As we have 0, fractional, 1 or N inner packets there's no obviously
+	 * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+	 * 3.1.
+	 */
+	dsfield = 0;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+
+	top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
+	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+
+	return 0;
+}
+
+/**
+ * iptfs_prepare_output() -  prepare the skb for output
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * Return: Error value, if 0 then skb values should be as follows:
+ *    - transport_header should point at ESP header
+ *    - network_header should point at Outer IP header
+ *    - mac_header should point at protocol/nexthdr of the outer IP
+ */
+static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	if (x->outer_mode.family == AF_INET)
+		return iptfs_encap_add_ipv4(x, skb);
+	if (x->outer_mode.family == AF_INET6) {
+#if IS_ENABLED(CONFIG_IPV6)
+		return iptfs_encap_add_ipv6(x, skb);
+#else
+		WARN_ON_ONCE(1);
+		return -EAFNOSUPPORT;
+#endif
+	}
+	WARN_ON_ONCE(1);
+	return -EOPNOTSUPP;
+}
+
 /* ========================== */
 /* State Management Functions */
 /* ========================== */
@@ -66,6 +560,9 @@  static int iptfs_user_init(struct net *net, struct xfrm_state *x,
 	struct xfrm_iptfs_config *xc;
 
 	xc = &xtfs->cfg;
+	xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize;
+	xtfs->init_delay_ns =
+		(u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC;
 
 	if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
 		xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
@@ -79,6 +576,15 @@  static int iptfs_user_init(struct net *net, struct xfrm_state *x,
 			return -EINVAL;
 		}
 	}
+	if (attrs[XFRMA_IPTFS_MAX_QSIZE])
+		xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
+	if (attrs[XFRMA_IPTFS_INIT_DELAY])
+		xtfs->init_delay_ns =
+			(u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) *
+			NSECS_IN_USEC;
+
+	xtfs->ecn_queue_size = (u64)xc->max_queue_size * 95 / 100;
+
 	return 0;
 }
 
@@ -88,8 +594,11 @@  static unsigned int iptfs_sa_len(const struct xfrm_state *x)
 	struct xfrm_iptfs_config *xc = &xtfs->cfg;
 	unsigned int l = 0;
 
-	if (x->dir == XFRM_SA_DIR_OUT)
+	if (x->dir == XFRM_SA_DIR_OUT) {
+		l += nla_total_size(sizeof(u32)); /* init delay usec */
+		l += nla_total_size(sizeof(xc->max_queue_size));
 		l += nla_total_size(sizeof(xc->pkt_size));
+	}
 
 	return l;
 }
@@ -98,10 +607,23 @@  static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct xfrm_iptfs_data *xtfs = x->mode_data;
 	struct xfrm_iptfs_config *xc = &xtfs->cfg;
-	int ret = 0;
+	int ret;
+	u64 q;
+
+	if (x->dir == XFRM_SA_DIR_OUT) {
+		q = xtfs->init_delay_ns;
+		(void)do_div(q, NSECS_IN_USEC);
+		ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
+		if (ret)
+			return ret;
+
+		ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE,
+				  xc->max_queue_size);
+		if (ret)
+			return ret;
 
-	if (x->dir == XFRM_SA_DIR_OUT)
 		ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
+	}
 
 	return ret;
 }
@@ -109,6 +631,10 @@  static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
 static int __iptfs_init_state(struct xfrm_state *x,
 			      struct xfrm_iptfs_data *xtfs)
 {
+	__skb_queue_head_init(&xtfs->queue);
+	hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+	xtfs->iptfs_timer.function = iptfs_delay_timer;
+
 	/* Modify type (esp) adjustment values */
 
 	if (x->props.family == AF_INET)
@@ -166,6 +692,7 @@  static void iptfs_delete_state(struct xfrm_state *x)
 	if (!xtfs)
 		return;
 
+	hrtimer_cancel(&xtfs->iptfs_timer);
 	kfree_sensitive(xtfs);
 
 	module_put(x->mode_cbs->owner);
@@ -180,6 +707,8 @@  static const struct xfrm_mode_cbs iptfs_mode_cbs = {
 	.sa_len = iptfs_sa_len,
 	.clone = iptfs_clone,
 	.get_inner_mtu = iptfs_get_inner_mtu,
+	.output = iptfs_output_collect,
+	.prepare_output = iptfs_prepare_output,
 };
 
 static int __init xfrm_iptfs_init(void)