
[ipsec-next,v11,10/16] xfrm: iptfs: add fragmenting of larger than MTU user packets

Message ID 20240907022412.1032284-11-chopps@chopps.org (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Series: Add IP-TFS mode to xfrm

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 16 this patch: 14
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 4 maintainers not CCed: pabeni@redhat.com kuba@kernel.org edumazet@google.com herbert@gondor.apana.org.au
netdev/build_clang fail Errors and warnings before: 16 this patch: 15
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 16 this patch: 14
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Christian Hopps Sept. 7, 2024, 2:24 a.m. UTC
From: Christian Hopps <chopps@labn.net>

Add support for tunneling user (inner) packets that are larger than the
tunnel's path MTU (outer) using IP-TFS fragmentation.

Signed-off-by: Christian Hopps <chopps@labn.net>
---
 net/xfrm/xfrm_iptfs.c | 350 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 321 insertions(+), 29 deletions(-)
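
For readers new to IP-TFS (RFC 9347) fragmentation, the sketch below is a rough userspace model, not part of the patch, of the split performed by iptfs_copy_create_frags() in the diff below: the first fragment is the original inner packet trimmed to the usable payload MTU, and each continuation fragment carries up to one payload MTU of the remainder, with its block offset (blkoff) recording how much continued data is still outstanding.

#include <stdio.h>

/* Rough model only, not from the patch: mtu is the usable IP-TFS payload
 * size per outer packet (the basic IPTFS header is already deducted), and
 * inner_len > mtu as the kernel caller guarantees.
 */
static void iptfs_frag_plan(unsigned int inner_len, unsigned int mtu)
{
	unsigned int offset = mtu;		/* first fragment is the trimmed original */
	unsigned int to_copy = inner_len - mtu;	/* bytes left for continuation fragments */

	printf("frag at offset 0: len %u, blkoff 0\n", mtu);
	while (to_copy) {
		unsigned int copy_len = to_copy < mtu ? to_copy : mtu;

		/* A continuation fragment's blkoff covers all of the data
		 * still outstanding for the fragmented inner packet, so a
		 * receiver knows where the next inner packet could start.
		 */
		printf("frag at offset %u: len %u, blkoff %u\n", offset, copy_len, to_copy);
		offset += copy_len;
		to_copy -= copy_len;
	}
}

int main(void)
{
	iptfs_frag_plan(3000, 1400);	/* e.g. a 3000-byte inner packet over 1400-byte payloads */
	return 0;
}

For a 3000-byte inner packet and 1400-byte payloads this yields three fragments with blkoff values 0, 1600 and 200; the last fragment leaves room for aggregating following inner packets, which is why the patch returns it unsent.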

Comments

Antony Antony Sept. 9, 2024, 4:43 p.m. UTC | #1
On Fri, Sep 06, 2024 at 10:24:06PM -0400, Christian Hopps wrote:
> From: Christian Hopps <chopps@labn.net>
> 
> Add support for tunneling user (inner) packets that are larger than the
> tunnel's path MTU (outer) using IP-TFS fragmentation.
> 
> Signed-off-by: Christian Hopps <chopps@labn.net>
> ---
>  net/xfrm/xfrm_iptfs.c | 350 ++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 321 insertions(+), 29 deletions(-)
> 
> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
> index e4f13acce32b..3cac47838541 100644
> --- a/net/xfrm/xfrm_iptfs.c
> +++ b/net/xfrm/xfrm_iptfs.c
> @@ -46,12 +46,29 @@
>   */
>  #define IPTFS_DEFAULT_MAX_QUEUE_SIZE	(1024 * 10240)

Are you ignoring NIPA errors/warnings? Fixing those would be helpful for
git bisects in the future.

../net/xfrm/xfrm_iptfs.c:408:15: error: implicit declaration of function 
‘skb_copy_seq_read’; did you mean ‘skb_abort_seq_read’?  [-Wimplicit-function-declaration]
  408 |         err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len);
https://netdev.bots.linux.dev/static/nipa/887940/13794923/build_32bit/stderr

Add the following line to xfrm_iptfs.c:

#include <linux/skbuff.h>

Christian Hopps Sept. 9, 2024, 5:26 p.m. UTC | #2
> On Sep 9, 2024, at 12:43, Antony Antony <antony@phenome.org> wrote:
> 
> On Fri, Sep 06, 2024 at 10:24:06PM -0400, Christian Hopps wrote:
>> From: Christian Hopps <chopps@labn.net>
>> 
>> Add support for tunneling user (inner) packets that are larger than the
>> tunnel's path MTU (outer) using IP-TFS fragmentation.
>> 
>> Signed-off-by: Christian Hopps <chopps@labn.net>
>> ---
>> net/xfrm/xfrm_iptfs.c | 350 ++++++++++++++++++++++++++++++++++++++----
>> 1 file changed, 321 insertions(+), 29 deletions(-)
>> 
>> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
>> index e4f13acce32b..3cac47838541 100644
>> --- a/net/xfrm/xfrm_iptfs.c
>> +++ b/net/xfrm/xfrm_iptfs.c
>> @@ -46,12 +46,29 @@
>>  */
>> #define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240)
> 
> Are you ignoring NIPA errors/warnings? Fixing those would be helpful for
> git bisects in the future.
> 
> ../net/xfrm/xfrm_iptfs.c:408:15: error: implicit declaration of function 
> ‘skb_copy_seq_read’; did you mean ‘skb_abort_seq_read’?  [-Wimplicit-function-declaration]
>  408 |         err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len);
> https://netdev.bots.linux.dev/static/nipa/887940/13794923/build_32bit/stderr
> 
> Add the following line to xfrm_iptfs.c:
> 
> #include <linux/skbuff.h>

Did you apply the patch to the latest ipsec-next tree? That function was added to ipsec-next recently and is not part of this patchset, so you shouldn't be seeing this error.

Thanks,
Chris.
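
For context on the helper named in the NIPA error: the copy path in this patch walks the source skb with the sequential reader. A minimal sketch of that pattern follows (kernel context assumed; the wrapper name iptfs_read_range is made up for illustration and is not part of the patch).

/* Sketch only: copy len bytes starting at offset out of an skb using the
 * sequential reader, the same calls the patch makes in
 * iptfs_copy_create_frags()/iptfs_copy_create_frag().
 */
static int iptfs_read_range(struct sk_buff *src, u32 offset, void *buf, u32 len)
{
	struct skb_seq_state st;
	int err;

	skb_prepare_seq_read(src, 0, src->len, &st);
	/* Walks head data, page frags and the frag_list as needed;
	 * returns 0 on success or a negative error.
	 */
	err = skb_copy_seq_read(&st, offset, buf, len);
	skb_abort_seq_read(&st);
	return err;
}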


Patch

diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
index e4f13acce32b..3cac47838541 100644
--- a/net/xfrm/xfrm_iptfs.c
+++ b/net/xfrm/xfrm_iptfs.c
@@ -46,12 +46,29 @@ 
  */
 #define IPTFS_DEFAULT_MAX_QUEUE_SIZE	(1024 * 10240)
 
+/* Assumed: skb->head is cache aligned.
+ *
+ * L2 Header resv: Arrange for cacheline to start at skb->data - 16 to keep the
+ * to-be-pushed L2 header in the same cacheline as resulting `skb->data` (i.e.,
+ * the L3 header). If cacheline size is > 64 then skb->data + pushed L2 will all
+ * be in a single cacheline if we simply reserve 64 bytes.
+ *
+ * L3 Header resv: For L3+L2 headers (i.e., skb->data points at the IPTFS payload)
+ * we want `skb->data` to be cacheline aligned and all pushed L2L3 headers will
+ * be in their own cacheline[s]. 128 works for cachelines up to 128 bytes; for
+ * any larger cacheline sizes the pushed headers will simply share the cacheline
+ * with the start of the IPTFS payload (skb->data).
+ */
+#define XFRM_IPTFS_MIN_L3HEADROOM 128
+#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16)
+
 #define NSECS_IN_USEC 1000
 
 #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
 
 /**
  * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
+ * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
  * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
  *	otherwise the user specified value.
  * @max_queue_size: The maximum number of octets allowed to be queued to be sent
@@ -59,6 +76,7 @@ 
  *	packets enqueued.
  */
 struct xfrm_iptfs_config {
+	bool dont_frag : 1;
 	u32 pkt_size;	    /* outer_packet_size or 0 */
 	u32 max_queue_size; /* octets */
 };
@@ -88,13 +106,73 @@  struct xfrm_iptfs_data {
 	u32 payload_mtu;	    /* max payload size */
 };
 
-static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
 static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me);
 
 /* ======================= */
 /* IPTFS SK_BUFF Functions */
 /* ======================= */
 
+/**
+ * iptfs_alloc_skb() - Allocate a new `skb`.
+ * @tpl: the skb to copy required meta-data from.
+ * @len: the linear length of the head data, zero is fine.
+ * @l3resv: true if skb reserve needs to support pushing L3 headers
+ *
+ * A new `skb` is allocated and required meta-data is copied from `tpl`; the
+ * head data is sized to `len` + reserved space set according to the @l3resv
+ * boolean.
+ *
+ * When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which arranges for
+ * `skb->data - 16` which is a good guess for good cache alignment (placing the
+ * to-be-pushed L2 header at the start of a cacheline).
+ *
+ * Otherwise, @l3resv is true and resv is set to the correct reserved space for
+ * dst->dev plus the calculated L3 overhead for the xfrm dst or
+ * XFRM_IPTFS_MIN_L3HEADROOM whichever is larger. This is then cache aligned so
+ * that all the headers will commonly fall in a cacheline when possible.
+ *
+ * l3resv=true is used on tunnel ingress (tx), because we need to reserve for
+ * the new IPTFS packet (i.e., L2+L3 headers). On tunnel egress (rx) the data
+ * being copied into the skb includes the user L3 headers already so we only
+ * need to reserve for L2.
+ *
+ * Return: the new skb or NULL.
+ */
+static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len,
+				       bool l3resv)
+{
+	struct sk_buff *skb;
+	u32 resv;
+
+	if (!l3resv) {
+		resv = XFRM_IPTFS_MIN_L2HEADROOM;
+	} else {
+		struct dst_entry *dst = skb_dst(tpl);
+
+		resv = LL_RESERVED_SPACE(dst->dev) + dst->header_len;
+		resv = max(resv, XFRM_IPTFS_MIN_L3HEADROOM);
+		resv = L1_CACHE_ALIGN(resv);
+	}
+
+	skb = alloc_skb(len + resv, GFP_ATOMIC | __GFP_NOWARN);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, resv);
+
+	if (!l3resv) {
+		/* xfrm_input resume needs dev and xfrm ext from tunnel pkt */
+		skb->dev = tpl->dev;
+		__skb_ext_copy(skb, tpl);
+	}
+
+	/* dropped by xfrm_input, used by xfrm_output */
+	skb_dst_copy(skb, tpl);
+
+	return skb;
+}
+
 /**
  * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data
  * @skb: skb with the head data
@@ -153,7 +231,7 @@  static int iptfs_get_cur_pmtu(struct xfrm_state *x,
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
 	u32 payload_mtu = xtfs->payload_mtu;
-	u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
+	u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
 
 	if (payload_mtu && payload_mtu < pmtu)
 		pmtu = payload_mtu;
@@ -216,7 +294,8 @@  static int iptfs_output_collect(struct net *net, struct sock *sk,
 
 	WARN_ON_ONCE(!xtfs);
 
-	pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+	if (xtfs->cfg.dont_frag)
+		pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
 
 	/* Break apart GSO skbs. If the queue is nearing full then we want the
 	 * accounting and queuing to be based on the individual packets not on the
@@ -256,8 +335,10 @@  static int iptfs_output_collect(struct net *net, struct sock *sk,
 			continue;
 		}
 
-		/* Fragmenting handled in following commits. */
-		if (iptfs_is_too_big(sk, skb, pmtu)) {
+		/* If the user indicated no iptfs fragmenting check before
+		 * enqueue.
+		 */
+		if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) {
 			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
 			continue;
 		}
@@ -301,6 +382,186 @@  static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff)
 	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
 }
 
+/**
+ * iptfs_copy_create_frag() - create an inner fragment skb.
+ * @st: The source packet data.
+ * @offset: offset in @st of the new fragment data.
+ * @copy_len: the amount of data to copy from @st.
+ *
+ * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must
+ * not be greater than the max fragment size.
+ *
+ * Return: the new fragment skb or an ERR_PTR().
+ */
+static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st,
+					      u32 offset, u32 copy_len)
+{
+	struct sk_buff *src = st->root_skb;
+	struct sk_buff *skb;
+	int err;
+
+	skb = iptfs_alloc_skb(src, copy_len, true);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	/* Now copy `copy_len` data from src */
+	err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len);
+	if (err) {
+		kfree_skb(skb);
+		return ERR_PTR(err);
+	}
+
+	return skb;
+}
+
+/**
+ * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb.
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ *        the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets. The last fragment is returned rather than
+ * being sent so that the caller can append more inner packets (aggregation) if
+ * there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_copy_create_frags(struct sk_buff **skbp,
+				   struct xfrm_iptfs_data *xtfs, u32 mtu)
+{
+	struct skb_seq_state skbseq;
+	struct list_head sublist;
+	struct sk_buff *skb = *skbp;
+	struct sk_buff *nskb = *skbp;
+	u32 copy_len, offset;
+	u32 to_copy = skb->len - mtu;
+	int err = 0;
+
+	INIT_LIST_HEAD(&sublist);
+
+	WARN_ON_ONCE(skb->len <= mtu);
+	skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
+
+	/* A trimmed `skb` will be sent as the first fragment, later. */
+	offset = mtu;
+	to_copy = skb->len - offset;
+	while (to_copy) {
+		/* Send all but last fragment to allow agg. append */
+		list_add_tail(&nskb->list, &sublist);
+
+		/* FUTURE: if the packet has an odd/non-aligning length we could
+		 * send less data in the penultimate fragment so that the last
+		 * fragment then ends on an aligned boundary.
+		 */
+		copy_len = min(to_copy, mtu);
+		nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len);
+		if (IS_ERR(nskb)) {
+			XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMOUTERROR);
+			skb_abort_seq_read(&skbseq);
+			err = PTR_ERR(nskb);
+			nskb = NULL;
+			break;
+		}
+		iptfs_output_prepare_skb(nskb, to_copy);
+		offset += copy_len;
+		to_copy -= copy_len;
+	}
+	skb_abort_seq_read(&skbseq);
+
+	/* return last fragment that will be unsent (or NULL) */
+	*skbp = nskb;
+
+	/* trim the original skb to MTU */
+	if (!err)
+		err = pskb_trim(skb, mtu);
+
+	if (err) {
+		/* Free all frags. Don't bother sending a partial packet we will
+		 * never complete.
+		 */
+		kfree_skb(nskb);
+		list_for_each_entry_safe(skb, nskb, &sublist, list) {
+			skb_list_del_init(skb);
+			kfree_skb(skb);
+		}
+		return err;
+	}
+
+	/* prepare the initial fragment with an iptfs header */
+	iptfs_output_prepare_skb(skb, 0);
+
+	/* Send all but last fragment, if we fail to send a fragment then free
+	 * the rest -- no point in sending a packet that can't be reassembled.
+	 */
+	list_for_each_entry_safe(skb, nskb, &sublist, list) {
+		skb_list_del_init(skb);
+		if (!err)
+			err = xfrm_output(NULL, skb);
+		else
+			kfree_skb(skb);
+	}
+	if (err)
+		kfree_skb(*skbp);
+	return err;
+}
+
+/**
+ * iptfs_first_skb() - handle the first dequeued inner packet for output
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ *        the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets.
+ *
+ * The last fragment is returned rather than being sent so that the caller can
+ * append more inner packets (aggregation) if there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs,
+			   u32 mtu)
+{
+	struct sk_buff *skb = *skbp;
+	int err;
+
+	/* Classic ESP skips the don't fragment ICMP error if DF is clear on
+	 * the inner packet or ignore_df is set. Otherwise it will send an ICMP
+	 * or local error if the inner packet won't fit its MTU.
+	 *
+	 * With IPTFS we do not care about the inner packet DF bit. If the
+	 * tunnel is configured to "don't fragment" we error back if things
+	 * don't fit in our max packet size. Otherwise we iptfs-fragment as
+	 * normal.
+	 */
+
+	/* The opportunity for HW offload has ended */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		err = skb_checksum_help(skb);
+		if (err)
+			return err;
+	}
+
+	/* We've split these up before queuing */
+	WARN_ON_ONCE(skb_is_gso(skb));
+
+	/* Consider the buffer Tx'd and no longer owned */
+	skb_orphan(skb);
+
+	/* Simple case -- it fits. `mtu` accounted for all the overhead
+	 * including the basic IPTFS header.
+	 */
+	if (skb->len <= mtu) {
+		iptfs_output_prepare_skb(skb, 0);
+		return 0;
+	}
+
+	return iptfs_copy_create_frags(skbp, xtfs, mtu);
+}
+
 static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp,
 					      struct sk_buff *child)
 {
@@ -360,6 +621,15 @@  static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
 	struct sk_buff *skb, *skb2, **nextp;
 	struct skb_shared_info *shi, *shi2;
 
+	/* If we are fragmenting due to a large inner packet we will output all
+	 * the outer IPTFS packets required to contain the fragments of the
+	 * single large inner packet. These outer packets need to be sent
+	 * consecutively (ESP seq-wise). Since this output function is always
+	 * running from a timer we do not need a lock to provide this guarantee.
+	 * We will output our packets consecutively before the timer is allowed
+	 * to run again on some other CPU.
+	 */
+
 	while ((skb = __skb_dequeue(list))) {
 		u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb);
 		bool share_ok = true;
@@ -370,7 +640,7 @@  static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
 					htons(ETH_P_IP) :
 					htons(ETH_P_IPV6);
 
-		if (skb->len > mtu) {
+		if (skb->len > mtu && xtfs->cfg.dont_frag) {
 			/* We handle this case before enqueueing so we are only
 			 * here b/c MTU changed after we enqueued before we
 			 * dequeued, just drop these.
@@ -381,29 +651,22 @@  static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
 			continue;
 		}
 
-		/* If we don't have a cksum in the packet we need to add one
-		 * before encapsulation.
+		/* Convert first inner packet into an outer IPTFS packet,
+		 * dealing with any fragmentation into multiple outer packets
+		 * if necessary.
 		 */
-		if (skb->ip_summed == CHECKSUM_PARTIAL) {
-			if (skb_checksum_help(skb)) {
-				XFRM_INC_STATS(dev_net(skb_dst(skb)->dev),
-					       LINUX_MIB_XFRMOUTERROR);
-				kfree_skb(skb);
-				continue;
-			}
-		}
-
-		/* Consider the buffer Tx'd and no longer owned */
-		skb_orphan(skb);
-
-		/* Convert first inner packet into an outer IPTFS packet */
-		iptfs_output_prepare_skb(skb, 0);
+		if (iptfs_first_skb(&skb, xtfs, mtu))
+			continue;
 
-		/* The space remaining to send more inner packet data is `mtu` -
-		 * (skb->len - sizeof iptfs header). This is b/c the `mtu` value
-		 * has the basic IPTFS header len accounted for, and we added
-		 * that header to the skb so it is a part of skb->len, thus we
-		 * subtract it from the skb length.
+		/* If fragmentation was required the returned skb is the last
+		 * IPTFS fragment in the chain, and its IPTFS header blkoff has
+		 * been set just past the end of the fragment data.
+		 *
+		 * In either case the space remaining to send more inner packet
+		 * data is `mtu` - (skb->len - sizeof iptfs header). This is b/c
+		 * the `mtu` value has the basic IPTFS header len accounted for,
+		 * and we added that header to the skb so it is a part of
+		 * skb->len, thus we subtract it from the skb length.
 		 */
 		remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
 
@@ -649,11 +912,13 @@  static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
 /* ========================== */
 
 /**
- * iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
+ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
  * @x: xfrm state.
  * @outer_mtu: the outer mtu
+ *
+ * Return: Correct MTU taking into account the encap overhead.
  */
-static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
 {
 	struct crypto_aead *aead;
 	u32 blksize;
@@ -664,6 +929,23 @@  static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
 		~(blksize - 1)) - 2;
 }
 
+/**
+ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm.
+ * @x: xfrm state.
+ * @outer_mtu: Outer MTU for the encapsulated packet.
+ *
+ * Return: Correct MTU taking into account the encap overhead.
+ */
+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+
+	/* If not dont-frag we have no MTU */
+	if (!xtfs->cfg.dont_frag)
+		return x->outer_mode.family == AF_INET ? IP_MAX_MTU : IP6_MAX_MTU;
+	return __iptfs_get_inner_mtu(x, outer_mtu);
+}
+
 /**
  * iptfs_user_init() - initialize the SA with IPTFS options from netlink.
  * @net: the net data
@@ -685,6 +967,8 @@  static int iptfs_user_init(struct net *net, struct xfrm_state *x,
 	xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
 	xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;
 
+	if (attrs[XFRMA_IPTFS_DONT_FRAG])
+		xc->dont_frag = true;
 	if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
 		xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
 		if (!xc->pkt_size) {
@@ -718,6 +1002,8 @@  static unsigned int iptfs_sa_len(const struct xfrm_state *x)
 	unsigned int l = 0;
 
 	if (x->dir == XFRM_SA_DIR_OUT) {
+		if (xc->dont_frag)
+			l += nla_total_size(0);	  /* dont-frag flag */
 		l += nla_total_size(sizeof(u32)); /* init delay usec */
 		l += nla_total_size(sizeof(xc->max_queue_size));
 		l += nla_total_size(sizeof(xc->pkt_size));
@@ -734,6 +1020,12 @@  static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
 	u64 q;
 
 	if (x->dir == XFRM_SA_DIR_OUT) {
+		if (xc->dont_frag) {
+			ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG);
+			if (ret)
+				return ret;
+		}
+
 		q = xtfs->init_delay_ns;
 		(void)do_div(q, NSECS_IN_USEC);
 		ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
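
As a footnote on the headroom constants above, here is a small userspace model of the reserve calculation in iptfs_alloc_skb(), with the kernel-side inputs (L1_CACHE_BYTES, LL_RESERVED_SPACE(dst->dev), dst->header_len) replaced by stand-in values so the arithmetic can be checked in isolation; it is illustrative only, not part of the patch.

#include <stdio.h>

/* Stand-in for the kernel's cacheline size; adjust for the target arch. */
#define L1_CACHE_BYTES 64
#define L1_CACHE_ALIGN(x) (((x) + L1_CACHE_BYTES - 1) & ~(L1_CACHE_BYTES - 1))

#define XFRM_IPTFS_MIN_L3HEADROOM 128
#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16)

/* Mirrors the resv selection in iptfs_alloc_skb(): ll_reserved and
 * dst_header_len stand in for LL_RESERVED_SPACE(dst->dev) and
 * dst->header_len.
 */
static unsigned int iptfs_headroom(int l3resv, unsigned int ll_reserved,
				   unsigned int dst_header_len)
{
	unsigned int resv;

	if (!l3resv)
		return XFRM_IPTFS_MIN_L2HEADROOM;

	resv = ll_reserved + dst_header_len;
	if (resv < XFRM_IPTFS_MIN_L3HEADROOM)
		resv = XFRM_IPTFS_MIN_L3HEADROOM;
	return L1_CACHE_ALIGN(resv);
}

int main(void)
{
	/* e.g. an Ethernet link layer plus a modest ESP/IP encap overhead */
	printf("L2 headroom: %u\n", iptfs_headroom(0, 0, 0));
	printf("L3 headroom: %u\n", iptfs_headroom(1, 16, 96));
	return 0;
}

With a 64-byte cacheline this gives 80 bytes of L2 headroom, so skb->data - 16 lands on a cacheline boundary as the comment in the patch describes, and a cacheline-aligned 128 bytes for the L3 case.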