diff mbox series

[ipsec-next,v12,15/16] xfrm: iptfs: handle reordering of received packets

Message ID 20241007135928.1218955-16-chopps@chopps.org (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series Add IP-TFS mode to xfrm | expand

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 6 this patch: 6
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 4 maintainers not CCed: edumazet@google.com pabeni@redhat.com herbert@gondor.apana.org.au kuba@kernel.org
netdev/build_clang success Errors and warnings before: 6 this patch: 6
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5 this patch: 5
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 1
netdev/source_inline success Was 0 now: 0

Commit Message

Christian Hopps Oct. 7, 2024, 1:59 p.m. UTC
From: Christian Hopps <chopps@labn.net>

Handle the receipt of the outer tunnel packets out-of-order. Pointers to
the out-of-order packets are saved in a window (array) awaiting needed
prior packets. When the required prior packets are received the now
in-order packets are then passed on to the regular packet receive code.
A timer is used to consider missing earlier packet as lost so the
algorithm will advance.

Signed-off-by: Christian Hopps <chopps@labn.net>
---
 net/xfrm/xfrm_iptfs.c | 509 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 496 insertions(+), 13 deletions(-)

Comments

Steffen Klassert Oct. 21, 2024, 8:21 a.m. UTC | #1
On Mon, Oct 07, 2024 at 09:59:27AM -0400, Christian Hopps wrote:
> From: Christian Hopps <chopps@labn.net>
> 
> +static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)
> +
> +{
> +	struct skb_wseq *s, *se;
> +	const u32 savedlen = xtfs->w_savedlen;
> +	time64_t now = ktime_get_raw_fast_ns();
> +	u32 count = 0;
> +	u32 scount = 0;
> +
> +	WARN_ON_ONCE(!savedlen);
> +	if (xtfs->w_saved[0].drop_time > now)
> +		goto set_timer;
> +
> +	++xtfs->w_wantseq;
> +
> +	/* Keep flushing packets until we reach a drop time greater than now. */
> +	s = xtfs->w_saved;
> +	se = s + savedlen;
> +	do {
> +		/* Walking past empty slots until we reach a packet */
> +		for (; s < se && !s->skb; s++)
> +			if (s->drop_time > now)
> +				goto outerdone;

Please use braces if there is more that one line in the loop.

> +
> +static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
> +				    struct sk_buff *inskb,
> +				    struct list_head *list,
> +				    struct list_head *freelist)

freelist is unused in this function.

> +{
> +	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
> +	const u64 inseq = __esp_seq(inskb);
> +	u32 savedlen = xtfs->w_savedlen;
> +	u64 wantseq = xtfs->w_wantseq;
> +	struct sk_buff *slot0 = NULL;
> +	struct skb_wseq *wnext;
> +	u32 beyond, shifting, slot;
> +	u64 distance;
> +
> +	WARN_ON_ONCE(inseq <= wantseq);

Do we really need this warning? You checked exactly this before calling
__reorder_future_shifts.

> +	distance = inseq - wantseq;
> +	WARN_ON_ONCE(distance <= nslots - 1);

Same here. There are a lot of these WARN_ON_ONCE all over the
place. I don't think we need it at places where it is clear
that the warn condition will never be true.

> +	beyond = distance - (nslots - 1);
> +
> +	/* Handle future sequence number received.
> +	 *
> +	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
> +	 * b/c we are beyond the window boundary.
> +	 *
> +	 * We know we don't have the wantseq so that counts as a drop.
> +	 */
> +
> +	/* ex: slot count is 4, array size is 3 savedlen is 2, slot 0 is the

What means 'ex:'?

> +	 * missing sequence number.
> +	 *
> +	 * the final slot at savedlen (index savedlen - 1) is always occupied.
> +	 *
> +	 * beyond is "beyond array size" not savedlen.
> +	 *
> +	 *          +--------- array length (savedlen == 2)
> +	 *          |   +----- array size (nslots - 1 == 3)
> +	 *          |   |   +- window boundary (nslots == 4)
> +	 *          V   V | V

window boundary points to seq number 6 here. In all other
examples, it points between 5 and 6. Is that intentional?

> +	 *                |
> +	 *  0   1   2   3 |   slot number
> +	 * ---  0   1   2 |   array index

This looks odd. I guess this is because slot0 does
not belong to the array. Why is that?

Can we have slot0 integrated in the array? This would make
the counting much simpler IMO.

> +	 *     [b] [c] : :|   array
> +	 *                |
> +	 * "2" "3" "4" "5"|*6*  seq numbers

Is it so that in the example above packet [a] with seq number 2
is missing and we have packet [b] with seq number 3 in slot 1
and packet [c] with seq number 4 in slot 2?

> +	 *
> +	 * We receive seq number 6
> +	 * distance == 4 [inseq(6) - w_wantseq(2)]
> +	 * newslot == distance
> +	 * index == 3 [distance(4) - 1]
> +	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
> +	 * shifting == 1 [min(savedlen(2), beyond(1)]
> +	 * slot0_skb == [b], and should match w_wantseq

We don't have slot0_skb, I guess this is slot0.

How will the example above look like after shifting?
Is it this:

	 *  0   1   2   3 |   slot number
	 * ---  0   1   2 |   array index
	 * [b] [c] : : [e]|   array
	 *                |
	 * "3" "4" "5" "6"|  seq numbers

Whereby [e] is the packet with seq number 6.

> +	 *
> +	 *                +--- window boundary (nslots == 4)
> +	 *  0   1   2   3 | 4   slot number
> +	 * ---  0   1   2 | 3   array index
> +	 *     [b] : : : :|     array
> +	 * "2" "3" "4" "5" *6*  seq numbers

What's the difference to the first example? Is it
the same but packet [c] is missing?

> +	 *
> +	 * We receive seq number 6
> +	 * distance == 4 [inseq(6) - w_wantseq(2)]
> +	 * newslot == distance
> +	 * index == 3 [distance(4) - 1]
> +	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
> +	 * shifting == 1 [min(savedlen(1), beyond(1)]
> +	 * slot0_skb == [b] and should match w_wantseq
> +	 *
> +	 *                +-- window boundary (nslots == 4)
> +	 *  0   1   2   3 | 4   5   6   slot number
> +	 * ---  0   1   2 | 3   4   5   array index
> +	 *     [-] [c] : :|             array

What does [-] mean? Is it the [b] is missing?

> +	 * "2" "3" "4" "5" "6" "7" *8*  seq numbers
> +	 *
> +	 * savedlen = 2, beyond = 3
> +	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
> +	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
> +	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
> +	 *
> +	 * We receive seq number 8
> +	 * distance == 6 [inseq(8) - w_wantseq(2)]
> +	 * newslot == distance
> +	 * index == 5 [distance(6) - 1]
> +	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
> +	 * shifting == 2 [min(savedlen(2), beyond(3)]
> +	 *
> +	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
> +	 */
> +
> +	/* Now send any packets that are being shifted out of saved, and account
> +	 * for missing packets that are exiting the window as we shift it.
> +	 */

Documentation is in genaral good, but it must be clear what's
documented here. It took me quite a while to figure out what
these examples mean, and I'm still not absoluely sure if
I'm correct. All that might be clear to you when you wrote
that, but it is very hard to guess what you thought when
writing this :)

> +	/* If savedlen > beyond we are shifting some, else all. */
> +	shifting = min(savedlen, beyond);
> +
> +	/* slot0 is the buf that just shifted out and into slot0 */
> +	slot0 = NULL;

slot0 was set to NULL already at the beginning of this function.

It is just hard to notice because of the huge coment in the middle
of the function. It might make sense to put all that bigger comments
to a central place. It would make the functions more readable,
at least.

> @@ -2222,6 +2700,11 @@ static void iptfs_destroy_state(struct xfrm_state *x)
>  	if (xtfs->ra_newskb)
>  		kfree_skb(xtfs->ra_newskb);
>  
> +	for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++)
> +		if (s->skb)
> +			kfree_skb(s->skb);

Braces please.
Christian Hopps Nov. 2, 2024, 4:30 p.m. UTC | #2
Steffen Klassert <steffen.klassert@secunet.com> writes:

> On Mon, Oct 07, 2024 at 09:59:27AM -0400, Christian Hopps wrote:
>> From: Christian Hopps <chopps@labn.net>
>>
>> +static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)
>> +
>> +{
>> +	struct skb_wseq *s, *se;
>> +	const u32 savedlen = xtfs->w_savedlen;
>> +	time64_t now = ktime_get_raw_fast_ns();
>> +	u32 count = 0;
>> +	u32 scount = 0;
>> +
>> +	WARN_ON_ONCE(!savedlen);
>> +	if (xtfs->w_saved[0].drop_time > now)
>> +		goto set_timer;
>> +
>> +	++xtfs->w_wantseq;
>> +
>> +	/* Keep flushing packets until we reach a drop time greater than now. */
>> +	s = xtfs->w_saved;
>> +	se = s + savedlen;
>> +	do {
>> +		/* Walking past empty slots until we reach a packet */
>> +		for (; s < se && !s->skb; s++)
>> +			if (s->drop_time > now)
>> +				goto outerdone;
>
> Please use braces if there is more that one line in the loop.

Done.

>> +
>> +static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
>> +				    struct sk_buff *inskb,
>> +				    struct list_head *list,
>> +				    struct list_head *freelist)
>
> freelist is unused in this function.

Removed.

>> +{
>> +	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
>> +	const u64 inseq = __esp_seq(inskb);
>> +	u32 savedlen = xtfs->w_savedlen;
>> +	u64 wantseq = xtfs->w_wantseq;
>> +	struct sk_buff *slot0 = NULL;
>> +	struct skb_wseq *wnext;
>> +	u32 beyond, shifting, slot;
>> +	u64 distance;
>> +
>> +	WARN_ON_ONCE(inseq <= wantseq);
>
> Do we really need this warning? You checked exactly this before calling
> __reorder_future_shifts.
>
>> +	distance = inseq - wantseq;
>> +	WARN_ON_ONCE(distance <= nslots - 1);
>
> Same here. There are a lot of these WARN_ON_ONCE all over the
> place. I don't think we need it at places where it is clear
> that the warn condition will never be true.

Removed.

>> +	beyond = distance - (nslots - 1);
>> +
>> +	/* Handle future sequence number received.
>> +	 *
>> +	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
>> +	 * b/c we are beyond the window boundary.
>> +	 *
>> +	 * We know we don't have the wantseq so that counts as a drop.
>> +	 */
>> +
>> +	/* ex: slot count is 4, array size is 3 savedlen is 2, slot 0 is the
>
> What means 'ex:'?

"Example" - expanded.

>> +	 * missing sequence number.
>> +	 *
>> +	 * the final slot at savedlen (index savedlen - 1) is always occupied.
>> +	 *
>> +	 * beyond is "beyond array size" not savedlen.
>> +	 *
>> +	 *          +--------- array length (savedlen == 2)
>> +	 *          |   +----- array size (nslots - 1 == 3)
>> +	 *          |   |   +- window boundary (nslots == 4)
>> +	 *          V   V | V
>
> window boundary points to seq number 6 here. In all other
> examples, it points between 5 and 6. Is that intentional?

Fixed to match others.

>> +	 *                |
>> +	 *  0   1   2   3 |   slot number
>> +	 * ---  0   1   2 |   array index
>
> This looks odd. I guess this is because slot0 does
> not belong to the array. Why is that?

the slots are the logical window, the array implements the physical data structure requirements for holding received future packets. This never includes the missing "next in line" required packet by definition (slot 0 in the window).

> Can we have slot0 integrated in the array? This would make
> the counting much simpler IMO.

We don't want to do this b/c then simple shifting logic doesn't work as slot0 is, by definition, always missing, the code will actually get more complex with an always missing initial array item to be special cased.

>> +	 *     [b] [c] : :|   array
>> +	 *                |
>> +	 * "2" "3" "4" "5"|*6*  seq numbers
>
> Is it so that in the example above packet [a] with seq number 2
> is missing and we have packet [b] with seq number 3 in slot 1
> and packet [c] with seq number 4 in slot 2?

Yes.

>> +	 *
>> +	 * We receive seq number 6
>> +	 * distance == 4 [inseq(6) - w_wantseq(2)]
>> +	 * newslot == distance
>> +	 * index == 3 [distance(4) - 1]
>> +	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
>> +	 * shifting == 1 [min(savedlen(2), beyond(1)]
>> +	 * slot0_skb == [b], and should match w_wantseq
>
> We don't have slot0_skb, I guess this is slot0.
>
> How will the example above look like after shifting?
> Is it this:
>
> 	 *  0   1   2   3 |   slot number
> 	 * ---  0   1   2 |   array index
> 	 * [b] [c] : : [e]|   array
> 	 *                |
> 	 * "3" "4" "5" "6"|  seq numbers
>
> Whereby [e] is the packet with seq number 6.

Yes.

>> +	 *
>> +	 *                +--- window boundary (nslots == 4)
>> +	 *  0   1   2   3 | 4   slot number
>> +	 * ---  0   1   2 | 3   array index
>> +	 *     [b] : : : :|     array
>> +	 * "2" "3" "4" "5" *6*  seq numbers
>
> What's the difference to the first example? Is it
> the same but packet [c] is missing?

Yes.

>> +	 *
>> +	 * We receive seq number 6
>> +	 * distance == 4 [inseq(6) - w_wantseq(2)]
>> +	 * newslot == distance
>> +	 * index == 3 [distance(4) - 1]
>> +	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
>> +	 * shifting == 1 [min(savedlen(1), beyond(1)]
>> +	 * slot0_skb == [b] and should match w_wantseq
>> +	 *
>> +	 *                +-- window boundary (nslots == 4)
>> +	 *  0   1   2   3 | 4   5   6   slot number
>> +	 * ---  0   1   2 | 3   4   5   array index
>> +	 *     [-] [c] : :|             array
>
> What does [-] mean? Is it the [b] is missing?

Yes. empty (NULL) array item.

>> +	 * "2" "3" "4" "5" "6" "7" *8*  seq numbers
>> +	 *
>> +	 * savedlen = 2, beyond = 3
>> +	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
>> +	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
>> +	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
>> +	 *
>> +	 * We receive seq number 8
>> +	 * distance == 6 [inseq(8) - w_wantseq(2)]
>> +	 * newslot == distance
>> +	 * index == 5 [distance(6) - 1]
>> +	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
>> +	 * shifting == 2 [min(savedlen(2), beyond(3)]
>> +	 *
>> +	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
>> +	 */
>> +
>> +	/* Now send any packets that are being shifted out of saved, and account
>> +	 * for missing packets that are exiting the window as we shift it.
>> +	 */
>
> Documentation is in genaral good, but it must be clear what's
> documented here. It took me quite a while to figure out what
> these examples mean, and I'm still not absoluely sure if
> I'm correct. All that might be clear to you when you wrote
> that, but it is very hard to guess what you thought when
> writing this :)

You understood everything correctly. Sliding window code is by nature complex, and as you noted I really wanted to document the crap out of it to try and help. You got it all right so I think I accomplished the goal. :)

>> +	/* If savedlen > beyond we are shifting some, else all. */
>> +	shifting = min(savedlen, beyond);
>> +
>> +	/* slot0 is the buf that just shifted out and into slot0 */
>> +	slot0 = NULL;
>
> slot0 was set to NULL already at the beginning of this function.
>
> It is just hard to notice because of the huge coment in the middle
> of the function. It might make sense to put all that bigger comments
> to a central place. It would make the functions more readable,
> at least.

That was the intention. :) I removed the redundant assignment above, and moved 2 others below to join the rest.

>> @@ -2222,6 +2700,11 @@ static void iptfs_destroy_state(struct xfrm_state *x)
>>  	if (xtfs->ra_newskb)
>>  		kfree_skb(xtfs->ra_newskb);
>>
>> +	for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++)
>> +		if (s->skb)
>> +			kfree_skb(s->skb);
>
> Braces please.

Done.

Thanks,
Chris.
diff mbox series

Patch

diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
index ef4c23159471..c8ba4a9f77a5 100644
--- a/net/xfrm/xfrm_iptfs.c
+++ b/net/xfrm/xfrm_iptfs.c
@@ -39,6 +39,17 @@ 
  */
 #define IPTFS_DEFAULT_DROP_TIME_USECS	1000000
 
+/**
+ * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size
+ *
+ * The default IPTFS reorder window size. The reorder window size dictates the
+ * maximum number of IPTFS tunnel packets in a sequence that may arrive out of
+ * order.
+ *
+ * Default 3. (tcp folks suggested)
+ */
+#define IPTFS_DEFAULT_REORDER_WINDOW	3
+
 /* ------------------------------------------------ */
 /* IPTFS default SA values (tunnel ingress/dir-out) */
 /* ------------------------------------------------ */
@@ -91,6 +102,8 @@ 
 /**
  * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
  * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
+ * @reorder_win_size: the number slots in the reorder window, thus the number of
+ *	packets that may arrive out of order.
  * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
  *	otherwise the user specified value.
  * @max_queue_size: The maximum number of octets allowed to be queued to be sent
@@ -99,10 +112,16 @@ 
  */
 struct xfrm_iptfs_config {
 	bool dont_frag : 1;
+	u16 reorder_win_size;
 	u32 pkt_size;	    /* outer_packet_size or 0 */
 	u32 max_queue_size; /* octets */
 };
 
+struct skb_wseq {
+	struct sk_buff *skb;
+	u64 drop_time;
+};
+
 /**
  * struct xfrm_iptfs_data - mode specific xfrm state.
  * @cfg: IPTFS tunnel config.
@@ -113,6 +132,10 @@  struct xfrm_iptfs_config {
  * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
  * @iptfs_timer: output timer.
  * @payload_mtu: max payload size.
+ * @w_seq_set: true after first seq received.
+ * @w_wantseq: waiting for this seq number as next to process (in order).
+ * @w_saved: the saved buf array (reorder window).
+ * @w_savedlen: the saved len (not size).
  * @drop_lock: lock to protect reorder queue.
  * @drop_timer: timer for considering next packet lost.
  * @drop_time_ns: timer intervan in nanoseconds.
@@ -134,12 +157,16 @@  struct xfrm_iptfs_data {
 	struct hrtimer iptfs_timer; /* output timer */
 	u32 payload_mtu;	    /* max payload size */
 
-	/* Tunnel egress */
+	/* Tunnel input reordering */
+	bool w_seq_set;		  /* true after first seq received */
+	u64 w_wantseq;		  /* expected next sequence */
+	struct skb_wseq *w_saved; /* the saved buf array */
+	u32 w_savedlen;		  /* the saved len (not size) */
 	spinlock_t drop_lock;
 	struct hrtimer drop_timer;
 	u64 drop_time_ns;
 
-	/* Tunnel egress reassembly */
+	/* Tunnel input reassembly */
 	struct sk_buff *ra_newskb; /* new pkt being reassembled */
 	u64 ra_wantseq;		   /* expected next sequence */
 	u8 ra_runt[6];		   /* last pkt bytes from last skb */
@@ -891,15 +918,13 @@  static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
 }
 
 /**
- * iptfs_input() - handle receipt of iptfs payload
+ * iptfs_input_ordered() - handle next in order IPTFS payload.
  * @x: xfrm state
- * @skb: the packet
+ * @skb: current packet
  *
  * Process the IPTFS payload in `skb` and consume it afterwards.
- *
- * Returns 0.
  */
-static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
 {
 	u8 hbytes[sizeof(struct ipv6hdr)];
 	struct ip_iptfs_cc_hdr iptcch;
@@ -1221,12 +1246,368 @@  static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
 		WARN_ON_ONCE(!skb);
 		kfree_skb(skb);
 	}
+}
 
-	/* We always have dealt with the input SKB, either we are re-using it,
-	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
-	 * processing it.
+/* ------------------------------- */
+/* Input (Egress) Re-ordering Code */
+/* ------------------------------- */
+
+static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift)
+{
+	u32 savedlen = xtfs->w_savedlen;
+
+	if (shift > savedlen)
+		shift = savedlen;
+	if (shift != savedlen)
+		memcpy(xtfs->w_saved, xtfs->w_saved + shift,
+		       (savedlen - shift) * sizeof(*xtfs->w_saved));
+	memset(xtfs->w_saved + savedlen - shift, 0,
+	       shift * sizeof(*xtfs->w_saved));
+	xtfs->w_savedlen -= shift;
+}
+
+static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+			   struct list_head *freelist)
+{
+	list_add_tail(&inskb->list, freelist);
+}
+
+static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)
+
+{
+	struct skb_wseq *s, *se;
+	const u32 savedlen = xtfs->w_savedlen;
+	time64_t now = ktime_get_raw_fast_ns();
+	u32 count = 0;
+	u32 scount = 0;
+
+	WARN_ON_ONCE(!savedlen);
+	if (xtfs->w_saved[0].drop_time > now)
+		goto set_timer;
+
+	++xtfs->w_wantseq;
+
+	/* Keep flushing packets until we reach a drop time greater than now. */
+	s = xtfs->w_saved;
+	se = s + savedlen;
+	do {
+		/* Walking past empty slots until we reach a packet */
+		for (; s < se && !s->skb; s++)
+			if (s->drop_time > now)
+				goto outerdone;
+		/* Sending packets until we hit another empty slot. */
+		for (; s < se && s->skb; scount++, s++)
+			list_add_tail(&s->skb->list, list);
+	} while (s < se);
+outerdone:
+
+	count = s - xtfs->w_saved;
+	if (count) {
+		xtfs->w_wantseq += count;
+
+		/* Shift handled slots plus final empty slot into slot 0. */
+		__vec_shift(xtfs, count);
+	}
+
+	if (xtfs->w_savedlen) {
+set_timer:
+		/* Drifting is OK */
+		hrtimer_start(&xtfs->drop_timer,
+			      xtfs->w_saved[0].drop_time - now,
+			      IPTFS_HRTIMER_MODE);
+	}
+	return scount;
+}
+
+static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+			   struct list_head *list)
+{
+	struct skb_wseq *s, *se;
+	const u32 savedlen = xtfs->w_savedlen;
+	u32 count = 0;
+
+	/* Got what we wanted. */
+	list_add_tail(&inskb->list, list);
+	++xtfs->w_wantseq;
+	if (!savedlen)
+		return;
+
+	/* Flush remaining consecutive packets. */
+
+	/* Keep sending until we hit another missed pkt. */
+	for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++)
+		list_add_tail(&s->skb->list, list);
+	count = s - xtfs->w_saved;
+	if (count)
+		xtfs->w_wantseq += count;
+
+	/* Shift handled slots plus final empty slot into slot 0. */
+	__vec_shift(xtfs, count + 1);
+}
+
+/* Set the slot's drop time and all the empty slots below it until reaching a
+ * filled slot which will already be set.
+ */
+static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index)
+{
+	const u32 savedlen = xtfs->w_savedlen;
+	struct skb_wseq *s = xtfs->w_saved;
+	time64_t drop_time;
+
+	assert_spin_locked(&xtfs->drop_lock);
+
+	if (savedlen > index + 1) {
+		/* we are below another, our drop time and the timer are already set */
+		WARN_ON_ONCE(xtfs->w_saved[index + 1].drop_time !=
+			     xtfs->w_saved[index].drop_time);
+		return;
+	}
+	/* we are the most future so get a new drop time. */
+	drop_time = ktime_get_raw_fast_ns();
+	drop_time += xtfs->drop_time_ns;
+
+	/* Walk back through the array setting drop times as we go */
+	s[index].drop_time = drop_time;
+	while (index-- > 0 && !s[index].skb)
+		s[index].drop_time = drop_time;
+
+	/* If we walked all the way back, schedule the drop timer if needed */
+	if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer))
+		hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns,
+			      IPTFS_HRTIMER_MODE);
+}
+
+static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs,
+				  struct sk_buff *inskb,
+				  struct list_head *freelist)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	const u64 inseq = __esp_seq(inskb);
+	const u64 wantseq = xtfs->w_wantseq;
+	const u64 distance = inseq - wantseq;
+	const u32 savedlen = xtfs->w_savedlen;
+	const u32 index = distance - 1;
+
+	WARN_ON_ONCE(distance >= nslots);
+
+	/* Handle future sequence number received which fits in the window.
+	 *
+	 * We know we don't have the seq we want so we won't be able to flush
+	 * anything.
 	 */
-	return -EINPROGRESS;
+
+	/* slot count is 4, saved size is 3 savedlen is 2
+	 *
+	 * "window boundary" is based on the fixed window size
+	 * distance is also slot number
+	 * index is an array index (i.e., - 1 of slot)
+	 * : : - implicit NULL after array len
+	 *
+	 *          +--------- used length (savedlen == 2)
+	 *          |   +----- array size (nslots - 1 == 3)
+	 *          |   |   + window boundary (nslots == 4)
+	 *          V   V | V
+	 *                |
+	 *  0   1   2   3 |   slot number
+	 * ---  0   1   2 |   array index
+	 *     [-] [b] : :|   array
+	 *
+	 * "2" "3" "4" *5*|   seq numbers
+	 *
+	 * We receive seq number 5
+	 * distance == 3 [inseq(5) - w_wantseq(2)]
+	 * index == 2 [distance(6) - 1]
+	 */
+
+	if (xtfs->w_saved[index].skb) {
+		/* a dup of a future */
+		list_add_tail(&inskb->list, freelist);
+		return;
+	}
+
+	xtfs->w_saved[index].skb = inskb;
+	xtfs->w_savedlen = max(savedlen, index + 1);
+	iptfs_set_window_drop_times(xtfs, index);
+}
+
+static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
+				    struct sk_buff *inskb,
+				    struct list_head *list,
+				    struct list_head *freelist)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	const u64 inseq = __esp_seq(inskb);
+	u32 savedlen = xtfs->w_savedlen;
+	u64 wantseq = xtfs->w_wantseq;
+	struct sk_buff *slot0 = NULL;
+	struct skb_wseq *wnext;
+	u32 beyond, shifting, slot;
+	u64 distance;
+
+	WARN_ON_ONCE(inseq <= wantseq);
+	distance = inseq - wantseq;
+	WARN_ON_ONCE(distance <= nslots - 1);
+	beyond = distance - (nslots - 1);
+
+	/* Handle future sequence number received.
+	 *
+	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
+	 * b/c we are beyond the window boundary.
+	 *
+	 * We know we don't have the wantseq so that counts as a drop.
+	 */
+
+	/* ex: slot count is 4, array size is 3 savedlen is 2, slot 0 is the
+	 * missing sequence number.
+	 *
+	 * the final slot at savedlen (index savedlen - 1) is always occupied.
+	 *
+	 * beyond is "beyond array size" not savedlen.
+	 *
+	 *          +--------- array length (savedlen == 2)
+	 *          |   +----- array size (nslots - 1 == 3)
+	 *          |   |   +- window boundary (nslots == 4)
+	 *          V   V | V
+	 *                |
+	 *  0   1   2   3 |   slot number
+	 * ---  0   1   2 |   array index
+	 *     [b] [c] : :|   array
+	 *                |
+	 * "2" "3" "4" "5"|*6*  seq numbers
+	 *
+	 * We receive seq number 6
+	 * distance == 4 [inseq(6) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 3 [distance(4) - 1]
+	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(2), beyond(1)]
+	 * slot0_skb == [b], and should match w_wantseq
+	 *
+	 *                +--- window boundary (nslots == 4)
+	 *  0   1   2   3 | 4   slot number
+	 * ---  0   1   2 | 3   array index
+	 *     [b] : : : :|     array
+	 * "2" "3" "4" "5" *6*  seq numbers
+	 *
+	 * We receive seq number 6
+	 * distance == 4 [inseq(6) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 3 [distance(4) - 1]
+	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(1), beyond(1)]
+	 * slot0_skb == [b] and should match w_wantseq
+	 *
+	 *                +-- window boundary (nslots == 4)
+	 *  0   1   2   3 | 4   5   6   slot number
+	 * ---  0   1   2 | 3   4   5   array index
+	 *     [-] [c] : :|             array
+	 * "2" "3" "4" "5" "6" "7" *8*  seq numbers
+	 *
+	 * savedlen = 2, beyond = 3
+	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
+	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
+	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
+	 *
+	 * We receive seq number 8
+	 * distance == 6 [inseq(8) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 5 [distance(6) - 1]
+	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
+	 * shifting == 2 [min(savedlen(2), beyond(3)]
+	 *
+	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
+	 */
+
+	/* Now send any packets that are being shifted out of saved, and account
+	 * for missing packets that are exiting the window as we shift it.
+	 */
+
+	/* If savedlen > beyond we are shifting some, else all. */
+	shifting = min(savedlen, beyond);
+
+	/* slot0 is the buf that just shifted out and into slot0 */
+	slot0 = NULL;
+	wnext = xtfs->w_saved;
+	for (slot = 1; slot <= shifting; slot++, wnext++) {
+		/* handle what was in slot0 before we occupy it */
+		if (slot0)
+			list_add_tail(&slot0->list, list);
+		slot0 = wnext->skb;
+		wnext->skb = NULL;
+	}
+
+	/* slot0 is now either NULL (in which case it's what we now are waiting
+	 * for, or a buf in which case we need to handle it like we received it;
+	 * however, we may be advancing past that buffer as well..
+	 */
+
+	/* Handle case where we need to shift more than we had saved, slot0 will
+	 * be NULL iff savedlen is 0, otherwise slot0 will always be
+	 * non-NULL b/c we shifted the final element, which is always set if
+	 * there is any saved, into slot0.
+	 */
+	if (savedlen < beyond) {
+		if (savedlen == 0) {
+			WARN_ON_ONCE(slot0);
+		} else {
+			WARN_ON_ONCE(!slot0);
+			list_add_tail(&slot0->list, list);
+		}
+		slot0 = NULL;
+		/* slot0 has had an empty slot pushed into it */
+	}
+
+	/* Remove the entries */
+	__vec_shift(xtfs, beyond);
+
+	/* Advance want seq */
+	xtfs->w_wantseq += beyond;
+
+	/* Process drops here when implementing congestion control */
+
+	/* We've shifted. plug the packet in at the end. */
+	xtfs->w_savedlen = nslots - 1;
+	xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb;
+	iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1);
+
+	/* if we don't have a slot0 then we must wait for it */
+	if (!slot0)
+		return;
+
+	/* If slot0, seq must match new want seq */
+	WARN_ON_ONCE(xtfs->w_wantseq != __esp_seq(slot0));
+
+	/* slot0 is valid, treat like we received expected. */
+	__reorder_this(xtfs, slot0, list);
+}
+
+/* Receive a new packet into the reorder window. Return a list of ordered
+ * packets from the window.
+ */
+static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs,
+				struct sk_buff *inskb, struct list_head *list,
+				struct list_head *freelist)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	u64 inseq = __esp_seq(inskb);
+	u64 wantseq;
+
+	assert_spin_locked(&xtfs->drop_lock);
+
+	if (unlikely(!xtfs->w_seq_set)) {
+		xtfs->w_seq_set = true;
+		xtfs->w_wantseq = inseq;
+	}
+	wantseq = xtfs->w_wantseq;
+
+	if (likely(inseq == wantseq))
+		__reorder_this(xtfs, inskb, list);
+	else if (inseq < wantseq)
+		__reorder_past(xtfs, inskb, freelist);
+	else if ((inseq - wantseq) < nslots)
+		__reorder_future_fits(xtfs, inskb, freelist);
+	else
+		__reorder_future_shifts(xtfs, inskb, list, freelist);
 }
 
 /**
@@ -1253,23 +1634,92 @@  static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
  */
 static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me)
 {
+	struct sk_buff *skb, *next;
+	struct list_head list;
 	struct xfrm_iptfs_data *xtfs;
-	struct sk_buff *skb;
+	struct xfrm_state *x;
+	u32 count;
 
 	xtfs = container_of(me, typeof(*xtfs), drop_timer);
+	x = xtfs->x;
+
+	INIT_LIST_HEAD(&list);
 
-	/* Drop any in progress packet */
 	spin_lock(&xtfs->drop_lock);
+
+	/* Drop any in progress packet */
 	skb = xtfs->ra_newskb;
 	xtfs->ra_newskb = NULL;
+
+	/* Now drop as many packets as we should from the reordering window
+	 * saved array
+	 */
+	count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0;
+
 	spin_unlock(&xtfs->drop_lock);
 
 	if (skb)
 		kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
 
+	if (count) {
+		list_for_each_entry_safe(skb, next, &list, list) {
+			skb_list_del_init(skb);
+			iptfs_input_ordered(x, skb);
+		}
+	}
+
 	return HRTIMER_NORESTART;
 }
 
+/**
+ * iptfs_input() - handle receipt of iptfs payload
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * We have an IPTFS payload order it if needed, then process newly in order
+ * packets.
+ *
+ * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb.
+ */
+static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct list_head freelist, list;
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *next;
+
+	/* Fast path for no reorder window. */
+	if (xtfs->cfg.reorder_win_size == 0) {
+		iptfs_input_ordered(x, skb);
+		goto done;
+	}
+
+	/* Fetch list of in-order packets from the reordering window as well as
+	 * a list of buffers we need to now free.
+	 */
+	INIT_LIST_HEAD(&list);
+	INIT_LIST_HEAD(&freelist);
+
+	spin_lock(&xtfs->drop_lock);
+	iptfs_input_reorder(xtfs, skb, &list, &freelist);
+	spin_unlock(&xtfs->drop_lock);
+
+	list_for_each_entry_safe(skb, next, &list, list) {
+		skb_list_del_init(skb);
+		iptfs_input_ordered(x, skb);
+	}
+
+	list_for_each_entry_safe(skb, next, &freelist, list) {
+		skb_list_del_init(skb);
+		kfree_skb(skb);
+	}
+done:
+	/* We always have dealt with the input SKB, either we are re-using it,
+	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
+	 * processing it.
+	 */
+	return -EINPROGRESS;
+}
+
 /* ================================= */
 /* IPTFS Sending (ingress) Functions */
 /* ================================= */
@@ -2048,11 +2498,24 @@  static int iptfs_user_init(struct net *net, struct xfrm_state *x,
 
 	xc = &xtfs->cfg;
 	xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
+	xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW;
 	xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC;
 	xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;
 
 	if (attrs[XFRMA_IPTFS_DONT_FRAG])
 		xc->dont_frag = true;
+	if (attrs[XFRMA_IPTFS_REORDER_WINDOW])
+		xc->reorder_win_size =
+			nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]);
+	/* saved array is for saving 1..N seq nums from wantseq */
+	if (xc->reorder_win_size) {
+		xtfs->w_saved = kcalloc(xc->reorder_win_size,
+					sizeof(*xtfs->w_saved), GFP_KERNEL);
+		if (!xtfs->w_saved) {
+			NL_SET_ERR_MSG(extack, "Cannot alloc reorder window");
+			return -ENOMEM;
+		}
+	}
 	if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
 		xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
 		if (!xc->pkt_size) {
@@ -2091,6 +2554,7 @@  static unsigned int iptfs_sa_len(const struct xfrm_state *x)
 
 	if (x->dir == XFRM_SA_DIR_IN) {
 		l += nla_total_size(sizeof(u32)); /* drop time usec */
+		l += nla_total_size(sizeof(xc->reorder_win_size));
 	} else {
 		if (xc->dont_frag)
 			l += nla_total_size(0);	  /* dont-frag flag */
@@ -2113,6 +2577,11 @@  static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
 		q = xtfs->drop_time_ns;
 		(void)do_div(q, NSECS_IN_USEC);
 		ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q);
+		if (ret)
+			return ret;
+
+		ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW,
+				  xc->reorder_win_size);
 	} else {
 		if (xc->dont_frag) {
 			ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG);
@@ -2175,6 +2644,14 @@  static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig)
 	xtfs->x = x;
 
 	xtfs->ra_newskb = NULL;
+	if (xtfs->cfg.reorder_win_size) {
+		xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size,
+					sizeof(*xtfs->w_saved), GFP_KERNEL);
+		if (!xtfs->w_saved) {
+			kfree_sensitive(xtfs);
+			return -ENOMEM;
+		}
+	}
 
 	return 0;
 }
@@ -2201,6 +2678,7 @@  static void iptfs_destroy_state(struct xfrm_state *x)
 {
 	struct xfrm_iptfs_data *xtfs = x->mode_data;
 	struct sk_buff_head list;
+	struct skb_wseq *s, *se;
 	struct sk_buff *skb;
 
 	if (!xtfs)
@@ -2222,6 +2700,11 @@  static void iptfs_destroy_state(struct xfrm_state *x)
 	if (xtfs->ra_newskb)
 		kfree_skb(xtfs->ra_newskb);
 
+	for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++)
+		if (s->skb)
+			kfree_skb(s->skb);
+
+	kfree_sensitive(xtfs->w_saved);
 	kfree_sensitive(xtfs);
 
 	module_put(x->mode_cbs->owner);