[ipsec-next,v5,13/17] xfrm: iptfs: handle received fragmented inner packets

Message ID	20240714202246.1573817-14-chopps@chopps.org (mailing list archive)
State	Awaiting Upstream
Delegated to:	Netdev Maintainers
Headers	show Received: from smtp.chopps.org (smtp.chopps.org [54.88.81.56]) by smtp.subspace.kernel.org (Postfix) with ESMTP id BDD0C1428F3 for <netdev@vger.kernel.org>; Sun, 14 Jul 2024 20:23:51 +0000 (UTC) From: Christian Hopps <chopps@chopps.org> To: devel@linux-ipsec.org Cc: Steffen Klassert <steffen.klassert@secunet.com>, netdev@vger.kernel.org, Christian Hopps <chopps@chopps.org>, Christian Hopps <chopps@labn.net> Subject: [PATCH ipsec-next v5 13/17] xfrm: iptfs: handle received fragmented inner packets Date: Sun, 14 Jul 2024 16:22:41 -0400 Message-ID: <20240714202246.1573817-14-chopps@chopps.org> In-Reply-To: <20240714202246.1573817-1-chopps@chopps.org> References: <20240714202246.1573817-1-chopps@chopps.org> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	Add IP-TFS mode to xfrm \| expand [ipsec-next,v5,00/17] Add IP-TFS mode to xfrm [ipsec-next,v5,01/17] xfrm: config: add CONFIG_XFRM_IPTFS [ipsec-next,v5,02/17] include: uapi: add ip_tfs_*_hdr packet formats [ipsec-next,v5,03/17] include: uapi: add IPPROTO_AGGFRAG for AGGFRAG in ESP [ipsec-next,v5,04/17] xfrm: sysctl: allow configuration of global default values [ipsec-next,v5,05/17] xfrm: netlink: add config (netlink) options [ipsec-next,v5,06/17] xfrm: add mode_cbs module functionality [ipsec-next,v5,07/17] xfrm: add generic iptfs defines and functionality [ipsec-next,v5,08/17] xfrm: iptfs: add new iptfs xfrm mode impl [ipsec-next,v5,09/17] xfrm: iptfs: add user packet (tunnel ingress) handling [ipsec-next,v5,10/17] xfrm: iptfs: share page fragments of inner packets [ipsec-next,v5,11/17] xfrm: iptfs: add fragmenting of larger than MTU user packets [ipsec-next,v5,12/17] xfrm: iptfs: add basic receive packet (tunnel egress) handling [ipsec-next,v5,13/17] xfrm: iptfs: handle received fragmented inner packets [ipsec-next,v5,14/17] xfrm: iptfs: add reusing received skb for the tunnel egress packet [ipsec-next,v5,15/17] xfrm: iptfs: add skb-fragment sharing code [ipsec-next,v5,16/17] xfrm: iptfs: handle reordering of received packets [ipsec-next,v5,17/17] xfrm: iptfs: add tracepoint functionality

Context	Check	Description
netdev/tree_selection	success	Guessing tree name failed - patch did not apply

diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 0060f8825599..f608f0d10614 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -56,10 +56,22 @@ struct xfrm_iptfs_data { struct hrtimer iptfs_timer; /* output timer */ time64_t iptfs_settime; /* time timer was set */ u32 payload_mtu; /* max payload size */ + + /* Tunnel egress */ + spinlock_t drop_lock; + struct hrtimer drop_timer; + u64 drop_time_ns; + + /* Tunnel egress reassembly */ + struct sk_buff *ra_newskb; /* new pkt being reassembled */ + u64 ra_wantseq; /* expected next sequence */ + u8 ra_runt[6]; /* last pkt bytes from last skb */ + u8 ra_runtlen; /* count of ra_runt */ }; static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); /* ================= */ /* Utility Functions */ @@ -217,6 +229,63 @@ iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) return skb; } +/** + * iptfs_input_save_runt() - save data in xtfs runt space. + * @xtfs: xtfs state + * @seq: the current sequence + * @buf: packet data + * @len: length of packet data + * + * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data + * runt space. + */ +static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq, + u8 *buf, int len) +{ + BUG_ON(xtfs->ra_newskb); /* we won't have a new SKB yet */ + + memcpy(xtfs->ra_runt, buf, len); + + xtfs->ra_runtlen = len; + xtfs->ra_wantseq = seq + 1; +} + +/** + * __iptfs_iphlen() - return the v4/v6 header length using packet data. + * @data: pointer at octet with version nibble + * + * The version data is expected to be valid (i.e., either 4 or 6). + */ +static u32 __iptfs_iphlen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return sizeof(*iph); + BUG_ON(iph->version != 0x6); + return sizeof(struct ipv6hdr); +} + +/** + * __iptfs_iplen() - return the v4/v6 length using packet data. + * @data: pointer to ip (v4/v6) packet header + * + * Grab the IPv4 or IPv6 length value in the start of the inner packet header + * pointed to by `data`. Assumes data len is enough for the length field only. + * + * The version data is expected to be valid (i.e., either 4 or 6). + */ +static u32 __iptfs_iplen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return ntohs(iph->tot_len); + BUG_ON(iph->version != 0x6); + return ntohs(((struct ipv6hdr *)iph)->payload_len) + + sizeof(struct ipv6hdr); +} + /** * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. * @x: xfrm state @@ -266,6 +335,238 @@ static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb) } } +static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free) +{ + assert_spin_locked(&xtfs->drop_lock); + + /* We don't care if it works locking takes care of things */ + hrtimer_try_to_cancel(&xtfs->drop_timer); + if (free) + kfree_skb(xtfs->ra_newskb); + xtfs->ra_newskb = NULL; +} + +/** + * iptfs_reassem_abort() - In-progress packet is aborted free the state. + * @xtfs: xtfs state + */ +static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs) +{ + __iptfs_reassem_done(xtfs, true); +} + +/** + * iptfs_reassem_done() - In-progress packet is complete, clear the state. + * @xtfs: xtfs state + */ +static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs) +{ + __iptfs_reassem_done(xtfs, false); +} + +/** + * iptfs_reassem_cont() - Continue the reassembly of an inner packets. + * @xtfs: xtfs state + * @seq: sequence of current packet + * @st: seq read stat for current packet + * @skb: current packet + * @data: offset into sequential packet data + * @blkoff: packet blkoff value + * @list: list of skbs to enqueue completed packet on + * + * Process an IPTFS payload that has a non-zero `blkoff` or when we are + * expecting the continuation b/c we have a runt or in-progress packet. + */ +static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, + struct skb_seq_state *st, struct sk_buff *skb, + u32 data, u32 blkoff, struct list_head *list) +{ + struct sk_buff *newskb = xtfs->ra_newskb; + u32 remaining = skb->len - data; + u32 runtlen = xtfs->ra_runtlen; + u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem; + + /* Handle packet fragment we aren't expecting */ + if (!runtlen && !xtfs->ra_newskb) + return data + min(blkoff, remaining); + + /* Important to remember that input to this function is an ordered + * packet stream (unless the user disabled the reorder window). Thus if + * we are waiting for, and expecting the next packet so we can continue + * assembly, a newer sequence number indicates older ones are not coming + * (or if they do should be ignored). Technically we can receive older + * ones when the reorder window is disabled; however, the user should + * have disabled fragmentation in this case, and regardless we don't + * deal with it. + * + * blkoff could be zero if the stream is messed up (or it's an all pad + * insertion) be careful to handle that case in each of the below + */ + + /* Too old case: This can happen when the reorder window is disabled so + * ordering isn't actually guaranteed. + */ + if (seq < xtfs->ra_wantseq) + return data + remaining; + + /* Too new case: We missed what we wanted cleanup. */ + if (seq > xtfs->ra_wantseq) { + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + if (blkoff == 0) { + if ((*skb->data & 0xF0) != 0) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + /* Handle all pad case, advance expected sequence number. + * (RFC 9347 S2.2.3) + */ + xtfs->ra_wantseq++; + /* will end parsing */ + return data + remaining; + } + + if (runtlen) { + BUG_ON(xtfs->ra_newskb); + + /* Regardless of what happens we're done with the runt */ + xtfs->ra_runtlen = 0; + + /* The start of this inner packet was at the very end of the last + * iptfs payload which didn't include enough for the ip header + * length field. We must have *at least* that now. + */ + rrem = sizeof(xtfs->ra_runt) - runtlen; + if (remaining < rrem || blkoff < rrem) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* fill in the runt data */ + if (skb_copy_bits_seq(st, data, &xtfs->ra_runt[runtlen], + rrem)) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + + /* We have enough data to get the ip length value now, + * allocate an in progress skb + */ + ipremain = __iptfs_iplen(xtfs->ra_runt); + if (ipremain < sizeof(xtfs->ra_runt)) { + /* length has to be at least runtsize large */ + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* For the runt case we don't attempt sharing currently. NOTE: + * Currently, this IPTFS implementation will not create runts. + */ + + newskb = iptfs_alloc_skb(skb, ipremain, false); + if (!newskb) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINERROR); + goto abandon; + } + xtfs->ra_newskb = newskb; + + /* Copy the runt data into the buffer, but leave data + * pointers the same as normal non-runt case. The extra `rrem` + * recopied bytes are basically cacheline free. Allows using + * same logic below to complete. + */ + memcpy(skb_put(newskb, runtlen), xtfs->ra_runt, + sizeof(xtfs->ra_runt)); + } + + /* Continue reassembling the packet */ + ipremain = __iptfs_iplen(newskb->data); + iphlen = __iptfs_iphlen(newskb->data); + + /* Sanity check, we created the newskb knowing the IP length so the IP + * length can't now be shorter. + */ + BUG_ON(newskb->len > ipremain); + + ipremain -= newskb->len; + if (blkoff < ipremain) { + /* Corrupt data, we don't have enough to complete the packet */ + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* We want the IP header in linear space */ + if (newskb->len < iphlen) { + iphremain = iphlen - newskb->len; + if (blkoff < iphremain) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + fraglen = min(blkoff, remaining); + copylen = min(fraglen, iphremain); + BUG_ON(skb_tailroom(newskb) < copylen); + if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + /* this is a silly condition that might occur anyway */ + if (copylen < iphremain) { + xtfs->ra_wantseq++; + return data + fraglen; + } + /* update data and things derived from it */ + data += copylen; + blkoff -= copylen; + remaining -= copylen; + ipremain -= copylen; + } + + fraglen = min(blkoff, remaining); + copylen = min(fraglen, ipremain); + + /* We verified this was true in the main receive routine */ + BUG_ON(skb_tailroom(newskb) < copylen); + + /* copy fragment data into newskb */ + if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + + if (copylen < ipremain) { + xtfs->ra_wantseq++; + } else { + /* We are done with packet reassembly! */ + BUG_ON(copylen != ipremain); + iptfs_reassem_done(xtfs); + iptfs_complete_inner_skb(xtfs->x, newskb); + list_add_tail(&newskb->list, list); + } + + /* will continue on to new data block or end */ + return data + fraglen; + +abandon: + if (xtfs->ra_newskb) { + iptfs_reassem_abort(xtfs); + } else { + xtfs->ra_runtlen = 0; + xtfs->ra_wantseq = 0; + } + /* skip past fragment, maybe to end */ + return data + min(blkoff, remaining); +} + /** * iptfs_input() - handle receipt of iptfs payload * @x: xfrm state @@ -286,7 +587,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph; struct net *net; u32 remaining, iplen, iphlen, data, tail; - u32 blkoff; + u32 blkoff, capturelen; u64 seq; xtfs = x->mode_data; @@ -332,12 +633,27 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) INIT_LIST_HEAD(&sublist); - /* Fragment handling in following commits */ + /* Handle fragment at start of payload, and/or waiting reassembly. */ + blkoff = ntohs(ipth->block_offset); - data += blkoff; + /* check before locking i.e., maybe */ + if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { + spin_lock(&xtfs->drop_lock); + + /* check again after lock */ + if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { + data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data, + blkoff, &sublist); + } + + spin_unlock(&xtfs->drop_lock); + } /* New packets */ + tail = skb->len; + BUG_ON(xtfs->ra_newskb && data < tail); + while (data < tail) { __be16 protocol = 0; @@ -356,8 +672,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) iph = (struct iphdr *)hbytes; if (iph->version == 0x4) { /* must have at least tot_len field present */ - if (remaining < 4) + if (remaining < 4) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; break; + } iplen = be16_to_cpu(iph->tot_len); iphlen = iph->ihl << 2; @@ -365,8 +686,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = iph->tos; } else if (iph->version == 0x6) { /* must have at least payload_len field present */ - if (remaining < 6) + if (remaining < 6) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; break; + } iplen = be16_to_cpu( ((struct ipv6hdr *)hbytes)->payload_len); @@ -377,6 +703,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) ipv6_get_dsfield((struct ipv6hdr *)iph); } else if (iph->version == 0x0) { /* pad */ + data = tail; break; } else { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); @@ -396,16 +723,14 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) if (!first_skb) first_skb = skb; - /* Fragment handling in following commits */ - if (iplen > remaining) - break; - - skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, iplen); + capturelen = min(iplen, remaining); + skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, capturelen); if (!skb) { /* skip to next packet or done */ - data += iplen; + data += capturelen; continue; } + BUG_ON(skb->len != capturelen); skb->protocol = protocol; if (old_mac) { @@ -416,11 +741,38 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) eth_hdr(skb)->h_proto = skb->protocol; } - data += iplen; + data += capturelen; + + if (skb->len < iplen) { + BUG_ON(data != tail); + BUG_ON(xtfs->ra_newskb); + + /* Start reassembly */ + spin_lock(&xtfs->drop_lock); + + xtfs->ra_newskb = skb; + xtfs->ra_wantseq = seq + 1; + if (!hrtimer_is_queued(&xtfs->drop_timer)) { + /* softirq blocked lest the timer fire and interrupt us */ + BUG_ON(!in_interrupt()); + hrtimer_start(&xtfs->drop_timer, + xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); + } + + spin_unlock(&xtfs->drop_lock); + + break; + } + iptfs_complete_inner_skb(x, skb); list_add_tail(&skb->list, &sublist); } + if (data != tail) + /* this should not happen from the above code */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); + /* Send the packets! */ list_for_each_entry_safe(skb, next, &sublist, list) { skb_list_del_init(skb); @@ -448,6 +800,45 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) return -EINPROGRESS; } +/** + * iptfs_drop_timer() - Handle drop timer expiry. + * @me: the timer + * + * This is similar to our input function. + * + * The drop timer is set when we start an in progress reassembly, and also when + * we save a future packet in the window saved array. + * + * NOTE packets in the save window are always newer WRT drop times as + * they get further in the future. i.e. for: + * + * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`, + * then D(n-1) <= D(n). + * + * So, regardless of why the timer is firing we can always discard any inprogress + * fragment; either it's the reassembly timer, or slot 0 is going to be + * dropped as S0 must have the most recent drop time, and slot 0 holds the + * continuation fragment of the in progress packet. + */ +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) +{ + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; + + xtfs = container_of(me, typeof(*xtfs), drop_timer); + x = xtfs->x; + + /* Drop any in progress packet */ + spin_lock(&xtfs->drop_lock); + if (xtfs->ra_newskb) { + kfree_skb(xtfs->ra_newskb); + xtfs->ra_newskb = NULL; + } + spin_unlock(&xtfs->drop_lock); + + return HRTIMER_NORESTART; +} + /* ================================= */ /* IPTFS Sending (ingress) Functions */ /* ================================= */ @@ -1255,6 +1646,8 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize; xtfs->init_delay_ns = (u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC; + xtfs->drop_time_ns = + (u64)net->xfrm.sysctl_iptfs_drop_time * NSECS_IN_USEC; if (attrs[XFRMA_IPTFS_DONT_FRAG]) xc->dont_frag = true; @@ -1272,6 +1665,10 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, } if (attrs[XFRMA_IPTFS_MAX_QSIZE]) xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); + if (attrs[XFRMA_IPTFS_DROP_TIME]) + xtfs->drop_time_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) * + NSECS_IN_USEC; if (attrs[XFRMA_IPTFS_INIT_DELAY]) xtfs->init_delay_ns = (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * @@ -1288,7 +1685,9 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) struct xfrm_iptfs_config *xc = &xtfs->cfg; unsigned int l = 0; - if (x->dir == XFRM_SA_DIR_OUT) { + if (x->dir == XFRM_SA_DIR_IN) { + l += nla_total_size(sizeof(u32)); /* drop time usec */ + } else { if (xc->dont_frag) l += nla_total_size(0); /* dont-frag flag */ l += nla_total_size(sizeof(u32)); /* init delay usec */ @@ -1306,7 +1705,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) int ret; u64 q; - if (x->dir == XFRM_SA_DIR_OUT) { + if (x->dir == XFRM_SA_DIR_IN) { + q = xtfs->drop_time_ns; + (void)do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); + } else { if (xc->dont_frag) { ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); if (ret) @@ -1337,6 +1740,10 @@ static int __iptfs_init_state(struct xfrm_state *x, hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); xtfs->iptfs_timer.function = iptfs_delay_timer; + spin_lock_init(&xtfs->drop_lock); + hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + xtfs->drop_timer.function = iptfs_drop_timer; + /* Modify type (esp) adjustment values */ if (x->props.family == AF_INET) @@ -1364,6 +1771,8 @@ static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) if (!xtfs) return -ENOMEM; + xtfs->ra_newskb = NULL; + err = __iptfs_init_state(x, xtfs); if (err) return err; @@ -1394,7 +1803,14 @@ static void iptfs_delete_state(struct xfrm_state *x) if (!xtfs) return; + spin_lock_bh(&xtfs->drop_lock); hrtimer_cancel(&xtfs->iptfs_timer); + hrtimer_cancel(&xtfs->drop_timer); + spin_unlock_bh(&xtfs->drop_lock); + + if (xtfs->ra_newskb) + kfree_skb(xtfs->ra_newskb); + kfree_sensitive(xtfs); module_put(x->mode_cbs->owner);

[ipsec-next,v5,13/17] xfrm: iptfs: handle received fragmented inner packets

Checks

Commit Message

Patch