[net-next,2/2] act_mirred: use the backlog for nested calls to mirred ingress

Message ID 5fdd584d53f0807f743c07b1c0381cf5649495cd.1674233458.git.dcaratti@redhat.com (mailing list archive)
State Accepted
Commit ca22da2fbd693b54dc8e3b7b54ccc9f7e9ba3640
Delegated to: Netdev Maintainers
Headers show
Series net/sched: use the backlog for nested mirred ingress | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 7 maintainers not CCed: edumazet@google.com idosch@nvidia.com linux-kselftest@vger.kernel.org davem@davemloft.net kuba@kernel.org shuah@kernel.org vladimir.oltean@nxp.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success net selftest script(s) already in Makefile
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 80 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Davide Caratti Jan. 20, 2023, 5:01 p.m. UTC
William reports kernel soft-lockups on some OVS topologies when TC mirred
egress->ingress action is hit by local TCP traffic [1].
The same can also be reproduced with SCTP (thanks Xin for verifying), when
client and server reach each other through mirred egress to ingress, and
one of the two peers sends a "heartbeat" packet (from within a timer).

Enqueueing to the backlog proved to fix this soft lockup; however, as Cong
noticed [2], we should preserve - when possible - the current mirred
behavior that counts any packet drop subsequent to the mirred forwarding
action as "overlimits" [3]. A compromise solution is to use the backlog
only when tcf_mirred_act() has a nest level greater than one: change
tcf_mirred_forward() accordingly.

Also, add a kselftest that can reproduce the lockup and verifies TC
mirred's ability to account for further packet drops after TC mirred
egress->ingress (when the nest level is 1).

 [1] https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.1663945716.git.dcaratti@redhat.com/
 [2] https://lore.kernel.org/netdev/Y0w%2FWWY60gqrtGLp@pop-os.localdomain/
 [3] such behavior is not guaranteed: for example, if RPS or skb RX
     timestamping is enabled on the mirred target device, the kernel
     can defer receiving the skb and return NET_RX_SUCCESS inside
     tcf_mirred_forward().

Reported-by: William Zhao <wizhao@redhat.com>
CC: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_mirred.c                        |  7 +++
 .../selftests/net/forwarding/tc_actions.sh    | 49 ++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)
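
For context: the nest level mentioned in the commit message is tracked by a
per-CPU counter, which the is_mirred_nested() helper in the patch below
reads. A condensed, non-verbatim sketch of how tcf_mirred_act() brackets
the forwarding step (mirred specifics and error handling elided):

static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
#define MIRRED_NEST_LIMIT 4

static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res)
{
	unsigned int nest_level = __this_cpu_inc_return(mirred_nest_level);
	bool want_ingress;
	int ret;

	if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
		/* too many chained mirred actions: drop instead of recursing */
		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
				     netdev_name(skb->dev));
		ret = TC_ACT_SHOT;
		goto dec_nest_level;
	}

	/* ... look up the target device, clone or retain the skb, and
	 * compute want_ingress from the action parameters (elided) ...
	 */

	/* with nest_level > 1, tcf_mirred_forward() uses netif_rx() */
	ret = tcf_mirred_forward(want_ingress, skb);

dec_nest_level:
	__this_cpu_dec(mirred_nest_level);
	return ret;
}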

Comments

Marcelo Ricardo Leitner Jan. 23, 2023, 5:22 p.m. UTC | #1
On Fri, Jan 20, 2023 at 06:01:40PM +0100, Davide Caratti wrote:
> William reports kernel soft-lockups on some OVS topologies when TC mirred
> egress->ingress action is hit by local TCP traffic [1].
> The same can also be reproduced with SCTP (thanks Xin for verifying), when
> client and server reach themselves through mirred egress to ingress, and
> one of the two peers sends a "heartbeat" packet (from within a timer).
> 
> Enqueueing to backlog proved to fix this soft lockup; however, as Cong
> noticed [2], we should preserve - when possible - the current mirred
> behavior that counts as "overlimits" any eventual packet drop subsequent to
> the mirred forwarding action [3]. A compromise solution might use the
> backlog only when tcf_mirred_act() has a nest level greater than one:
> change tcf_mirred_forward() accordingly.
> 
> Also, add a kselftest that can reproduce the lockup and verifies TC mirred
> ability to account for further packet drops after TC mirred egress->ingress
> (when the nest level is 1).
> 
>  [1] https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.1663945716.git.dcaratti@redhat.com/
>  [2] https://lore.kernel.org/netdev/Y0w%2FWWY60gqrtGLp@pop-os.localdomain/
>  [3] such behavior is not guaranteed: for example, if RPS or skb RX
>      timestamping is enabled on the mirred target device, the kernel
>      can defer receiving the skb and return NET_RX_SUCCESS inside
>      tcf_mirred_forward().
> 
> Reported-by: William Zhao <wizhao@redhat.com>
> CC: Xin Long <lucien.xin@gmail.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>

Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Jamal Hadi Salim Jan. 23, 2023, 7:41 p.m. UTC | #2
On Mon, Jan 23, 2023 at 12:22 PM Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
>
> On Fri, Jan 20, 2023 at 06:01:40PM +0100, Davide Caratti wrote:
> > William reports kernel soft-lockups on some OVS topologies when TC mirred
> > egress->ingress action is hit by local TCP traffic [1].
> > The same can also be reproduced with SCTP (thanks Xin for verifying), when
> > client and server reach themselves through mirred egress to ingress, and
> > one of the two peers sends a "heartbeat" packet (from within a timer).
> >
> > Enqueueing to backlog proved to fix this soft lockup; however, as Cong
> > noticed [2], we should preserve - when possible - the current mirred
> > behavior that counts as "overlimits" any eventual packet drop subsequent to
> > the mirred forwarding action [3]. A compromise solution might use the
> > backlog only when tcf_mirred_act() has a nest level greater than one:
> > change tcf_mirred_forward() accordingly.
> >
> > Also, add a kselftest that can reproduce the lockup and verifies TC mirred
> > ability to account for further packet drops after TC mirred egress->ingress
> > (when the nest level is 1).
> >
> >  [1] https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.1663945716.git.dcaratti@redhat.com/
> >  [2] https://lore.kernel.org/netdev/Y0w%2FWWY60gqrtGLp@pop-os.localdomain/
> >  [3] such behavior is not guaranteed: for example, if RPS or skb RX
> >      timestamping is enabled on the mirred target device, the kernel
> >      can defer receiving the skb and return NET_RX_SUCCESS inside
> >      tcf_mirred_forward().
> >
> > Reported-by: William Zhao <wizhao@redhat.com>
> > CC: Xin Long <lucien.xin@gmail.com>
> > Signed-off-by: Davide Caratti <dcaratti@redhat.com>
>
> Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>

cheers,
jamal
Jamal Hadi Salim Dec. 4, 2023, 8:24 p.m. UTC | #3
On Fri, Jan 20, 2023 at 12:02 PM Davide Caratti <dcaratti@redhat.com> wrote:
>
> William reports kernel soft-lockups on some OVS topologies when TC mirred
> egress->ingress action is hit by local TCP traffic [1].
> The same can also be reproduced with SCTP (thanks Xin for verifying), when
> client and server reach themselves through mirred egress to ingress, and
> one of the two peers sends a "heartbeat" packet (from within a timer).
>
> Enqueueing to backlog proved to fix this soft lockup; however, as Cong
> noticed [2], we should preserve - when possible - the current mirred
> behavior that counts as "overlimits" any eventual packet drop subsequent to
> the mirred forwarding action [3]. A compromise solution might use the
> backlog only when tcf_mirred_act() has a nest level greater than one:
> change tcf_mirred_forward() accordingly.
>
> Also, add a kselftest that can reproduce the lockup and verifies TC mirred
> ability to account for further packet drops after TC mirred egress->ingress
> (when the nest level is 1).
>

I am afraid this broke things. Here's a simple use case which causes
an infinite loop (that we found while testing blockcasting but
simplified to demonstrate the issue):

----
sudo ip netns add p4node
sudo ip link add p4port0 address 10:00:00:01:AA:BB type veth peer port0 address 10:00:00:02:AA:BB
sudo ip link set dev port0 netns p4node
sudo ip a add 10.0.0.1/24 dev p4port0
sudo ip neigh add 10.0.0.2 dev p4port0 lladdr 10:00:00:02:aa:bb
sudo ip netns exec p4node ip a add 10.0.0.2/24 dev port0
sudo ip netns exec p4node ip l set dev port0 up
sudo ip l set dev p4port0 up
sudo ip netns exec p4node tc qdisc add dev port0 clsact
sudo ip netns exec p4node tc filter add dev port0 ingress protocol ip prio 10 matchall action mirred ingress redirect dev port0

ping -I p4port0 10.0.0.2 -c 1
-----

reverting the patch fixes things and it gets caught by the nested
recursion check.
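
A condensed view of why this one never terminates once the backlog is in
the picture: deferring to netif_rx() unwinds the call stack, so the per-CPU
nest counter drops back to zero before the packet is processed again, and
the recursion limit is never hit. Simplified call chain:

/*
 * netif_receive_skb(port0)          ingress filter matches
 *   tcf_mirred_act()                nest_level = 1
 *     netif_receive_skb(port0)      same filter matches again
 *       tcf_mirred_act()            nest_level = 2 -> nested
 *         netif_rx(skb)             skb queued to the CPU backlog; both
 *                                   mirred calls return, nest_level
 *                                   unwinds to 0
 * ...backlog drained...
 * netif_receive_skb(port0)          the cycle restarts: infinite loop
 */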

Frankly, I believe we should restore a proper ttl from what was removed here:
https://lore.kernel.org/all/1430765318-13788-1-git-send-email-fw@strlen.de/
The headaches (and time consumed) trying to save the 3-4 bits by removing
the ttl field are not worth it imo.

cheers,
jamal
Davide Caratti Dec. 5, 2023, 10:54 a.m. UTC | #4
hello Jamal, thanks for looking at this!

On Mon, Dec 4, 2023 at 9:24 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Fri, Jan 20, 2023 at 12:02 PM Davide Caratti <dcaratti@redhat.com> wrote:
> >
> > William reports kernel soft-lockups on some OVS topologies when TC mirred
> > egress->ingress action is hit by local TCP traffic [1].
> > The same can also be reproduced with SCTP (thanks Xin for verifying), when
> > client and server reach themselves through mirred egress to ingress, and
> > one of the two peers sends a "heartbeat" packet (from within a timer).

[...]

> I am afraid this broke things. Here's a simple use case which causes
> an infinite loop (that we found while testing blockcasting but
> simplified to demonstrate the issue):

[...]

> sudo ip netns exec p4node tc qdisc add dev port0 clsact
> sudo ip netns exec p4node tc filter add dev port0 ingress protocol ip
> prio 10 matchall action mirred ingress redirect dev port0

the above rule is taking packets from port0 ingress and putting them
again in the mirred ingress of the same device, hence the loop.
I don't see it much different than what we can obtain with bridges:

# ip link add name one type veth peer name two
# ip link add name three type veth peer name four
# for n in even odd; do ip link add name $n type bridge; done
# for n in one two three four even odd; do ip link set dev $n up; done
# for n in one three; do ip link set dev $n master odd; done
# for n in two four; do ip link set dev $n master even; done

there is a practical difference: with bridges we have protocols (like
STP) that can detect and act upon loops - while TC mirred needs some
facility on top (not 100% sure, but the same might also apply to
similar tools, such as users of the bpf_redirect() helper)

> reverting the patch fixes things and it gets caught by the nested
> recursion check.

the price of that revert is: we'll see those soft-lockups again with
L4 protocols when peers communicate through mirred egress -> ingress.
And even if it would fix mirred egress->ingress loops, we would still
suffer from soft-lockups (on the qdisc root lock) when the same rule
is done with mirred egress (see an example at
https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782690200)
[1]

> Frankly, I believe we should restore a proper ttl from what was removed here:
> https://lore.kernel.org/all/1430765318-13788-1-git-send-email-fw@strlen.de/
> The headaches(and time consumed) trying to save the 3-4 bits removing
> the ttl field is not worth it imo.

TTL would protect us against loops when they are on the same node:
what do you think about inserting a rule that detects BPDU before the
mirred ingress rule?

thanks!

[1] by the way: the POC patch at
https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782654075
silences false lockdep warnings, and it preserves the splat when the
real deadlock happens with TC mirred egress. If you agree I will send
it soon to this ML for review.
Jamal Hadi Salim Dec. 5, 2023, 3:12 p.m. UTC | #5
On Tue, Dec 5, 2023 at 5:54 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> hello Jamal, thanks for looking at this!
>
> On Mon, Dec 4, 2023 at 9:24 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On Fri, Jan 20, 2023 at 12:02 PM Davide Caratti <dcaratti@redhat.com> wrote:
> > >
> > > William reports kernel soft-lockups on some OVS topologies when TC mirred
> > > egress->ingress action is hit by local TCP traffic [1].
> > > The same can also be reproduced with SCTP (thanks Xin for verifying), when
> > > client and server reach themselves through mirred egress to ingress, and
> > > one of the two peers sends a "heartbeat" packet (from within a timer).
>
> [...]
>
> > I am afraid this broke things. Here's a simple use case which causes
> > an infinite loop (that we found while testing blockcasting but
> > simplified to demonstrate the issue):
>
> [...]
>
> > sudo ip netns exec p4node tc qdisc add dev port0 clsact
> > sudo ip netns exec p4node tc filter add dev port0 ingress protocol ip
> > prio 10 matchall action mirred ingress redirect dev port0
>
> the above rule is taking packets from port0 ingress and putting them
> again in the mirred ingress of the same device, hence the loop.

Right - that was intentional to show the loop. We are worrying about
extending mirred now to also broadcast (see the blockcast discussion)
to more ports making the loop even worse. The loop should terminate at
some point - in this case it does not...

> I don't see it much different than what we can obtain with bridges:
>
> # ip link add name one type veth peer name two
> # ip link add name three type veth peer name four
> # for n in even odd; do ip link add name $n type bridge; done
> # for n in one two three four even odd; do ip link set dev $n up; done
> # for n in one three; do ip link set dev $n master odd; done
> # for n in two four; do ip link set dev $n master even; done
>

Sure that is another way to reproduce.

> there is a practical difference: with bridges we have protocols (like
> STP) that can detect and act-upon loops - while TC mirred needs some
> facility on top (not 100% sure, but the same might also apply to
> similar tools, such as users of bpf_redirect() helper)
>

I don't think we can run something equivalent inside the kernel. The
ttl worked fine. BTW, the example shown breaks even when you have
everything running on a single cpu (and packets being queued on the
backlog)

> > reverting the patch fixes things and it gets caught by the nested
> > recursion check.
>
> the price of that revert is: we'll see those soft-lockups again with
> L4 protocols when peers communicate through mirred egress -> ingress.
>
> And even if it would fix mirred egress->ingress loops, we would still
> suffer from soft-lockups (on the qdisc root lock) when the same rule
> is done with mirred egress (see an example at
> https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782690200)
> [1]

Yes, we need to make sure those are fixed with whatever replacement.
The loops will happen even on egress->egress (the example only showed
ingress->ingress).

We will try restoring the ttl and see if it continues to work with
your patch intact... unless there are other ideas.

> > Frankly, I believe we should restore a proper ttl from what was removed here:
> > https://lore.kernel.org/all/1430765318-13788-1-git-send-email-fw@strlen.de/
> > The headaches(and time consumed) trying to save the 3-4 bits removing
> > the ttl field is not worth it imo.
>
> TTL would protect us against loops when they are on the same node:
> what do you think about inserting a rule that detects BPDU before the
> mirred ingress rule?

I don't think running STP will save us from this, unless I am mistaken.
This happens within the kernel before the packet hits the "wire".
Besides, this happens without using any bridging.

> [1] by the way: the POC patch at
> https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782654075
> silences false lockdep warnings, and it preserves the splat when the
> real deadlock happens with TC mirred egress. If you agree I will send
> it soon to this ML for review.

please do.

cheers,
jamal
Jamal Hadi Salim Dec. 7, 2023, 2:10 p.m. UTC | #6
Hi Davide,


On Tue, Dec 5, 2023 at 10:12 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Tue, Dec 5, 2023 at 5:54 AM Davide Caratti <dcaratti@redhat.com> wrote:
> >
> > hello Jamal, thanks for looking at this!
> >
> > On Mon, Dec 4, 2023 at 9:24 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > On Fri, Jan 20, 2023 at 12:02 PM Davide Caratti <dcaratti@redhat.com> wrote:
> > > >
> > > > William reports kernel soft-lockups on some OVS topologies when TC mirred
> > > > egress->ingress action is hit by local TCP traffic [1].
> > > > The same can also be reproduced with SCTP (thanks Xin for verifying), when
> > > > client and server reach themselves through mirred egress to ingress, and
> > > > one of the two peers sends a "heartbeat" packet (from within a timer).
> >
> > [...]
> >
> > > I am afraid this broke things. Here's a simple use case which causes
> > > an infinite loop (that we found while testing blockcasting but
> > > simplified to demonstrate the issue):
> >
> > [...]
> >
> > > sudo ip netns exec p4node tc qdisc add dev port0 clsact
> > > sudo ip netns exec p4node tc filter add dev port0 ingress protocol ip
> > > prio 10 matchall action mirred ingress redirect dev port0
> >
> > the above rule is taking packets from port0 ingress and putting them
> > again in the mirred ingress of the same device, hence the loop.
>
> Right - that was intentional to show the loop. We are worrying about
> extending mirred now to also broadcast (see the blockcast discussion)
> to more ports making the loop even worse. The loop should terminate at
> some point - in this case it does not...
>
> > I don't see it much different than what we can obtain with bridges:
> >
> > # ip link add name one type veth peer name two
> > # ip link add name three type veth peer name four
> > # for n in even odd; do ip link add name $n type bridge; done
> > # for n in one two three four even odd; do ip link set dev $n up; done
> > # for n in one three; do ip link set dev $n master odd; done
> > # for n in two four; do ip link set dev $n master even; done
> >
>
> Sure that is another way to reproduce.

OK, so I can verify that re-introducing the ttl field in the
skb [1] fixes the issue. But restoring that patch may cause too much
bikeshedding; Victor will work on a better approach using the cb
struct instead. Are you able to test with/without your patch
and see if this same patch fixes it?

cheers,
jamal

[1] https://lore.kernel.org/all/1430765318-13788-1-git-send-email-fw@strlen.de/
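
For illustration, a hypothetical sketch of such a cb-based loop breaker: a
per-skb redirect budget kept in the tc control block and decremented on
every mirred hand-off. Struct, field and limit names here are assumptions
for discussion, not Victor's actual patch:

#define TC_REDIRECT_TTL 4	/* assumed budget, playing the old ttl role */

struct tc_mirred_skb_cb {	/* assumed layout, stacked on qdisc_skb_cb */
	struct qdisc_skb_cb base;
	u8 redirect_ttl;	/* relies on these bytes surviving, and being
				 * zero, while the skb bounces between mirred
				 * calls - part of what the real patch must
				 * guarantee
				 */
};

static struct tc_mirred_skb_cb *tc_mirred_skb_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct tc_mirred_skb_cb) >
		     sizeof_field(struct sk_buff, cb));
	return (struct tc_mirred_skb_cb *)skb->cb;
}

/* called by mirred before forwarding; true means drop to break the loop */
static bool tc_mirred_ttl_exceeded(struct sk_buff *skb)
{
	struct tc_mirred_skb_cb *cb = tc_mirred_skb_cb(skb);

	if (cb->redirect_ttl >= TC_REDIRECT_TTL)
		return true;
	cb->redirect_ttl++;
	return false;
}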

> > there is a practical difference: with bridges we have protocols (like
> > STP) that can detect and act-upon loops - while TC mirred needs some
> > facility on top (not 100% sure, but the same might also apply to
> > similar tools, such as users of bpf_redirect() helper)
> >
>
> I don't think we can run something equivalent inside the kernel. The
> ttl worked fine. BTW, the example shown breaks even when you have
> everything running on a single cpu (and packets being queued on the
> backlog)
>
> > > reverting the patch fixes things and it gets caught by the nested
> > > recursion check.
> >
> > the price of that revert is: we'll see those soft-lockups again with
> > L4 protocols when peers communicate through mirred egress -> ingress.
> >
> > And even if it would fix mirred egress->ingress loops, we would still
> > suffer from soft-lockups (on the qdisc root lock) when the same rule
> > is done with mirred egress (see an example at
> > https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782690200)
> > [1]
>
> Yes, we need to make sure those are fixed with whatever replacement..
> The loops will happen even on egress->egress (the example only showed
> ingress-ingress).
>
> We will try restoring the ttl and see if it continues to work with
> your patch intact... unless there are other ideas.
>
> > > Frankly, I believe we should restore a proper ttl from what was removed here:
> > > https://lore.kernel.org/all/1430765318-13788-1-git-send-email-fw@strlen.de/
> > > The headaches(and time consumed) trying to save the 3-4 bits removing
> > > the ttl field is not worth it imo.
> >
> > TTL would protect us against loops when they are on the same node:
> > what do you think about inserting a rule that detects BPDU before the
> > mirred ingress rule?
>
> I don't think running STP will save us from this, unless I am mistaken.
> This happens within the kernel before the packet hits the "wire".
> Besides this happens without using any bridging.
>
> > [1] by the way: the POC patch at
> > https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782654075
> silences false lockdep warnings, and it preserves the splat when the
> real deadlock happens with TC mirred egress. If you agree I will send
> > it soon to this ML for review.
>
> please do.
>
> cheers,
> jamal
Davide Caratti Dec. 11, 2023, 1:07 p.m. UTC | #7
hello Jamal,

On Thu, Dec 07, 2023 at 09:10:13AM -0500, Jamal Hadi Salim wrote:
> Hi Davide,
>

[...]
 
> > > > I am afraid this broke things. Here's a simple use case which causes
> > > > an infinite loop (that we found while testing blockcasting but
> > > > simplified to demonstrate the issue):
> > >
> > > [...]
> > >
> > > > sudo ip netns exec p4node tc qdisc add dev port0 clsact
> > > > sudo ip netns exec p4node tc filter add dev port0 ingress protocol ip
> > > > prio 10 matchall action mirred ingress redirect dev port0
> > >
> > > the above rule is taking packets from port0 ingress and putting them
> > > again in the mirred ingress of the same device, hence the loop.
> >
> > Right - that was intentional to show the loop. We are worrying about
> > extending mirred now to also broadcast (see the blockcast discussion)
> > to more ports making the loop even worse. The loop should terminate at
> > some point - in this case it does not...
> >
> > > I don't see it much different than what we can obtain with bridges:
> > >
> > > # ip link add name one type veth peer name two
> > > # ip link add name three type veth peer name four
> > > # for n in even odd; do ip link add name $n type bridge; done
> > > # for n in one two three four even odd; do ip link set dev $n up; done
> > > # for n in one three; do ip link set dev $n master odd; done
> > > # for n in two four; do ip link set dev $n master even; done
> > >
> >
> > Sure that is another way to reproduce.
> 
> OK, so I can verify that re-introducing the ttl field in the
> skb [1] fixes the issue. But restoring that patch may cause too much
> bikeshedding; Victor will work on a better approach using the cb
> struct instead. Are you able to test with/without your patch
> and see if this same patch fixes it?

I'm also more optimistic on the use of qdisc cb for that purpose :)
Just share the code, I will be happy to review/test.
With regular TC mirred, the deadlock happened with TCP and SCTP socket
locks, as they were sending an ACK back for a packet that was sent by
the peer using egress->ingress.

AFAIR there is a small reproducer in tc_actions.sh kselftest, namely

mirred_egress_to_ingress_tcp_test()

maybe it's useful for pre-verification also.

[...]

my 2 cents  below:

> > I don't think we can run something equivalent inside the kernel. The
> > ttl worked fine. BTW, the example shown breaks even when you have
> > everything running on a single cpu (and packets being queued on the
> > backlog)

[...]

> > Yes, we need to make sure those are fixed with whatever replacement..
> > The loops will happen even on egress->egress (the example only showed
> > ingress-ingress).

if you try to make a loop using mirred egress/redirect, the first packet
will trigger a deadlock on the root qdisc lock - see [1]. It's worse
than a loop, because the user can't fix it by just removing the "offending"
mirred action. Would the ttl be helpful here?

(in the meantime, I will try to figure out if it's possible at least to
silence false lockdep warnings without using dynamic keys, as per
Eric's reply).

TIA!

[1] https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782690200
Jamal Hadi Salim Dec. 11, 2023, 3:50 p.m. UTC | #8
Hi Davide,

On Mon, Dec 11, 2023 at 8:08 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> hello Jamal,
>
> On Thu, Dec 07, 2023 at 09:10:13AM -0500, Jamal Hadi Salim wrote:
> > Hi Davide,
> >
>
> [...]
>
> > > > > I am afraid this broke things. Here's a simple use case which causes
> > > > > an infinite loop (that we found while testing blockcasting but
> > > > > simplified to demonstrate the issue):
> > > >
> > > > [...]
> > > >
> > > > > sudo ip netns exec p4node tc qdisc add dev port0 clsact
> > > > > sudo ip netns exec p4node tc filter add dev port0 ingress protocol ip
> > > > > prio 10 matchall action mirred ingress redirect dev port0
> > > >
> > > > the above rule is taking packets from port0 ingress and putting them
> > > > again in the mirred ingress of the same device, hence the loop.
> > >
> > > Right - that was intentional to show the loop. We are worrying about
> > > extending mirred now to also broadcast (see the blockcast discussion)
> > > to more ports making the loop even worse. The loop should terminate at
> > > some point - in this case it does not...
> > >
> > > > I don't see it much different than what we can obtain with bridges:
> > > >
> > > > # ip link add name one type veth peer name two
> > > > # ip link add name three type veth peer name four
> > > > # for n in even odd; do ip link add name $n type bridge; done
> > > > # for n in one two three four even odd; do ip link set dev $n up; done
> > > > # for n in one three; do ip link set dev $n master odd; done
> > > > # for n in two four; do ip link set dev $n master even; done
> > > >
> > >
> > > Sure that is another way to reproduce.
> >
> > OK, so I can verify that re-introducing the ttl field in the
> > skb [1] fixes the issue. But restoring that patch may cause too much
> > bikeshedding; Victor will work on a better approach using the cb
> > struct instead. Are you able to test with/without your patch
> > and see if this same patch fixes it?
>
> I'm also more optimistic on the use of qdisc cb for that purpose :)
> Just share the code, I will be happy to review/test.
> With regular TC mirred, the deadlock happened with TCP and SCTP socket
> locks, as they were sending an ACK back for a packet that was sent by
> the peer using egress->ingress.
>
> AFAIR there is a small reproducer in tc_actions.sh kselftest, namely
>
> mirred_egress_to_ingress_tcp_test()
>
> maybe it's useful for pre-verification also.
>

Ah, good - didn't realize there was a reproducer for your use case.
We'll try it out.

> [...]
>
> my 2 cents  below:
>
> > > I don't think we can run something equivalent inside the kernel. The
> > > ttl worked fine. BTW, the example shown breaks even when you have
> > > everything running on a single cpu (and packets being queued on the
> > > backlog)
>
> [...]
>
> > > Yes, we need to make sure those are fixed with whatever replacement..
> > > The loops will happen even on egress->egress (the example only showed
> > > ingress-ingress).
>
> if you try to make a loop using mirred egress/redirect, the first packet
> will trigger a deadlock on the root qdisc lock - see [1]. It's worse
> than a loop, because the user can't fix it by just removing the "offending"
> mirred action. Would the ttl be helpful here?
>

Possible. So the test is just to create a loop?
Let's test with the reproducer you pointed out, then see where we go from there.

> (in the meantime, I will try to figure out if it's possible at least to
> silence false lockdep warnings without using dynamic keys, as per
> Eric's reply).
>

Sorry, I wasn't helpful - I have been in travel mode for the last week.

cheers,
jamal
> TIA!
>
> --
> davide
>
> [1] https://github.com/multipath-tcp/mptcp_net-next/issues/451#issuecomment-1782690200
>
>

Patch

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index c8abb5136491..8037ec9b1d31 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -206,12 +206,19 @@  static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	return err;
 }
 
+static bool is_mirred_nested(void)
+{
+	return unlikely(__this_cpu_read(mirred_nest_level) > 1);
+}
+
 static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
 {
 	int err;
 
 	if (!want_ingress)
 		err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+	else if (is_mirred_nested())
+		err = netif_rx(skb);
 	else
 		err = netif_receive_skb(skb);
 
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 1e0a62f638fe..919c0dd9fe4b 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -3,7 +3,8 @@ 
 
 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
 	mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
-	gact_trap_test mirred_egress_to_ingress_test"
+	gact_trap_test mirred_egress_to_ingress_test \
+	mirred_egress_to_ingress_tcp_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -198,6 +199,52 @@  mirred_egress_to_ingress_test()
 	log_test "mirred_egress_to_ingress ($tcflags)"
 }
 
+mirred_egress_to_ingress_tcp_test()
+{
+	local tmpfile=$(mktemp) tmpfile1=$(mktemp)
+
+	RET=0
+	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
+	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
+		$tcflags ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
+			action ct commit nat src addr 192.0.2.2 pipe \
+			action ct clear pipe \
+			action ct commit nat dst addr 192.0.2.1 pipe \
+			action ct clear pipe \
+			action skbedit ptype host pipe \
+			action mirred ingress redirect dev $h1
+	tc filter add dev $h1 protocol ip pref 101 handle 101 egress flower \
+		$tcflags ip_proto icmp \
+			action mirred ingress redirect dev $h1
+	tc filter add dev $h1 protocol ip pref 102 handle 102 ingress flower \
+		ip_proto icmp \
+			action drop
+
+	ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $tmpfile1  &
+	local rpid=$!
+	ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$tmpfile
+	wait -n $rpid
+	cmp -s $tmpfile $tmpfile1
+	check_err $? "server output check failed"
+
+	$MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \
+		-t icmp "ping,id=42,seq=5" -q
+	tc_check_packets "dev $h1 egress" 101 10
+	check_err $? "didn't mirred redirect ICMP"
+	tc_check_packets "dev $h1 ingress" 102 10
+	check_err $? "didn't drop mirred ICMP"
+	local overlimits=$(tc_rule_stats_get ${h1} 101 egress .overlimits)
+	test ${overlimits} = 10
+	check_err $? "wrong overlimits, expected 10 got ${overlimits}"
+
+	tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
+	tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower
+	tc filter del dev $h1 ingress protocol ip pref 102 handle 102 flower
+
+	rm -f $tmpfile $tmpfile1
+	log_test "mirred_egress_to_ingress_tcp ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}