diff mbox series

[net] net/sched: Add xmit_recursion level in sch_direct_xmit()

Message ID 20240524085108.1430317-1-yuehaibing@huawei.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net] net/sched: Add xmit_recursion level in sch_direct_xmit() | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 3958 this patch: 3958
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang fail Errors and warnings before: 130 this patch: 130
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn fail Errors and warnings before: 4223 this patch: 4223
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 60 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 22 this patch: 22
netdev/source_inline success Was 0 now: 0

Commit Message

Yue Haibing May 24, 2024, 8:51 a.m. UTC
A packet sent from a PF_PACKET socket on top of an IPv6-backed ipvlan device
will hit the WARN_ON_ONCE() in sk_mc_loop() through the sch_direct_xmit() path
when the ipvlan device has a qdisc queue.

WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70
Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper
CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:sk_mc_loop+0x2d/0x70
Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c
RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212
RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001
RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000
RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00
R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000
R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000
FS:  0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 ? __warn+0x83/0x130
 ? sk_mc_loop+0x2d/0x70
 ? report_bug+0x18e/0x1a0
 ? handle_bug+0x3c/0x70
 ? exc_invalid_op+0x18/0x70
 ? asm_exc_invalid_op+0x1a/0x20
 ? sk_mc_loop+0x2d/0x70
 ip6_finish_output2+0x31e/0x590
 ? nf_hook_slow+0x43/0xf0
 ip6_finish_output+0x1f8/0x320
 ? __pfx_ip6_finish_output+0x10/0x10
 ipvlan_xmit_mode_l3+0x22a/0x2a0 [ipvlan]
 ipvlan_start_xmit+0x17/0x50 [ipvlan]
 dev_hard_start_xmit+0x8c/0x1d0
 sch_direct_xmit+0xa2/0x390
 __qdisc_run+0x66/0xd0
 net_tx_action+0x1ca/0x270
 handle_softirqs+0xd6/0x2b0
 __irq_exit_rcu+0x9b/0xc0
 sysvec_apic_timer_interrupt+0x75/0x90
 </IRQ>

Fixes: f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack")
Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
---
 include/linux/netdevice.h | 17 +++++++++++++++++
 net/core/dev.h            | 17 -----------------
 net/sched/sch_generic.c   |  8 +++++---
 3 files changed, 22 insertions(+), 20 deletions(-)

Comments

Eric Dumazet May 24, 2024, 9:24 a.m. UTC | #1
On Fri, May 24, 2024 at 10:49 AM Yue Haibing <yuehaibing@huawei.com> wrote:
>
> packet from PF_PACKET socket ontop of an IPv6-backed ipvlan device will hit
> WARN_ON_ONCE() in sk_mc_loop() through sch_direct_xmit() path while ipvlan
> device has qdisc queue.
>
> WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70
> Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper
> CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> RIP: 0010:sk_mc_loop+0x2d/0x70
> Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c
> RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212
> RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001
> RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000
> RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00
> R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000
> R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000
> FS:  0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  <IRQ>
>  ? __warn+0x83/0x130
>  ? sk_mc_loop+0x2d/0x70
>  ? report_bug+0x18e/0x1a0
>  ? handle_bug+0x3c/0x70
>  ? exc_invalid_op+0x18/0x70
>  ? asm_exc_invalid_op+0x1a/0x20
>  ? sk_mc_loop+0x2d/0x70
>  ip6_finish_output2+0x31e/0x590
>  ? nf_hook_slow+0x43/0xf0
>  ip6_finish_output+0x1f8/0x320
>  ? __pfx_ip6_finish_output+0x10/0x10
>  ipvlan_xmit_mode_l3+0x22a/0x2a0 [ipvlan]
>  ipvlan_start_xmit+0x17/0x50 [ipvlan]
>  dev_hard_start_xmit+0x8c/0x1d0
>  sch_direct_xmit+0xa2/0x390
>  __qdisc_run+0x66/0xd0
>  net_tx_action+0x1ca/0x270
>  handle_softirqs+0xd6/0x2b0
>  __irq_exit_rcu+0x9b/0xc0
>  sysvec_apic_timer_interrupt+0x75/0x90

Please provide full symbols in stack traces.

>  </IRQ>
>
> Fixes: f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack")
> Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
> ---
>  include/linux/netdevice.h | 17 +++++++++++++++++
>  net/core/dev.h            | 17 -----------------
>  net/sched/sch_generic.c   |  8 +++++---
>  3 files changed, 22 insertions(+), 20 deletions(-)

This patch seems unrelated to the WARN_ON_ONCE(1) met in sk_mc_loop()

If sk_mc_loop() is called with a socket which is not inet, we are in trouble.

Please fix the root cause instead of trying to shortcut sk_mc_loop() as you did.
Yue Haibing May 24, 2024, 10:40 a.m. UTC | #2
On 2024/5/24 17:24, Eric Dumazet wrote:
> On Fri, May 24, 2024 at 10:49 AM Yue Haibing <yuehaibing@huawei.com> wrote:
>>
>> packet from PF_PACKET socket ontop of an IPv6-backed ipvlan device will hit
>> WARN_ON_ONCE() in sk_mc_loop() through sch_direct_xmit() path while ipvlan
>> device has qdisc queue.
>>
>> WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70
>> Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper
>> CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>> RIP: 0010:sk_mc_loop+0x2d/0x70
>> Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c
>> RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212
>> RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001
>> RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000
>> RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00
>> R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000
>> R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000
>> FS:  0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0
>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> Call Trace:
>>  <IRQ>
>>  ? __warn+0x83/0x130
>>  ? sk_mc_loop+0x2d/0x70
>>  ? report_bug+0x18e/0x1a0
>>  ? handle_bug+0x3c/0x70
>>  ? exc_invalid_op+0x18/0x70
>>  ? asm_exc_invalid_op+0x1a/0x20
>>  ? sk_mc_loop+0x2d/0x70
>>  ip6_finish_output2+0x31e/0x590
>>  ? nf_hook_slow+0x43/0xf0
>>  ip6_finish_output+0x1f8/0x320
>>  ? __pfx_ip6_finish_output+0x10/0x10
>>  ipvlan_xmit_mode_l3+0x22a/0x2a0 [ipvlan]
>>  ipvlan_start_xmit+0x17/0x50 [ipvlan]
>>  dev_hard_start_xmit+0x8c/0x1d0
>>  sch_direct_xmit+0xa2/0x390
>>  __qdisc_run+0x66/0xd0
>>  net_tx_action+0x1ca/0x270
>>  handle_softirqs+0xd6/0x2b0
>>  __irq_exit_rcu+0x9b/0xc0
>>  sysvec_apic_timer_interrupt+0x75/0x90
> 
> Please provide full symbols in stack traces.

Call Trace:
<IRQ>
? __warn (kernel/panic.c:693)
? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
? report_bug (lib/bug.c:201 lib/bug.c:219)
? handle_bug (arch/x86/kernel/traps.c:239)
? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621)
? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
ip6_finish_output2 (net/ipv6/ip6_output.c:83 (discriminator 1))
? nf_hook_slow (./include/linux/netfilter.h:154 net/netfilter/core.c:626)
ip6_finish_output (net/ipv6/ip6_output.c:211 net/ipv6/ip6_output.c:222)
? __pfx_ip6_finish_output (net/ipv6/ip6_output.c:215)
ipvlan_xmit_mode_l3 (drivers/net/ipvlan/ipvlan_core.c:498 drivers/net/ipvlan/ipvlan_core.c:538 drivers/net/ipvlan/ipvlan_core.c:602) ipvlan
ipvlan_start_xmit (drivers/net/ipvlan/ipvlan_main.c:226) ipvlan
dev_hard_start_xmit (./include/linux/netdevice.h:4882 ./include/linux/netdevice.h:4896 net/core/dev.c:3578 net/core/dev.c:3594)
sch_direct_xmit (net/sched/sch_generic.c:343)
__qdisc_run (net/sched/sch_generic.c:416)
net_tx_action (./include/net/sch_generic.h:219 ./include/net/pkt_sched.h:128 ./include/net/pkt_sched.h:124 net/core/dev.c:5286)
handle_softirqs (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:555)
__irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637)
sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043 arch/x86/kernel/apic/apic.c:1043)

> 
>>  </IRQ>
>>
>> Fixes: f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack")
>> Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
>> ---
>>  include/linux/netdevice.h | 17 +++++++++++++++++
>>  net/core/dev.h            | 17 -----------------
>>  net/sched/sch_generic.c   |  8 +++++---
>>  3 files changed, 22 insertions(+), 20 deletions(-)
> 
> This patch seems unrelated to the WARN_ON_ONCE(1) met in sk_mc_loop()
> 
> If sk_mc_loop() is called with a socket which is not inet, we are in trouble.
> 
> Please fix the root cause instead of trying to shortcut sk_mc_loop() as you did.
First, set up the environment like this:
ip netns add ns0
ip netns add ns1
ip link add ip0 link eth0 type ipvlan mode l3 vepa
ip link add ip1 link eth0 type ipvlan mode l3 vepa
ip link set ip0 netns ns0
ip netns exec ns0 ip link set ip0 up
ip link set ip1 netns ns1
ip netns exec ns1 ip link set ip1 up
ip netns exec ns1 tc qdisc add dev ip1 root netem delay 10ms

Second, build the attached reproducer and run it in ns1 to send a raw IPv6 multicast packet.

packet_sendmsg
   packet_snd //skb->sk is packet sk
      __dev_queue_xmit
         __dev_xmit_skb //q->enqueue is not NULL
             __qdisc_run
                 qdisc_restart
                    sch_direct_xmit
                       dev_hard_start_xmit
                          netdev_start_xmit
                            ipvlan_start_xmit
                              ipvlan_xmit_mode_l3 //l3 mode
                                 ipvlan_process_outbound //vepa flag
                                   ipvlan_process_v6_outbound //skb->protocol is ETH_P_IPV6
                                      ip6_local_out
                                       ...
                                         __ip6_finish_output
                                            ip6_finish_output2 //multicast packet
                                               sk_mc_loop //dev_recursion_level is 0
                                                  WARN_ON_ONCE //sk->sk_family is AF_PACKET

> .
>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>           // uint8_t, uint16_t, uint32_t, uint64_t
#include <unistd.h>           // close()
#include <string.h>           // strcpy, memset(), and memcpy()
#include <netdb.h>            // struct addrinfo
#include <sys/types.h>        // needed for socket(), uint8_t, uint16_t, uint32_t
#include <sys/socket.h>       // needed for socket()
#include <netinet/in.h>       // IPPROTO_TCP, INET6_ADDRSTRLEN
#include <netinet/ip.h>       // IP_MAXPACKET (which is 65535)
#include <netinet/ip6.h>      // struct ip6_hdr
#define __FAVOR_BSD           // Use BSD format of tcp header
#include <netinet/tcp.h>      // struct tcphdr
#include <arpa/inet.h>        // inet_pton() and inet_ntop()
#include <sys/ioctl.h>        // macro ioctl is defined
#include <bits/ioctls.h>      // defines values for argument "request" of ioctl.
#include <net/if.h>           // struct ifreq
#include <linux/if_ether.h>   // ETH_P_IP = 0x0800, ETH_P_IPV6 = 0x86DD
#include <linux/if_packet.h>  // struct sockaddr_ll (see man 7 packet)
#include <net/ethernet.h>
#include <errno.h>            // errno, perror()

// Fixed header sizes, in bytes, used by main() to lay out the outgoing frame.
// ETH_HDRLEN is destination MAC (6) + source MAC (6) + EtherType (2).
#define ETH_HDRLEN 14  // Ethernet header length
#define IP6_HDRLEN 40  // IPv6 header length
#define TCP_HDRLEN 20  // TCP header length, excludes options data

// Forward declarations for the helpers defined after main().

// One's-complement checksum over a buffer; the int is its length in bytes.
uint16_t checksum (uint16_t *, int);
// TCP checksum over the IPv6 pseudo-header plus the fixed TCP header fields.
uint16_t tcp6_checksum (struct ip6_hdr, struct tcphdr);
// Zero-filled array allocators (char / uint8_t / int); the int is the element
// count. They print an error and exit() instead of returning NULL.
char *allocate_strmem (int);
uint8_t *allocate_ustrmem (int);
int *allocate_intmem (int);

// Reproducer entry point.
//
// Builds a single Ethernet + IPv6 + TCP (SYN) frame whose IPv6 destination is
// the multicast address ff00::01 and sends it through a PF_PACKET/SOCK_RAW
// socket on interface "ip1". The destination MAC is set equal to the source
// MAC of that interface.
//
// Command-line arguments are ignored. Returns EXIT_SUCCESS after the frame has
// been handed to sendto(); every failure prints a diagnostic and terminates
// with EXIT_FAILURE.
int main (int argc, char **argv)
{
  int i, status, frame_length, sd, bytes, *tcp_flags;
  char *interface, *target, *src_ip, *dst_ip;
  struct ip6_hdr iphdr;
  struct tcphdr tcphdr;
  uint8_t *src_mac, *dst_mac, *ether_frame;
  struct addrinfo hints, *res;
  struct sockaddr_in6 *ipv6;
  struct sockaddr_ll device;
  struct ifreq ifr;
  void *tmp;

  (void) argc;
  (void) argv;

  // Allocate memory for various arrays.
  src_mac = allocate_ustrmem (6);
  dst_mac = allocate_ustrmem (6);
  ether_frame = allocate_ustrmem (IP_MAXPACKET);
  interface = allocate_strmem (40);
  target = allocate_strmem (INET6_ADDRSTRLEN);
  src_ip = allocate_strmem (INET6_ADDRSTRLEN);
  dst_ip = allocate_strmem (INET6_ADDRSTRLEN);
  tcp_flags = allocate_intmem (8);

  // Interface to send packet through.
  strcpy (interface, "ip1");

  // Submit request for a socket descriptor to look up interface.
  if ((sd = socket (PF_PACKET, SOCK_RAW, htons (ETH_P_ALL))) < 0) {
    perror ("socket() failed to get socket descriptor for using ioctl() ");
    exit (EXIT_FAILURE);
  }

  // Use ioctl() to look up interface name and get its MAC address.
  memset (&ifr, 0, sizeof (ifr));
  snprintf (ifr.ifr_name, sizeof (ifr.ifr_name), "%s", interface);
  if (ioctl (sd, SIOCGIFHWADDR, &ifr) < 0) {
    perror ("ioctl() failed to get source MAC address ");
    close (sd);  // report first (perror needs errno), then release the descriptor
    return (EXIT_FAILURE);
  }
  close (sd);

  // Copy source MAC address.
  memcpy (src_mac, ifr.ifr_hwaddr.sa_data, 6 * sizeof (uint8_t));
  // Set destination MAC address, same as src_mac
  memcpy (dst_mac, src_mac, 6 * sizeof (uint8_t));

  // Start from an all-zero sockaddr_ll so that the fields this program never
  // sets (sll_hatype, sll_pkttype, tail of sll_addr) are not stack garbage.
  memset (&device, 0, sizeof (device));

  // Find interface index from interface name and store index in
  // struct sockaddr_ll device, which will be used as an argument of sendto().
  if ((device.sll_ifindex = if_nametoindex (interface)) == 0) {
    perror ("if_nametoindex() failed to obtain interface index ");
    exit (EXIT_FAILURE);
  }

  strcpy (src_ip, "fe80::5254:0:93d:f416");
  strcpy (target, "ff00::01");

  // Fill out hints for getaddrinfo().
  memset (&hints, 0, sizeof (struct addrinfo));
  hints.ai_family = AF_INET6;
  hints.ai_socktype = SOCK_RAW;
  hints.ai_flags = hints.ai_flags | AI_CANONNAME;

  // Resolve target using getaddrinfo().
  if ((status = getaddrinfo (target, NULL, &hints, &res)) != 0) {
    fprintf (stderr, "getaddrinfo() failed: %s\n", gai_strerror (status));
    exit (EXIT_FAILURE);
  }
  ipv6 = (struct sockaddr_in6 *) res->ai_addr;
  tmp = &(ipv6->sin6_addr);
  if (inet_ntop (AF_INET6, tmp, dst_ip, INET6_ADDRSTRLEN) == NULL) {
    status = errno;
    fprintf (stderr, "inet_ntop() failed.\nError message: %s", strerror (status));
    exit (EXIT_FAILURE);
  }
  freeaddrinfo (res);

  // Fill out sockaddr_ll.
  device.sll_family = AF_PACKET;
  memcpy (device.sll_addr, src_mac, 6 * sizeof (uint8_t));
  // sll_halen is a single byte holding the address length; it must not be
  // passed through htons(), which would truncate to 0 on little-endian hosts.
  device.sll_halen = 6;
  device.sll_protocol = htons (ETH_P_IPV6);

  // IPv6 header
  memset (&iphdr, 0, sizeof (iphdr));

  // IPv6 version (4 bits), Traffic class (8 bits), Flow label (20 bits)
  iphdr.ip6_flow = htonl ((6 << 28) | (0 << 20) | 0);

  // Payload length (16 bits): TCP header
  iphdr.ip6_plen = htons (TCP_HDRLEN);

  // Next header (8 bits): 6 for TCP
  iphdr.ip6_nxt = IPPROTO_TCP;

  // Hop limit (8 bits): default to maximum value
  iphdr.ip6_hops = 255;

  // Source IPv6 address (128 bits)
  // inet_pton() returns 0 for an unparsable address (errno is not set) and -1
  // with errno set for an unsupported family, so its return value is never a
  // valid argument for strerror().
  status = inet_pton (AF_INET6, src_ip, &(iphdr.ip6_src));
  if (status != 1) {
    if (status == 0) {
      fprintf (stderr, "inet_pton() failed: invalid IPv6 address '%s'\n", src_ip);
    } else {
      fprintf (stderr, "inet_pton() failed.\nError message: %s\n", strerror (errno));
    }
    exit (EXIT_FAILURE);
  }

  // Destination IPv6 address (128 bits)
  status = inet_pton (AF_INET6, dst_ip, &(iphdr.ip6_dst));
  if (status != 1) {
    if (status == 0) {
      fprintf (stderr, "inet_pton() failed: invalid IPv6 address '%s'\n", dst_ip);
    } else {
      fprintf (stderr, "inet_pton() failed.\nError message: %s\n", strerror (errno));
    }
    exit (EXIT_FAILURE);
  }

  // TCP header
  memset (&tcphdr, 0, sizeof (tcphdr));

  // Source port number (16 bits)
  tcphdr.th_sport = htons (60);

  // Destination port number (16 bits)
  tcphdr.th_dport = htons (80);

  // Sequence number (32 bits)
  tcphdr.th_seq = htonl (0);

  // Acknowledgement number (32 bits): 0 in first packet of SYN/ACK process
  tcphdr.th_ack = htonl (0);

  // Reserved (4 bits): should be 0
  tcphdr.th_x2 = 0;

  // Data offset (4 bits): size of TCP header in 32-bit words
  tcphdr.th_off = TCP_HDRLEN / 4;

  // Flags (8 bits), one array slot per bit, least significant bit first:
  // FIN, SYN, RST, PSH, ACK, URG, ECE, CWR. Only SYN is set.
  tcp_flags[0] = 0;  // FIN
  tcp_flags[1] = 1;  // SYN
  tcp_flags[2] = 0;  // RST
  tcp_flags[3] = 0;  // PSH
  tcp_flags[4] = 0;  // ACK
  tcp_flags[5] = 0;  // URG
  tcp_flags[6] = 0;  // ECE
  tcp_flags[7] = 0;  // CWR

  tcphdr.th_flags = 0;
  for (i=0; i<8; i++) {
    tcphdr.th_flags += (tcp_flags[i] << i);
  }

  // Window size (16 bits)
  tcphdr.th_win = htons (65535);

  // Urgent pointer (16 bits): 0 (only valid if URG flag is set)
  tcphdr.th_urp = htons (0);

  // TCP checksum (16 bits)
  tcphdr.th_sum = tcp6_checksum (iphdr, tcphdr);

  // Fill out ethernet frame header.
  // Ethernet frame length = ethernet header (MAC + MAC + ethernet type) + ethernet data (IP header + TCP header)
  frame_length = 6 + 6 + 2 + IP6_HDRLEN + TCP_HDRLEN;

  // Destination and Source MAC addresses
  memcpy (ether_frame, dst_mac, 6 * sizeof (uint8_t));
  memcpy (ether_frame + 6, src_mac, 6 * sizeof (uint8_t));

  // Next is ethernet type code (ETH_P_IPV6 for IPv6).
  // http://www.iana.org/assignments/ethernet-numbers
  ether_frame[12] = ETH_P_IPV6 / 256;
  ether_frame[13] = ETH_P_IPV6 % 256;

  // Next is ethernet frame data (IPv6 header + TCP header).
  // IPv6 header
  memcpy (ether_frame + ETH_HDRLEN, &iphdr, IP6_HDRLEN * sizeof (uint8_t));

  // TCP header
  memcpy (ether_frame + ETH_HDRLEN + IP6_HDRLEN, &tcphdr, TCP_HDRLEN * sizeof (uint8_t));

  // Submit request for a raw socket descriptor.
  if ((sd = socket (PF_PACKET, SOCK_RAW, htons (ETH_P_ALL))) < 0) {
    perror ("socket() failed ");
    exit (EXIT_FAILURE);
  }

  // Send ethernet frame to socket.
  if ((bytes = sendto (sd, ether_frame, frame_length, 0, (struct sockaddr *) &device, sizeof (device))) <= 0) {
    perror ("sendto() failed");
    exit (EXIT_FAILURE);
  }

  // Close socket descriptor.
  close (sd);

  // Free allocated memory.
  free (src_mac);
  free (dst_mac);
  free (ether_frame);
  free (interface);
  free (target);
  free (src_ip);
  free (dst_ip);
  free (tcp_flags);

  return (EXIT_SUCCESS);
}

// 16-bit one's-complement checksum over a byte buffer.
//
// addr: start of the data; it is read byte-wise, so it does not need to be
//       2-byte aligned even though the parameter type is uint16_t *.
// len:  number of bytes to sum; values <= 0 sum nothing.
//
// Words are summed in host memory order. When len is odd, the final byte is
// treated as the first byte of a 16-bit word whose second byte is zero.
// Returns the bitwise complement of the folded sum (0xFFFF for empty input).
uint16_t
checksum (uint16_t *addr, int len)
{
  const unsigned char *p = (const unsigned char *) addr;
  // 64-bit unsigned accumulator: cannot overflow for any int-sized length,
  // unlike a signed int, whose overflow would be undefined behaviour.
  uint64_t sum = 0;
  int nleft = len;

  while (nleft > 1) {
    uint16_t word;

    // memcpy instead of *(uint16_t *) avoids misaligned/aliasing loads.
    memcpy (&word, p, sizeof (word));
    sum += word;
    p += sizeof (word);
    nleft -= (int) sizeof (word);
  }

  if (nleft == 1) {
    uint16_t last = 0;

    memcpy (&last, p, 1);
    sum += last;
  }

  // Fold the carries back into the low 16 bits until none remain.
  while ((sum >> 16) != 0) {
    sum = (sum & 0xFFFF) + (sum >> 16);
  }

  return (uint16_t) ~sum;
}

// Copy n bytes from src to *cursor, advance the cursor past them, and return n
// so the caller can keep a running byte count.
static int
tcp6_append (uint8_t **cursor, const void *src, size_t n)
{
  memcpy (*cursor, src, n);
  *cursor += n;
  return (int) n;
}

// Build IPv6 TCP pseudo-header and call checksum function (Section 8.1 of RFC 2460).
//
// iphdr:  IPv6 header; only ip6_src, ip6_dst and ip6_nxt are read.
// tcphdr: TCP header; th_sum is ignored and treated as zero.
//
// The summed data is the pseudo-header (source address, destination address,
// 32-bit TCP length, three zero bytes, next header) followed by the fixed TCP
// header fields. The TCP length is taken as sizeof (struct tcphdr), i.e. no
// options and no payload are covered.
// Returns the value produced by checksum() over that data.
uint16_t
tcp6_checksum (struct ip6_hdr iphdr, struct tcphdr tcphdr)
{
  // 40 bytes of pseudo-header + 20 bytes of TCP header. Declared as uint16_t
  // so the buffer handed to checksum() is correctly aligned and correctly
  // typed, and so that only 60 bytes of stack are used rather than 64 KiB.
  uint16_t buf[(40 + 20) / sizeof (uint16_t)];
  uint8_t *ptr = (uint8_t *) buf;
  const uint8_t zero[3] = { 0, 0, 0 };
  uint32_t tcp_len = htonl (sizeof (tcphdr));
  uint8_t off_rsvd = (uint8_t) ((tcphdr.th_off << 4) + tcphdr.th_x2);
  int chksumlen = 0;

  // Pseudo-header: source address, destination address (128 bits each)
  chksumlen += tcp6_append (&ptr, &iphdr.ip6_src, sizeof (iphdr.ip6_src));
  chksumlen += tcp6_append (&ptr, &iphdr.ip6_dst, sizeof (iphdr.ip6_dst));

  // Pseudo-header: TCP length (32 bits), zero field (24 bits), next header (8 bits)
  chksumlen += tcp6_append (&ptr, &tcp_len, sizeof (tcp_len));
  chksumlen += tcp6_append (&ptr, zero, sizeof (zero));
  chksumlen += tcp6_append (&ptr, &iphdr.ip6_nxt, sizeof (iphdr.ip6_nxt));

  // TCP header: ports, sequence number, acknowledgement number
  chksumlen += tcp6_append (&ptr, &tcphdr.th_sport, sizeof (tcphdr.th_sport));
  chksumlen += tcp6_append (&ptr, &tcphdr.th_dport, sizeof (tcphdr.th_dport));
  chksumlen += tcp6_append (&ptr, &tcphdr.th_seq, sizeof (tcphdr.th_seq));
  chksumlen += tcp6_append (&ptr, &tcphdr.th_ack, sizeof (tcphdr.th_ack));

  // TCP header: data offset (4 bits) + reserved (4 bits), then flags (8 bits)
  chksumlen += tcp6_append (&ptr, &off_rsvd, sizeof (off_rsvd));
  chksumlen += tcp6_append (&ptr, &tcphdr.th_flags, sizeof (tcphdr.th_flags));

  // TCP header: window size (16 bits)
  chksumlen += tcp6_append (&ptr, &tcphdr.th_win, sizeof (tcphdr.th_win));

  // TCP header: checksum field (16 bits) is zero while it is being computed
  chksumlen += tcp6_append (&ptr, zero, 2);

  // TCP header: urgent pointer (16 bits)
  chksumlen += tcp6_append (&ptr, &tcphdr.th_urp, sizeof (tcphdr.th_urp));

  return checksum (buf, chksumlen);
}

// Return a zero-filled, heap-allocated array of len chars.
// A non-positive len or an allocation failure is fatal: a message is written
// to stderr and the process exits with EXIT_FAILURE. The caller free()s the
// returned pointer.
char *
allocate_strmem (int len)
{
  char *arr;

  if (len <= 0) {
    fprintf (stderr, "ERROR: Cannot allocate memory because len = %i in allocate_strmem().\n", len);
    exit (EXIT_FAILURE);
  }

  arr = calloc ((size_t) len, sizeof (*arr));
  if (arr == NULL) {
    fprintf (stderr, "ERROR: Cannot allocate memory for array allocate_strmem().\n");
    exit (EXIT_FAILURE);
  }

  return arr;
}

// Return a zero-filled, heap-allocated array of len uint8_t elements.
// A non-positive len or an allocation failure is fatal: a message is written
// to stderr and the process exits with EXIT_FAILURE. The caller free()s the
// returned pointer.
uint8_t *
allocate_ustrmem (int len)
{
  uint8_t *arr;

  if (len <= 0) {
    fprintf (stderr, "ERROR: Cannot allocate memory because len = %i in allocate_ustrmem().\n", len);
    exit (EXIT_FAILURE);
  }

  arr = calloc ((size_t) len, sizeof (*arr));
  if (arr == NULL) {
    fprintf (stderr, "ERROR: Cannot allocate memory for array allocate_ustrmem().\n");
    exit (EXIT_FAILURE);
  }

  return arr;
}

// Return a zero-filled, heap-allocated array of len ints.
// A non-positive len or an allocation failure is fatal: a message is written
// to stderr and the process exits with EXIT_FAILURE. The caller free()s the
// returned pointer.
int *
allocate_intmem (int len)
{
  int *arr;

  if (len <= 0) {
    fprintf (stderr, "ERROR: Cannot allocate memory because len = %i in allocate_intmem().\n", len);
    exit (EXIT_FAILURE);
  }

  // calloc() zero-fills and, unlike malloc (len * sizeof (int)), fails cleanly
  // if the element count times the element size does not fit in size_t.
  arr = calloc ((size_t) len, sizeof (*arr));
  if (arr == NULL) {
    fprintf (stderr, "ERROR: Cannot allocate memory for array allocate_intmem().\n");
    exit (EXIT_FAILURE);
  }

  return arr;
}
Eric Dumazet May 24, 2024, 12:26 p.m. UTC | #3
On Fri, May 24, 2024 at 12:40 PM Yue Haibing <yuehaibing@huawei.com> wrote:
>
> On 2024/5/24 17:24, Eric Dumazet wrote:
> > On Fri, May 24, 2024 at 10:49 AM Yue Haibing <yuehaibing@huawei.com> wrote:
> >>
> >> packet from PF_PACKET socket ontop of an IPv6-backed ipvlan device will hit
> >> WARN_ON_ONCE() in sk_mc_loop() through sch_direct_xmit() path while ipvlan
> >> device has qdisc queue.
> >>
> >> WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70
> >> Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper
> >> CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279
> >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> >> RIP: 0010:sk_mc_loop+0x2d/0x70
> >> Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c
> >> RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212
> >> RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001
> >> RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000
> >> RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00
> >> R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000
> >> R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000
> >> FS:  0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000
> >> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >> CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0
> >> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >> Call Trace:
> >>  <IRQ>
> >>  ? __warn+0x83/0x130
> >>  ? sk_mc_loop+0x2d/0x70
> >>  ? report_bug+0x18e/0x1a0
> >>  ? handle_bug+0x3c/0x70
> >>  ? exc_invalid_op+0x18/0x70
> >>  ? asm_exc_invalid_op+0x1a/0x20
> >>  ? sk_mc_loop+0x2d/0x70
> >>  ip6_finish_output2+0x31e/0x590
> >>  ? nf_hook_slow+0x43/0xf0
> >>  ip6_finish_output+0x1f8/0x320
> >>  ? __pfx_ip6_finish_output+0x10/0x10
> >>  ipvlan_xmit_mode_l3+0x22a/0x2a0 [ipvlan]
> >>  ipvlan_start_xmit+0x17/0x50 [ipvlan]
> >>  dev_hard_start_xmit+0x8c/0x1d0
> >>  sch_direct_xmit+0xa2/0x390
> >>  __qdisc_run+0x66/0xd0
> >>  net_tx_action+0x1ca/0x270
> >>  handle_softirqs+0xd6/0x2b0
> >>  __irq_exit_rcu+0x9b/0xc0
> >>  sysvec_apic_timer_interrupt+0x75/0x90
> >
> > Please provide full symbols in stack traces.
>
> Call Trace:
> <IRQ>
> ? __warn (kernel/panic.c:693)
> ? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
> ? report_bug (lib/bug.c:201 lib/bug.c:219)
> ? handle_bug (arch/x86/kernel/traps.c:239)
> ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
> ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621)
> ? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
> ip6_finish_output2 (net/ipv6/ip6_output.c:83 (discriminator 1))
> ? nf_hook_slow (./include/linux/netfilter.h:154 net/netfilter/core.c:626)
> ip6_finish_output (net/ipv6/ip6_output.c:211 net/ipv6/ip6_output.c:222)
> ? __pfx_ip6_finish_output (net/ipv6/ip6_output.c:215)
> ipvlan_xmit_mode_l3 (drivers/net/ipvlan/ipvlan_core.c:498 drivers/net/ipvlan/ipvlan_core.c:538 drivers/net/ipvlan/ipvlan_core.c:602) ipvlan
> ipvlan_start_xmit (drivers/net/ipvlan/ipvlan_main.c:226) ipvlan
> dev_hard_start_xmit (./include/linux/netdevice.h:4882 ./include/linux/netdevice.h:4896 net/core/dev.c:3578 net/core/dev.c:3594)
> sch_direct_xmit (net/sched/sch_generic.c:343)
> __qdisc_run (net/sched/sch_generic.c:416)
> net_tx_action (./include/net/sch_generic.h:219 ./include/net/pkt_sched.h:128 ./include/net/pkt_sched.h:124 net/core/dev.c:5286)
> handle_softirqs (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:555)
> __irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637)
> sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043 arch/x86/kernel/apic/apic.c:1043)
>
> >
> >>  </IRQ>
> >>
> >> Fixes: f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack")
> >> Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
> >> ---
> >>  include/linux/netdevice.h | 17 +++++++++++++++++
> >>  net/core/dev.h            | 17 -----------------
> >>  net/sched/sch_generic.c   |  8 +++++---
> >>  3 files changed, 22 insertions(+), 20 deletions(-)
> >
> > This patch seems unrelated to the WARN_ON_ONCE(1) met in sk_mc_loop()
> >
> > If sk_mc_loop() is called with a socket which is not inet, we are in trouble.
> >
> > Please fix the root cause instead of trying to shortcut sk_mc_loop() as you did.
> First setup like this:
> ip netns add ns0
> ip netns add ns1
> ip link add ip0 link eth0 type ipvlan mode l3 vepa
> ip link add ip1 link eth0 type ipvlan mode l3 vepa
> ip link set ip0 netns ns0
> ip link exec ip link set ip0 up
> ip link set ip1 netns ns1
> ip link exec ip link set ip1 up
> ip link exec tc qdisc add dev ip1 root netem delay 10ms
>
> Second, build and send a raw ipv6 multicast packet as attached repro in ns1
>
> packet_sendmsg
>    packet_snd //skb->sk is packet sk
>       __dev_queue_xmit
>          __dev_xmit_skb //q->enqueue is not NULL
>              __qdisc_run
>                  qdisc_restart
>                     sch_direct_xmit
>                        dev_hard_start_xmit
>                           netdev_start_xmit
>                             ipvlan_start_xmit
>                               ipvlan_xmit_mode_l3 //l3 mode
>                                  ipvlan_process_outbound //vepa flag
>                                    ipvlan_process_v6_outbound //skb->protocol is ETH_P_IPV6
>                                       ip6_local_out
>                                        ...
>                                          __ip6_finish_output
>                                             ip6_finish_output2 //multicast packet
>                                                sk_mc_loop //dev_recursion_level is 0
>                                                   WARN_ON_ONCE //sk->sk_family is AF_PACKET
>
> > .

I would say the ipvlan code should not use skb->sk when calling
ip6_local_out(), like other tunnels.

Untested patch :

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 2d5b021b4ea6053eeb055a76fa4c7d9380cd2a53..fef4eff7753a7acb1e11d9712abd669de7740df6 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -439,7 +439,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb)

        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

-       err = ip_local_out(net, skb->sk, skb);
+       err = ip_local_out(net, NULL, skb);
        if (unlikely(net_xmit_eval(err)))
                DEV_STATS_INC(dev, tx_errors);
        else
@@ -494,7 +494,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)

        memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));

-       err = ip6_local_out(dev_net(dev), skb->sk, skb);
+       err = ip6_local_out(dev_net(dev), NULL, skb);
        if (unlikely(net_xmit_eval(err)))
                DEV_STATS_INC(dev, tx_errors);
        else
Yue Haibing May 24, 2024, 12:45 p.m. UTC | #4
On 2024/5/24 20:26, Eric Dumazet wrote:
> On Fri, May 24, 2024 at 12:40 PM Yue Haibing <yuehaibing@huawei.com> wrote:
>>
>> On 2024/5/24 17:24, Eric Dumazet wrote:
>>> On Fri, May 24, 2024 at 10:49 AM Yue Haibing <yuehaibing@huawei.com> wrote:
>>>>
>>>> A packet from a PF_PACKET socket on top of an IPv6-backed ipvlan device will hit
>>>> WARN_ON_ONCE() in sk_mc_loop() through the sch_direct_xmit() path when the ipvlan
>>>> device has a qdisc queue.
>>>>
>>>> WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70
>>>> Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper
>>>> CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279
>>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>>>> RIP: 0010:sk_mc_loop+0x2d/0x70
>>>> Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c
>>>> RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212
>>>> RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001
>>>> RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000
>>>> RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00
>>>> R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000
>>>> R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000
>>>> FS:  0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000
>>>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0
>>>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>> Call Trace:
>>>>  <IRQ>
>>>>  ? __warn+0x83/0x130
>>>>  ? sk_mc_loop+0x2d/0x70
>>>>  ? report_bug+0x18e/0x1a0
>>>>  ? handle_bug+0x3c/0x70
>>>>  ? exc_invalid_op+0x18/0x70
>>>>  ? asm_exc_invalid_op+0x1a/0x20
>>>>  ? sk_mc_loop+0x2d/0x70
>>>>  ip6_finish_output2+0x31e/0x590
>>>>  ? nf_hook_slow+0x43/0xf0
>>>>  ip6_finish_output+0x1f8/0x320
>>>>  ? __pfx_ip6_finish_output+0x10/0x10
>>>>  ipvlan_xmit_mode_l3+0x22a/0x2a0 [ipvlan]
>>>>  ipvlan_start_xmit+0x17/0x50 [ipvlan]
>>>>  dev_hard_start_xmit+0x8c/0x1d0
>>>>  sch_direct_xmit+0xa2/0x390
>>>>  __qdisc_run+0x66/0xd0
>>>>  net_tx_action+0x1ca/0x270
>>>>  handle_softirqs+0xd6/0x2b0
>>>>  __irq_exit_rcu+0x9b/0xc0
>>>>  sysvec_apic_timer_interrupt+0x75/0x90
>>>
>>> Please provide full symbols in stack traces.
>>
>> Call Trace:
>> <IRQ>
>> ? __warn (kernel/panic.c:693)
>> ? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
>> ? report_bug (lib/bug.c:201 lib/bug.c:219)
>> ? handle_bug (arch/x86/kernel/traps.c:239)
>> ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
>> ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621)
>> ? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
>> ip6_finish_output2 (net/ipv6/ip6_output.c:83 (discriminator 1))
>> ? nf_hook_slow (./include/linux/netfilter.h:154 net/netfilter/core.c:626)
>> ip6_finish_output (net/ipv6/ip6_output.c:211 net/ipv6/ip6_output.c:222)
>> ? __pfx_ip6_finish_output (net/ipv6/ip6_output.c:215)
>> ipvlan_xmit_mode_l3 (drivers/net/ipvlan/ipvlan_core.c:498 drivers/net/ipvlan/ipvlan_core.c:538 drivers/net/ipvlan/ipvlan_core.c:602) ipvlan
>> ipvlan_start_xmit (drivers/net/ipvlan/ipvlan_main.c:226) ipvlan
>> dev_hard_start_xmit (./include/linux/netdevice.h:4882 ./include/linux/netdevice.h:4896 net/core/dev.c:3578 net/core/dev.c:3594)
>> sch_direct_xmit (net/sched/sch_generic.c:343)
>> __qdisc_run (net/sched/sch_generic.c:416)
>> net_tx_action (./include/net/sch_generic.h:219 ./include/net/pkt_sched.h:128 ./include/net/pkt_sched.h:124 net/core/dev.c:5286)
>> handle_softirqs (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:555)
>> __irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637)
>> sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043 arch/x86/kernel/apic/apic.c:1043)
>>
>>>
>>>>  </IRQ>
>>>>
>>>> Fixes: f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack")
>>>> Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
>>>> ---
>>>>  include/linux/netdevice.h | 17 +++++++++++++++++
>>>>  net/core/dev.h            | 17 -----------------
>>>>  net/sched/sch_generic.c   |  8 +++++---
>>>>  3 files changed, 22 insertions(+), 20 deletions(-)
>>>
>>> This patch seems unrelated to the WARN_ON_ONCE(1) met in sk_mc_loop()
>>>
>>> If sk_mc_loop() is called with a socket which is not inet, we are in trouble.
>>>
>>> Please fix the root cause instead of trying to shortcut sk_mc_loop() as you did.
>> First, set up like this:
>> ip netns add ns0
>> ip netns add ns1
>> ip link add ip0 link eth0 type ipvlan mode l3 vepa
>> ip link add ip1 link eth0 type ipvlan mode l3 vepa
>> ip link set ip0 netns ns0
>> ip netns exec ns0 ip link set ip0 up
>> ip link set ip1 netns ns1
>> ip netns exec ns1 ip link set ip1 up
>> ip netns exec ns1 tc qdisc add dev ip1 root netem delay 10ms
>>
>> Second, build and send a raw ipv6 multicast packet as attached repro in ns1
>>
>> packet_sendmsg
>>    packet_snd //skb->sk is packet sk
>>       __dev_queue_xmit
>>          __dev_xmit_skb //q->enqueue is not NULL
>>              __qdisc_run
>>                  qdisc_restart
>>                     sch_direct_xmit
>>                        dev_hard_start_xmit
>>                           netdev_start_xmit
>>                             ipvlan_start_xmit
>>                               ipvlan_xmit_mode_l3 //l3 mode
>>                                  ipvlan_process_outbound //vepa flag
>>                                    ipvlan_process_v6_outbound //skb->protocol is ETH_P_IPV6
>>                                       ip6_local_out
>>                                        ...
>>                                          __ip6_finish_output
>>                                             ip6_finish_output2 //multicast packet
>>                                                sk_mc_loop //dev_recursion_level is 0
>>                                                   WARN_ON_ONCE //sk->sk_family is AF_PACKET
>>
>>> .
> 
> I would say ipvlan code should not use skb->sk when calling
> ip6_local_out() , like other tunnels.

Thanks, seems good. Will test this.
> 
> Untested patch :
> 
> diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
> index 2d5b021b4ea6053eeb055a76fa4c7d9380cd2a53..fef4eff7753a7acb1e11d9712abd669de7740df6 100644
> --- a/drivers/net/ipvlan/ipvlan_core.c
> +++ b/drivers/net/ipvlan/ipvlan_core.c
> @@ -439,7 +439,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb)
> 
>         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
> 
> -       err = ip_local_out(net, skb->sk, skb);
> +       err = ip_local_out(net, NULL, skb);
>         if (unlikely(net_xmit_eval(err)))
>                 DEV_STATS_INC(dev, tx_errors);
>         else
> @@ -494,7 +494,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
> 
>         memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
> 
> -       err = ip6_local_out(dev_net(dev), skb->sk, skb);
> +       err = ip6_local_out(dev_net(dev), NULL, skb);
>         if (unlikely(net_xmit_eval(err)))
>                 DEV_STATS_INC(dev, tx_errors);
>         else
> .
>
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d20c6c99eb88..7c0c9e9b045e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3261,6 +3261,23 @@  static inline int dev_recursion_level(void)
 	return this_cpu_read(softnet_data.xmit.recursion);
 }
 
+#define XMIT_RECURSION_LIMIT	8
+static inline bool dev_xmit_recursion(void)
+{
+	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+			XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+	__this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+	__this_cpu_dec(softnet_data.xmit.recursion);
+}
+
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index b7b518bc2be5..49345ad7350b 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -149,21 +149,4 @@  static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
 struct napi_struct *napi_by_id(unsigned int napi_id);
 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 
-#define XMIT_RECURSION_LIMIT	8
-static inline bool dev_xmit_recursion(void)
-{
-	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
-			XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
-	__this_cpu_inc(softnet_data.xmit.recursion);
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
-	__this_cpu_dec(softnet_data.xmit.recursion);
-}
-
 #endif
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2a637a17061b..74d9b43b7767 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -339,11 +339,13 @@  bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	if (likely(skb)) {
 		HARD_TX_LOCK(dev, txq, smp_processor_id());
-		if (!netif_xmit_frozen_or_stopped(txq))
+		if (!netif_xmit_frozen_or_stopped(txq)) {
+			dev_xmit_recursion_inc();
 			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
-		else
+			dev_xmit_recursion_dec();
+		} else {
 			qdisc_maybe_clear_missed(q, txq);
-
+		}
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
 		if (root_lock)