Message ID | 20230321024946.GA21870@ubuntu (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: Fix invalid ip_route_output_ports() call | expand |
On Mon, Mar 20, 2023 at 7:49 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > If you pass the address of the struct flowi4 you declared as a > local variable as the fl4 argument to ip_route_output_ports(), > the subsequent call to xfrm_state_find() will read the local > variable by AF_INET6 rather than AF_INET as per policy, > which could cause it to go out of scope on the kernel stack. > > Reported-by: syzbot+ada7c035554bcee65580@syzkaller.appspotmail.com I could not find this syzbot issue, can you provide a link, and a stack trace ? > Signed-off-by: Hyunwoo Kim <v4bel@theori.io> > --- I find this patch quite strange, to be honest. It looks like some xfrm bug to me. A stack trace would be helpful. Thanks.
On Mon, Mar 20, 2023 at 08:17:15PM -0700, Eric Dumazet wrote: > On Mon, Mar 20, 2023 at 7:49 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > > > If you pass the address of the struct flowi4 you declared as a > > local variable as the fl4 argument to ip_route_output_ports(), > > the subsequent call to xfrm_state_find() will read the local > > variable by AF_INET6 rather than AF_INET as per policy, > > which could cause it to go out of scope on the kernel stack. > > > > Reported-by: syzbot+ada7c035554bcee65580@syzkaller.appspotmail.com > > I could not find this syzbot issue, can you provide a link, and a stack trace ? This is the syzbot dashboard: https://syzkaller.appspot.com/bug?extid=0f526bf9663842ac2dc7 and KASAN log: ``` [ 239.016529] ================================================================== [ 239.016540] BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x2b8/0x2ae0 [ 239.016556] Read of size 4 at addr ffffc90000860ba0 by task swapper/14/0 [ 239.016571] CPU: 14 PID: 0 Comm: swapper/14 Tainted: G D E 6.2.0+ #14 [ 239.016583] Hardware name: Gigabyte Technology Co., Ltd. B460MDS3H/B460M DS3H, BIOS F3 05/27/2020 [ 239.016593] Call Trace: [ 239.016599] <IRQ> [ 239.016605] dump_stack_lvl+0x4c/0x70 [ 239.016617] print_report+0xcf/0x620 [ 239.016629] ? kasan_addr_to_slab+0x11/0xb0 [ 239.016639] ? xfrm_state_find+0x2b8/0x2ae0 [ 239.016650] kasan_report+0xbf/0x100 [ 239.016657] ? xfrm_state_find+0x2b8/0x2ae0 [ 239.016664] __asan_load4+0x84/0xa0 [ 239.016670] xfrm_state_find+0x2b8/0x2ae0 [ 239.016677] ? __pfx_xfrm_state_find+0x10/0x10 [ 239.016684] ? __pfx_xfrm4_get_saddr+0x10/0x10 [ 239.016690] ? unwind_next_frame+0x27/0x40 [ 239.016697] xfrm_tmpl_resolve+0x1f9/0x780 [ 239.016704] ? __pfx_xfrm_tmpl_resolve+0x10/0x10 [ 239.016711] ? kasan_save_stack+0x3e/0x50 [ 239.016716] ? kasan_save_stack+0x2a/0x50 [ 239.016721] ? kasan_set_track+0x29/0x40 [ 239.016726] ? kasan_save_alloc_info+0x22/0x30 [ 239.016731] ? __kasan_slab_alloc+0x91/0xa0 [ 239.016736] ? kmem_cache_alloc+0x17e/0x370 [ 239.016741] ? dst_alloc+0x5c/0x230 [ 239.016747] ? xfrm_pol_bin_cmp+0xc8/0xe0 [ 239.016753] xfrm_resolve_and_create_bundle+0xf1/0x10d0 [ 239.016758] ? __pfx_xfrm_policy_inexact_lookup_rcu+0x10/0x10 [ 239.016764] ? xfrm_policy_lookup_inexact_addr+0xa1/0xc0 [ 239.016771] ? xfrm_policy_match+0xd6/0x110 [ 239.016776] ? __rcu_read_unlock+0x3b/0x80 [ 239.016782] ? xfrm_policy_lookup_bytype.constprop.0+0x52e/0xb80 [ 239.016788] ? __pfx_xfrm_resolve_and_create_bundle+0x10/0x10 [ 239.016795] ? __pfx_xfrm_policy_lookup_bytype.constprop.0+0x10/0x10 [ 239.016802] ? __kasan_check_write+0x18/0x20 [ 239.016807] ? _raw_spin_lock_bh+0x8c/0xe0 [ 239.016813] ? __pfx__raw_spin_lock_bh+0x10/0x10 [ 239.016819] xfrm_lookup_with_ifid+0x2f2/0xe50 [ 239.016824] ? __local_bh_enable_ip+0x3f/0x90 [ 239.016830] ? rcu_gp_cleanup+0x2f2/0x6c0 [ 239.016836] ? __pfx_xfrm_lookup_with_ifid+0x10/0x10 [ 239.016842] ? ip_route_output_key_hash_rcu+0x3da/0x1000 [ 239.016848] xfrm_lookup_route+0x2a/0x100 [ 239.016854] ip_route_output_flow+0x1a7/0x1c0 [ 239.016859] ? __pfx_ip_route_output_flow+0x10/0x10 [ 239.016866] igmpv3_newpack+0x1c1/0x5d0 [ 239.016872] ? __pfx_igmpv3_newpack+0x10/0x10 [ 239.016878] ? check_preempt_curr+0xd7/0x120 [ 239.016885] add_grhead+0x111/0x130 [ 239.016891] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 [ 239.016897] add_grec+0x756/0x7f0 [ 239.016903] ? __pfx_add_grec+0x10/0x10 [ 239.016909] ? __pfx__raw_spin_lock_bh+0x10/0x10 [ 239.016914] ? wake_up_process+0x19/0x20 [ 239.016919] ? insert_work+0x130/0x160 [ 239.016926] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 [ 239.016932] igmp_ifc_timer_expire+0x2b5/0x650 [ 239.016938] ? __kasan_check_write+0x18/0x20 [ 239.016943] ? _raw_spin_lock+0x8c/0xe0 [ 239.016948] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 [ 239.016954] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 [ 239.016960] call_timer_fn+0x2d/0x1b0 [ 239.016966] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 [ 239.016973] __run_timers.part.0+0x447/0x530 [ 239.016979] ? __pfx___run_timers.part.0+0x10/0x10 [ 239.016986] ? ktime_get+0x58/0xd0 [ 239.016992] ? lapic_next_deadline+0x30/0x40 [ 239.016997] ? clockevents_program_event+0x118/0x1a0 [ 239.017004] run_timer_softirq+0x69/0xf0 [ 239.017010] __do_softirq+0x128/0x444 [ 239.017016] __irq_exit_rcu+0xdd/0x130 [ 239.017022] irq_exit_rcu+0x12/0x20 [ 239.017027] sysvec_apic_timer_interrupt+0xa5/0xc0 [ 239.017033] </IRQ> [ 239.017036] <TASK> [ 239.017039] asm_sysvec_apic_timer_interrupt+0x1f/0x30 [ 239.017045] RIP: 0010:cpuidle_enter_state+0x1d2/0x510 [ 239.017051] Code: 30 00 0f 84 61 01 00 00 41 83 ed 01 73 dd 48 83 c4 28 44 89 f0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc fb 0f 1f 44 00 00 <45> 85 f6 0f 89 18 ff ff ff 48 c7 43 18 00 00 00 00 49 83 fd 09 0f [ 239.017061] RSP: 0018:ffffc9000028fd40 EFLAGS: 00000246 [ 239.017067] RAX: 0000000000000000 RBX: ffffe8ffffd00c40 RCX: ffffffff811eae9c [ 239.017072] RDX: dffffc0000000000 RSI: ffffffff82e7cc00 RDI: ffff88840eb44188 [ 239.017077] RBP: ffffc9000028fd90 R08: 00000037a67e1a91 R09: ffff88840eb3f0a3 [ 239.017082] R10: ffffed1081d67e14 R11: 0000000000000001 R12: ffffffff846c43a0 [ 239.017087] R13: 0000000000000002 R14: 0000000000000002 R15: ffffffff846c4488 [ 239.017093] ? sched_idle_set_state+0x4c/0x70 [ 239.017101] cpuidle_enter+0x45/0x70 [ 239.017108] call_cpuidle+0x44/0x80 [ 239.017113] do_idle+0x2b4/0x340 [ 239.017118] ? __pfx_do_idle+0x10/0x10 [ 239.017125] cpu_startup_entry+0x24/0x30 [ 239.017130] start_secondary+0x1d6/0x210 [ 239.017136] ? __pfx_start_secondary+0x10/0x10 [ 239.017142] ? set_bringup_idt_handler.constprop.0+0x93/0xc0 [ 239.017150] ? start_cpu0+0xc/0xc [ 239.017156] secondary_startup_64_no_verify+0xe5/0xeb [ 239.017163] </TASK> [ 239.017169] The buggy address belongs to the virtual mapping at [ffffc90000859000, ffffc90000862000) created by: irq_init_percpu_irqstack+0x1c8/0x270 [ 239.017182] The buggy address belongs to the physical page: [ 239.017186] page:00000000d2ae7732 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x40eb09 [ 239.017193] flags: 0x17ffffc0001000(reserved|node=0|zone=2|lastcpupid=0x1fffff) [ 239.017202] raw: 0017ffffc0001000 ffffea00103ac248 ffffea00103ac248 0000000000000000 [ 239.017208] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 [ 239.017213] page dumped because: kasan: bad access detected [ 239.017219] Memory state around the buggy address: [ 239.017222] ffffc90000860a80: 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 [ 239.017228] ffffc90000860b00: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 00 00 00 [ 239.017233] >ffffc90000860b80: 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 [ 239.017238] ^ [ 239.017241] ffffc90000860c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 239.017247] ffffc90000860c80: 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f3 f3 f3 00 [ 239.017251] ================================================================== ``` > > > Signed-off-by: Hyunwoo Kim <v4bel@theori.io> > > --- > > I find this patch quite strange, to be honest. > > It looks like some xfrm bug to me. > > A stack trace would be helpful. Here are the root caouses I analyzed: ``` igmp_ifc_timer_expire() -> igmpv3_send_cr() -> add_grec() -> add_grhead() -> igmpv3_newpack()[1] -> ip_route_output_ports() -> ip_route_output_flow()[3] -> xfrm_lookup_route() -> xfrm_lookup() -> xfrm_lookup_with_ifid() -> xfrm_resolve_and_create_bundle() -> xfrm_tmpl_resolve() -> xfrm_tmpl_resolve_one() -> xfrm_state_find()[5] ``` Here, igmpv3_newpack()[1] declares the variable `struct flowi4 fl4`[2]: ``` static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; // <===[2] int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size = mtu; while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; size >>= 1; if (size < 256) return NULL; } skb->priority = TC_PRIO_CONTROL; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); ``` Then, starting with ip_route_output_flow()[3], we use flowi4_to_flowi() to convert the `struct flowi4` variable to a `struct flowi` pointer[4]: ``` struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) { return container_of(fl4, struct flowi, u.ip4); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), // <===[4] sk, 0); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); ``` This is the cause of the stack OOB. Because we calculated the struct flowi pointer address based on struct flowi4 declared as a stack variable, if we accessed a member of flowi that exceeds the size of flowi4, we would get an OOB. Finally, xfrm_state_find()[5] uses daddr, which is a pointer to `&fl->u.ip4.saddr`. Here, the encap_family variable can be entered by the user using the netlink socket. If the user chose AF_INET6 instead of AF_INET, the xfrm_dst_hash() function would be called on an AF_INET6 basis[6], which could cause an OOB in the `struct flowi4 fl4` variable of igmpv3_newpack()[2]. ``` struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; to_put = NULL; sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); // <===[6] ``` Of course, it's possible that the patch I wrote is not appropriate. Regards, Hyunwoo Kim
On Mon, Mar 20, 2023 at 10:08 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > On Mon, Mar 20, 2023 at 08:17:15PM -0700, Eric Dumazet wrote: > > On Mon, Mar 20, 2023 at 7:49 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > > > > > If you pass the address of the struct flowi4 you declared as a > > > local variable as the fl4 argument to ip_route_output_ports(), > > > the subsequent call to xfrm_state_find() will read the local > > > variable by AF_INET6 rather than AF_INET as per policy, > > > which could cause it to go out of scope on the kernel stack. > > > > > > Reported-by: syzbot+ada7c035554bcee65580@syzkaller.appspotmail.com > > > > I could not find this syzbot issue, can you provide a link, and a stack trace ? > > This is the syzbot dashboard: > https://syzkaller.appspot.com/bug?extid=0f526bf9663842ac2dc7 > > > and KASAN log: > ``` > [ 239.016529] ================================================================== > [ 239.016540] BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x2b8/0x2ae0 > [ 239.016556] Read of size 4 at addr ffffc90000860ba0 by task swapper/14/0 > > [ 239.016571] CPU: 14 PID: 0 Comm: swapper/14 Tainted: G D E 6.2.0+ #14 > [ 239.016583] Hardware name: Gigabyte Technology Co., Ltd. B460MDS3H/B460M DS3H, BIOS F3 05/27/2020 > [ 239.016593] Call Trace: > [ 239.016599] <IRQ> > [ 239.016605] dump_stack_lvl+0x4c/0x70 > [ 239.016617] print_report+0xcf/0x620 > [ 239.016629] ? kasan_addr_to_slab+0x11/0xb0 > [ 239.016639] ? xfrm_state_find+0x2b8/0x2ae0 > [ 239.016650] kasan_report+0xbf/0x100 > [ 239.016657] ? xfrm_state_find+0x2b8/0x2ae0 > [ 239.016664] __asan_load4+0x84/0xa0 > [ 239.016670] xfrm_state_find+0x2b8/0x2ae0 > [ 239.016677] ? __pfx_xfrm_state_find+0x10/0x10 > [ 239.016684] ? __pfx_xfrm4_get_saddr+0x10/0x10 > [ 239.016690] ? unwind_next_frame+0x27/0x40 > [ 239.016697] xfrm_tmpl_resolve+0x1f9/0x780 > [ 239.016704] ? __pfx_xfrm_tmpl_resolve+0x10/0x10 > [ 239.016711] ? kasan_save_stack+0x3e/0x50 > [ 239.016716] ? kasan_save_stack+0x2a/0x50 > [ 239.016721] ? kasan_set_track+0x29/0x40 > [ 239.016726] ? kasan_save_alloc_info+0x22/0x30 > [ 239.016731] ? __kasan_slab_alloc+0x91/0xa0 > [ 239.016736] ? kmem_cache_alloc+0x17e/0x370 > [ 239.016741] ? dst_alloc+0x5c/0x230 > [ 239.016747] ? xfrm_pol_bin_cmp+0xc8/0xe0 > [ 239.016753] xfrm_resolve_and_create_bundle+0xf1/0x10d0 > [ 239.016758] ? __pfx_xfrm_policy_inexact_lookup_rcu+0x10/0x10 > [ 239.016764] ? xfrm_policy_lookup_inexact_addr+0xa1/0xc0 > [ 239.016771] ? xfrm_policy_match+0xd6/0x110 > [ 239.016776] ? __rcu_read_unlock+0x3b/0x80 > [ 239.016782] ? xfrm_policy_lookup_bytype.constprop.0+0x52e/0xb80 > [ 239.016788] ? __pfx_xfrm_resolve_and_create_bundle+0x10/0x10 > [ 239.016795] ? __pfx_xfrm_policy_lookup_bytype.constprop.0+0x10/0x10 > [ 239.016802] ? __kasan_check_write+0x18/0x20 > [ 239.016807] ? _raw_spin_lock_bh+0x8c/0xe0 > [ 239.016813] ? __pfx__raw_spin_lock_bh+0x10/0x10 > [ 239.016819] xfrm_lookup_with_ifid+0x2f2/0xe50 > [ 239.016824] ? __local_bh_enable_ip+0x3f/0x90 > [ 239.016830] ? rcu_gp_cleanup+0x2f2/0x6c0 > [ 239.016836] ? __pfx_xfrm_lookup_with_ifid+0x10/0x10 > [ 239.016842] ? ip_route_output_key_hash_rcu+0x3da/0x1000 > [ 239.016848] xfrm_lookup_route+0x2a/0x100 > [ 239.016854] ip_route_output_flow+0x1a7/0x1c0 > [ 239.016859] ? __pfx_ip_route_output_flow+0x10/0x10 > [ 239.016866] igmpv3_newpack+0x1c1/0x5d0 > [ 239.016872] ? __pfx_igmpv3_newpack+0x10/0x10 > [ 239.016878] ? check_preempt_curr+0xd7/0x120 > [ 239.016885] add_grhead+0x111/0x130 > [ 239.016891] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 > [ 239.016897] add_grec+0x756/0x7f0 > [ 239.016903] ? __pfx_add_grec+0x10/0x10 > [ 239.016909] ? __pfx__raw_spin_lock_bh+0x10/0x10 > [ 239.016914] ? wake_up_process+0x19/0x20 > [ 239.016919] ? insert_work+0x130/0x160 > [ 239.016926] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 > [ 239.016932] igmp_ifc_timer_expire+0x2b5/0x650 > [ 239.016938] ? __kasan_check_write+0x18/0x20 > [ 239.016943] ? _raw_spin_lock+0x8c/0xe0 > [ 239.016948] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 > [ 239.016954] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 > [ 239.016960] call_timer_fn+0x2d/0x1b0 > [ 239.016966] ? __pfx_igmp_ifc_timer_expire+0x10/0x10 > [ 239.016973] __run_timers.part.0+0x447/0x530 > [ 239.016979] ? __pfx___run_timers.part.0+0x10/0x10 > [ 239.016986] ? ktime_get+0x58/0xd0 > [ 239.016992] ? lapic_next_deadline+0x30/0x40 > [ 239.016997] ? clockevents_program_event+0x118/0x1a0 > [ 239.017004] run_timer_softirq+0x69/0xf0 > [ 239.017010] __do_softirq+0x128/0x444 > [ 239.017016] __irq_exit_rcu+0xdd/0x130 > [ 239.017022] irq_exit_rcu+0x12/0x20 > [ 239.017027] sysvec_apic_timer_interrupt+0xa5/0xc0 > [ 239.017033] </IRQ> > [ 239.017036] <TASK> > [ 239.017039] asm_sysvec_apic_timer_interrupt+0x1f/0x30 > [ 239.017045] RIP: 0010:cpuidle_enter_state+0x1d2/0x510 > [ 239.017051] Code: 30 00 0f 84 61 01 00 00 41 83 ed 01 73 dd 48 83 c4 28 44 89 f0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc fb 0f 1f 44 00 00 <45> 85 f6 0f 89 18 ff ff ff 48 c7 43 18 00 00 00 00 49 83 fd 09 0f > [ 239.017061] RSP: 0018:ffffc9000028fd40 EFLAGS: 00000246 > [ 239.017067] RAX: 0000000000000000 RBX: ffffe8ffffd00c40 RCX: ffffffff811eae9c > [ 239.017072] RDX: dffffc0000000000 RSI: ffffffff82e7cc00 RDI: ffff88840eb44188 > [ 239.017077] RBP: ffffc9000028fd90 R08: 00000037a67e1a91 R09: ffff88840eb3f0a3 > [ 239.017082] R10: ffffed1081d67e14 R11: 0000000000000001 R12: ffffffff846c43a0 > [ 239.017087] R13: 0000000000000002 R14: 0000000000000002 R15: ffffffff846c4488 > [ 239.017093] ? sched_idle_set_state+0x4c/0x70 > [ 239.017101] cpuidle_enter+0x45/0x70 > [ 239.017108] call_cpuidle+0x44/0x80 > [ 239.017113] do_idle+0x2b4/0x340 > [ 239.017118] ? __pfx_do_idle+0x10/0x10 > [ 239.017125] cpu_startup_entry+0x24/0x30 > [ 239.017130] start_secondary+0x1d6/0x210 > [ 239.017136] ? __pfx_start_secondary+0x10/0x10 > [ 239.017142] ? set_bringup_idt_handler.constprop.0+0x93/0xc0 > [ 239.017150] ? start_cpu0+0xc/0xc > [ 239.017156] secondary_startup_64_no_verify+0xe5/0xeb > [ 239.017163] </TASK> > > [ 239.017169] The buggy address belongs to the virtual mapping at > [ffffc90000859000, ffffc90000862000) created by: > irq_init_percpu_irqstack+0x1c8/0x270 > > [ 239.017182] The buggy address belongs to the physical page: > [ 239.017186] page:00000000d2ae7732 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x40eb09 > [ 239.017193] flags: 0x17ffffc0001000(reserved|node=0|zone=2|lastcpupid=0x1fffff) > [ 239.017202] raw: 0017ffffc0001000 ffffea00103ac248 ffffea00103ac248 0000000000000000 > [ 239.017208] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 > [ 239.017213] page dumped because: kasan: bad access detected > > [ 239.017219] Memory state around the buggy address: > [ 239.017222] ffffc90000860a80: 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 > [ 239.017228] ffffc90000860b00: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 00 00 00 > [ 239.017233] >ffffc90000860b80: 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 > [ 239.017238] ^ > [ 239.017241] ffffc90000860c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > [ 239.017247] ffffc90000860c80: 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f3 f3 f3 00 > [ 239.017251] ================================================================== > ``` > > > > > > Signed-off-by: Hyunwoo Kim <v4bel@theori.io> > > > --- > > > > I find this patch quite strange, to be honest. > > > > It looks like some xfrm bug to me. > > > > A stack trace would be helpful. > > Here are the root caouses I analyzed: > > ``` > igmp_ifc_timer_expire() -> igmpv3_send_cr() -> add_grec() -> add_grhead() -> igmpv3_newpack()[1] -> ip_route_output_ports() -> > ip_route_output_flow()[3] -> xfrm_lookup_route() -> xfrm_lookup() -> xfrm_lookup_with_ifid() -> xfrm_resolve_and_create_bundle() -> > xfrm_tmpl_resolve() -> xfrm_tmpl_resolve_one() -> xfrm_state_find()[5] > ``` > > Here, igmpv3_newpack()[1] declares the variable `struct flowi4 fl4`[2]: > ``` > static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) > { > struct sk_buff *skb; > struct rtable *rt; > struct iphdr *pip; > struct igmpv3_report *pig; > struct net *net = dev_net(dev); > struct flowi4 fl4; // <===[2] > int hlen = LL_RESERVED_SPACE(dev); > int tlen = dev->needed_tailroom; > unsigned int size = mtu; > > while (1) { > skb = alloc_skb(size + hlen + tlen, > GFP_ATOMIC | __GFP_NOWARN); > if (skb) > break; > size >>= 1; > if (size < 256) > return NULL; > } > skb->priority = TC_PRIO_CONTROL; > > rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, > 0, 0, > IPPROTO_IGMP, 0, dev->ifindex); > ``` > OK, this is IPv4 stack here. No bug I think. > > Then, starting with ip_route_output_flow()[3], we use flowi4_to_flowi() to convert the `struct flowi4` variable to a `struct flowi` pointer[4]: > ``` > struct flowi { > union { > struct flowi_common __fl_common; > struct flowi4 ip4; > struct flowi6 ip6; > } u; > #define flowi_oif u.__fl_common.flowic_oif > #define flowi_iif u.__fl_common.flowic_iif > #define flowi_l3mdev u.__fl_common.flowic_l3mdev > #define flowi_mark u.__fl_common.flowic_mark > #define flowi_tos u.__fl_common.flowic_tos > #define flowi_scope u.__fl_common.flowic_scope > #define flowi_proto u.__fl_common.flowic_proto > #define flowi_flags u.__fl_common.flowic_flags > #define flowi_secid u.__fl_common.flowic_secid > #define flowi_tun_key u.__fl_common.flowic_tun_key > #define flowi_uid u.__fl_common.flowic_uid > } __attribute__((__aligned__(BITS_PER_LONG/8))); > > > static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) > { > return container_of(fl4, struct flowi, u.ip4); > } > > > struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, > const struct sock *sk) > { > struct rtable *rt = __ip_route_output_key(net, flp4); > > if (IS_ERR(rt)) > return rt; > > if (flp4->flowi4_proto) { > flp4->flowi4_oif = rt->dst.dev->ifindex; > rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, > flowi4_to_flowi(flp4), // <===[4] > sk, 0); > } > > return rt; > } > EXPORT_SYMBOL_GPL(ip_route_output_flow); > ``` > This is the cause of the stack OOB. Because we calculated the struct flowi pointer address based on struct flowi4 declared as a stack variable, > if we accessed a member of flowi that exceeds the size of flowi4, we would get an OOB. > > > Finally, xfrm_state_find()[5] uses daddr, which is a pointer to `&fl->u.ip4.saddr`. > Here, the encap_family variable can be entered by the user using the netlink socket. > If the user chose AF_INET6 instead of AF_INET, the xfrm_dst_hash() function would be called on an AF_INET6 basis[6], > which could cause an OOB in the `struct flowi4 fl4` variable of igmpv3_newpack()[2]. > ``` This is probably the real bug. (in XFRM layer) CC Steffen Klassert <steffen.klassert@secunet.com> for comments. > struct xfrm_state * > xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, > const struct flowi *fl, struct xfrm_tmpl *tmpl, > struct xfrm_policy *pol, int *err, > unsigned short family, u32 if_id) > { > static xfrm_address_t saddr_wildcard = { }; > struct net *net = xp_net(pol); > unsigned int h, h_wildcard; > struct xfrm_state *x, *x0, *to_put; > int acquire_in_progress = 0; > int error = 0; > struct xfrm_state *best = NULL; > u32 mark = pol->mark.v & pol->mark.m; > unsigned short encap_family = tmpl->encap_family; > unsigned int sequence; > struct km_event c; > > to_put = NULL; > > sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); > > rcu_read_lock(); > h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); // <===[6] > ``` > > Of course, it's possible that the patch I wrote is not appropriate. > > Thanks for the report. > Regards, > Hyunwoo Kim
On Mon, Mar 20, 2023 at 10:08:03PM -0700, Hyunwoo Kim wrote: > On Mon, Mar 20, 2023 at 08:17:15PM -0700, Eric Dumazet wrote: > > On Mon, Mar 20, 2023 at 7:49 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, > const struct sock *sk) > { > struct rtable *rt = __ip_route_output_key(net, flp4); > > if (IS_ERR(rt)) > return rt; > > if (flp4->flowi4_proto) { > flp4->flowi4_oif = rt->dst.dev->ifindex; > rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, > flowi4_to_flowi(flp4), // <===[4] > sk, 0); > } > > return rt; > } > EXPORT_SYMBOL_GPL(ip_route_output_flow); > ``` > This is the cause of the stack OOB. Because we calculated the struct flowi pointer address based on struct flowi4 declared as a stack variable, > if we accessed a member of flowi that exceeds the size of flowi4, we would get an OOB. > > > Finally, xfrm_state_find()[5] uses daddr, which is a pointer to `&fl->u.ip4.saddr`. > Here, the encap_family variable can be entered by the user using the netlink socket. > If the user chose AF_INET6 instead of AF_INET, the xfrm_dst_hash() function would be called on an AF_INET6 basis[6], > which could cause an OOB in the `struct flowi4 fl4` variable of igmpv3_newpack()[2]. Thanks for the great analysis! Looks like a missing sanity check when the policy gets inserted. Can you send the output of 'ip x p' for that policy?
On Tue, Mar 21, 2023 at 11:52:02AM +0100, Steffen Klassert wrote: > On Mon, Mar 20, 2023 at 10:08:03PM -0700, Hyunwoo Kim wrote: > > On Mon, Mar 20, 2023 at 08:17:15PM -0700, Eric Dumazet wrote: > > > On Mon, Mar 20, 2023 at 7:49 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > > > struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, > > const struct sock *sk) > > { > > struct rtable *rt = __ip_route_output_key(net, flp4); > > > > if (IS_ERR(rt)) > > return rt; > > > > if (flp4->flowi4_proto) { > > flp4->flowi4_oif = rt->dst.dev->ifindex; > > rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, > > flowi4_to_flowi(flp4), // <===[4] > > sk, 0); > > } > > > > return rt; > > } > > EXPORT_SYMBOL_GPL(ip_route_output_flow); > > ``` > > This is the cause of the stack OOB. Because we calculated the struct flowi pointer address based on struct flowi4 declared as a stack variable, > > if we accessed a member of flowi that exceeds the size of flowi4, we would get an OOB. > > > > > > Finally, xfrm_state_find()[5] uses daddr, which is a pointer to `&fl->u.ip4.saddr`. > > Here, the encap_family variable can be entered by the user using the netlink socket. > > If the user chose AF_INET6 instead of AF_INET, the xfrm_dst_hash() function would be called on an AF_INET6 basis[6], > > which could cause an OOB in the `struct flowi4 fl4` variable of igmpv3_newpack()[2]. > > Thanks for the great analysis! > > Looks like a missing sanity check when the policy gets inserted. > Can you send the output of 'ip x p' for that policy? I'm not sure what 'ip x p' means, as my understanding of XFRM is limited, sorry. Instead, here is the (dirty) code I used to trigger this: ``` #include <endian.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/syscall.h> #include <sys/types.h> #include <unistd.h> #include <sched.h> #include <fcntl.h> uint64_t r[2] = {0xffffffffffffffff, 0xffffffffffffffff}; int main(void) { int ret; intptr_t res = 0; syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul); syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); res = syscall(__NR_socket, 0x10ul, 3ul, 0); printf("socket() 1 : %ld\n", res); if (res != -1) r[0] = res; *(uint64_t*)0x20000000 = 0; *(uint32_t*)0x20000008 = 0; *(uint64_t*)0x20000010 = 0x20000140; *(uint64_t*)0x20000140 = 0x20000040; memcpy((void*)0x20000040, "\x3c\x00\x00\x00\x10\x00\x01\x04\x00\xee\xff\xff\xff\xff\xff\xff\x00" "\x00\x00\x00", 20); *(uint32_t*)0x20000054 = -1; memcpy((void*)0x20000058, "\x01\x00\x00\x00\x01\x00\x00\x00\x1c\x00\x12\x00\x0c\x00\x01\x00\x62" "\x72\x69\x64\x67\x65", 22); *(uint64_t*)0x20000148 = 0x3c; *(uint64_t*)0x20000018 = 1; *(uint64_t*)0x20000020 = 0; *(uint64_t*)0x20000028 = 0; *(uint32_t*)0x20000030 = 0; ret = syscall(__NR_sendmsg, r[0], 0x20000000ul, 0ul); printf("sendmsg() 1 : %d\n", ret); res = syscall(__NR_socket, 0x10ul, 3ul, 6); printf("socket() 2 : %ld\n", res); if (res != -1) r[1] = res; *(uint64_t*)0x20000480 = 0; *(uint32_t*)0x20000488 = 0; *(uint64_t*)0x20000490 = 0x20000200; *(uint64_t*)0x20000200 = 0x200004c0; *(uint32_t*)0x200004c0 = 0x208; *(uint16_t*)0x200004c4 = 0x19; *(uint16_t*)0x200004c6 = 1; *(uint32_t*)0x200004c8 = 0; *(uint32_t*)0x200004cc = 0; memset((void*)0x200004d0, 0, 16); *(uint8_t*)0x200004e0 = -1; *(uint8_t*)0x200004e1 = 1; memset((void*)0x200004e2, 0, 13); *(uint8_t*)0x200004ef = 1; *(uint16_t*)0x200004f0 = htobe16(0); *(uint16_t*)0x200004f2 = htobe16(0); *(uint16_t*)0x200004f4 = htobe16(0); *(uint16_t*)0x200004f6 = htobe16(0); *(uint16_t*)0x200004f8 = 2; *(uint8_t*)0x200004fa = 0; *(uint8_t*)0x200004fb = 0; *(uint8_t*)0x200004fc = 0; *(uint32_t*)0x20000500 = 0; *(uint32_t*)0x20000504 = -1; *(uint64_t*)0x20000508 = 0; *(uint64_t*)0x20000510 = 0; *(uint64_t*)0x20000518 = 0; *(uint64_t*)0x20000520 = 0; *(uint64_t*)0x20000528 = 0; *(uint64_t*)0x20000530 = 0; *(uint64_t*)0x20000538 = 0; *(uint64_t*)0x20000540 = 0; *(uint64_t*)0x20000548 = 0; *(uint64_t*)0x20000550 = 0; *(uint64_t*)0x20000558 = 0; *(uint64_t*)0x20000560 = 0; *(uint32_t*)0x20000568 = 0; *(uint32_t*)0x2000056c = 0; *(uint8_t*)0x20000570 = 1; *(uint8_t*)0x20000571 = 0; *(uint8_t*)0x20000572 = 0; *(uint8_t*)0x20000573 = 0; *(uint16_t*)0x20000578 = 0xc; *(uint16_t*)0x2000057a = 0x15; *(uint32_t*)0x2000057c = 0; *(uint32_t*)0x20000580 = 6; *(uint16_t*)0x20000584 = 0x144; *(uint16_t*)0x20000586 = 5; memset((void*)0x20000588, 0, 16); *(uint32_t*)0x20000598 = htobe32(0); *(uint8_t*)0x2000059c = 0x6c; *(uint16_t*)0x200005a0 = 0; *(uint32_t*)0x200005a4 = htobe32(0); *(uint32_t*)0x200005b4 = 0; *(uint8_t*)0x200005b8 = 4; *(uint8_t*)0x200005b9 = 0; *(uint8_t*)0x200005ba = 0x10; *(uint32_t*)0x200005bc = 0; *(uint32_t*)0x200005c0 = 0; *(uint32_t*)0x200005c4 = 0; *(uint32_t*)0x200005c8 = htobe32(0xe0000002); *(uint32_t*)0x200005d8 = htobe32(0); *(uint8_t*)0x200005dc = 0x33; *(uint16_t*)0x200005e0 = 0xa; *(uint8_t*)0x200005e4 = 0xfc; *(uint8_t*)0x200005e5 = 0; memset((void*)0x200005e6, 0, 13); *(uint8_t*)0x200005f3 = 0; *(uint32_t*)0x200005f4 = 0; *(uint8_t*)0x200005f8 = 1; *(uint8_t*)0x200005f9 = 0; *(uint8_t*)0x200005fa = 0x20; *(uint32_t*)0x200005fc = 0; *(uint32_t*)0x20000600 = 0; *(uint32_t*)0x20000604 = 0; *(uint8_t*)0x20000608 = 0xac; *(uint8_t*)0x20000609 = 0x14; *(uint8_t*)0x2000060a = 0x14; *(uint8_t*)0x2000060b = 0xfa; *(uint32_t*)0x20000618 = htobe32(0); *(uint8_t*)0x2000061c = 0x2b; *(uint16_t*)0x20000620 = 0xa; *(uint8_t*)0x20000624 = 0xac; *(uint8_t*)0x20000625 = 0x14; *(uint8_t*)0x20000626 = 0x14; *(uint8_t*)0x20000627 = 0xbb; *(uint32_t*)0x20000634 = 0; *(uint8_t*)0x20000638 = 0; *(uint8_t*)0x20000639 = 0; *(uint8_t*)0x2000063a = 3; *(uint32_t*)0x2000063c = 0; *(uint32_t*)0x20000640 = 0; *(uint32_t*)0x20000644 = 0; memcpy((void*)0x20000648, " \001\000\000\000\000\000\000\000\000\000\000\000\000\000\001", 16); *(uint32_t*)0x20000658 = htobe32(0); *(uint8_t*)0x2000065c = 0x33; *(uint16_t*)0x20000660 = 0xa; *(uint32_t*)0x20000664 = htobe32(0); *(uint32_t*)0x20000674 = 0; *(uint8_t*)0x20000678 = 3; *(uint8_t*)0x20000679 = 0; *(uint8_t*)0x2000067a = 0; *(uint32_t*)0x2000067c = 0; *(uint32_t*)0x20000680 = 0; *(uint32_t*)0x20000684 = 0; *(uint32_t*)0x20000688 = htobe32(0x7f000001); *(uint32_t*)0x20000698 = htobe32(0); *(uint8_t*)0x2000069c = 0x6c; *(uint16_t*)0x200006a0 = 0xa; *(uint8_t*)0x200006a4 = -1; *(uint8_t*)0x200006a5 = 1; memset((void*)0x200006a6, 0, 13); *(uint8_t*)0x200006b3 = 1; *(uint32_t*)0x200006b4 = 0; *(uint8_t*)0x200006b8 = 0; *(uint8_t*)0x200006b9 = 0; *(uint8_t*)0x200006ba = 0; *(uint32_t*)0x200006bc = 0; *(uint32_t*)0x200006c0 = 0; *(uint32_t*)0x200006c4 = -1; *(uint64_t*)0x20000208 = 0x208; *(uint64_t*)0x20000498 = 1; *(uint64_t*)0x200004a0 = 0; *(uint64_t*)0x200004a8 = 0; *(uint32_t*)0x200004b0 = 0; ret = syscall(__NR_sendmsg, r[1], 0x20000480ul, 0ul); printf("sendmsg() 2 : %d\n", ret); return 0; } ```
On Tue, Mar 21, 2023 at 4:14 AM Hyunwoo Kim <v4bel@theori.io> wrote: > > I'm not sure what 'ip x p' means, as my understanding of XFRM is limited, sorry. Since your repro does not set up a private netns. Please install the iproute2 package (if not there already) and run the following command sudo ip x p man ip IP(8) Linux IP(8) NAME ip - show / manipulate routing, network devices, interfaces and tunnels SYNOPSIS > > Instead, here is the (dirty) code I used to trigger this: > ``` > #include <endian.h> > #include <stdint.h> > #include <stdio.h> > #include <stdlib.h> > #include <string.h> > #include <sys/syscall.h> > #include <sys/types.h> > #include <unistd.h> > #include <sched.h> > #include <fcntl.h> > > > uint64_t r[2] = {0xffffffffffffffff, 0xffffffffffffffff}; > > int main(void) > { > int ret; > intptr_t res = 0; > > syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); > syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul); > syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); > > res = syscall(__NR_socket, 0x10ul, 3ul, 0); > printf("socket() 1 : %ld\n", res); > if (res != -1) > r[0] = res; > *(uint64_t*)0x20000000 = 0; > *(uint32_t*)0x20000008 = 0; > *(uint64_t*)0x20000010 = 0x20000140; > *(uint64_t*)0x20000140 = 0x20000040; > memcpy((void*)0x20000040, > "\x3c\x00\x00\x00\x10\x00\x01\x04\x00\xee\xff\xff\xff\xff\xff\xff\x00" > "\x00\x00\x00", > 20); > *(uint32_t*)0x20000054 = -1; > memcpy((void*)0x20000058, > "\x01\x00\x00\x00\x01\x00\x00\x00\x1c\x00\x12\x00\x0c\x00\x01\x00\x62" > "\x72\x69\x64\x67\x65", > 22); > *(uint64_t*)0x20000148 = 0x3c; > *(uint64_t*)0x20000018 = 1; > *(uint64_t*)0x20000020 = 0; > *(uint64_t*)0x20000028 = 0; > *(uint32_t*)0x20000030 = 0; > ret = syscall(__NR_sendmsg, r[0], 0x20000000ul, 0ul); > printf("sendmsg() 1 : %d\n", ret); > > res = syscall(__NR_socket, 0x10ul, 3ul, 6); > printf("socket() 2 : %ld\n", res); > if (res != -1) > r[1] = res; > *(uint64_t*)0x20000480 = 0; > *(uint32_t*)0x20000488 = 0; > *(uint64_t*)0x20000490 = 0x20000200; > *(uint64_t*)0x20000200 = 0x200004c0; > *(uint32_t*)0x200004c0 = 0x208; > *(uint16_t*)0x200004c4 = 0x19; > *(uint16_t*)0x200004c6 = 1; > *(uint32_t*)0x200004c8 = 0; > *(uint32_t*)0x200004cc = 0; > memset((void*)0x200004d0, 0, 16); > *(uint8_t*)0x200004e0 = -1; > *(uint8_t*)0x200004e1 = 1; > memset((void*)0x200004e2, 0, 13); > *(uint8_t*)0x200004ef = 1; > *(uint16_t*)0x200004f0 = htobe16(0); > *(uint16_t*)0x200004f2 = htobe16(0); > *(uint16_t*)0x200004f4 = htobe16(0); > *(uint16_t*)0x200004f6 = htobe16(0); > *(uint16_t*)0x200004f8 = 2; > *(uint8_t*)0x200004fa = 0; > *(uint8_t*)0x200004fb = 0; > *(uint8_t*)0x200004fc = 0; > *(uint32_t*)0x20000500 = 0; > *(uint32_t*)0x20000504 = -1; > *(uint64_t*)0x20000508 = 0; > *(uint64_t*)0x20000510 = 0; > *(uint64_t*)0x20000518 = 0; > *(uint64_t*)0x20000520 = 0; > *(uint64_t*)0x20000528 = 0; > *(uint64_t*)0x20000530 = 0; > *(uint64_t*)0x20000538 = 0; > *(uint64_t*)0x20000540 = 0; > *(uint64_t*)0x20000548 = 0; > *(uint64_t*)0x20000550 = 0; > *(uint64_t*)0x20000558 = 0; > *(uint64_t*)0x20000560 = 0; > *(uint32_t*)0x20000568 = 0; > *(uint32_t*)0x2000056c = 0; > *(uint8_t*)0x20000570 = 1; > *(uint8_t*)0x20000571 = 0; > *(uint8_t*)0x20000572 = 0; > *(uint8_t*)0x20000573 = 0; > *(uint16_t*)0x20000578 = 0xc; > *(uint16_t*)0x2000057a = 0x15; > *(uint32_t*)0x2000057c = 0; > *(uint32_t*)0x20000580 = 6; > *(uint16_t*)0x20000584 = 0x144; > *(uint16_t*)0x20000586 = 5; > memset((void*)0x20000588, 0, 16); > *(uint32_t*)0x20000598 = htobe32(0); > *(uint8_t*)0x2000059c = 0x6c; > *(uint16_t*)0x200005a0 = 0; > *(uint32_t*)0x200005a4 = htobe32(0); > *(uint32_t*)0x200005b4 = 0; > *(uint8_t*)0x200005b8 = 4; > *(uint8_t*)0x200005b9 = 0; > *(uint8_t*)0x200005ba = 0x10; > *(uint32_t*)0x200005bc = 0; > *(uint32_t*)0x200005c0 = 0; > *(uint32_t*)0x200005c4 = 0; > *(uint32_t*)0x200005c8 = htobe32(0xe0000002); > *(uint32_t*)0x200005d8 = htobe32(0); > *(uint8_t*)0x200005dc = 0x33; > *(uint16_t*)0x200005e0 = 0xa; > *(uint8_t*)0x200005e4 = 0xfc; > *(uint8_t*)0x200005e5 = 0; > memset((void*)0x200005e6, 0, 13); > *(uint8_t*)0x200005f3 = 0; > *(uint32_t*)0x200005f4 = 0; > *(uint8_t*)0x200005f8 = 1; > *(uint8_t*)0x200005f9 = 0; > *(uint8_t*)0x200005fa = 0x20; > *(uint32_t*)0x200005fc = 0; > *(uint32_t*)0x20000600 = 0; > *(uint32_t*)0x20000604 = 0; > *(uint8_t*)0x20000608 = 0xac; > *(uint8_t*)0x20000609 = 0x14; > *(uint8_t*)0x2000060a = 0x14; > *(uint8_t*)0x2000060b = 0xfa; > *(uint32_t*)0x20000618 = htobe32(0); > *(uint8_t*)0x2000061c = 0x2b; > *(uint16_t*)0x20000620 = 0xa; > *(uint8_t*)0x20000624 = 0xac; > *(uint8_t*)0x20000625 = 0x14; > *(uint8_t*)0x20000626 = 0x14; > *(uint8_t*)0x20000627 = 0xbb; > *(uint32_t*)0x20000634 = 0; > *(uint8_t*)0x20000638 = 0; > *(uint8_t*)0x20000639 = 0; > *(uint8_t*)0x2000063a = 3; > *(uint32_t*)0x2000063c = 0; > *(uint32_t*)0x20000640 = 0; > *(uint32_t*)0x20000644 = 0; > memcpy((void*)0x20000648, > " \001\000\000\000\000\000\000\000\000\000\000\000\000\000\001", 16); > *(uint32_t*)0x20000658 = htobe32(0); > *(uint8_t*)0x2000065c = 0x33; > *(uint16_t*)0x20000660 = 0xa; > *(uint32_t*)0x20000664 = htobe32(0); > *(uint32_t*)0x20000674 = 0; > *(uint8_t*)0x20000678 = 3; > *(uint8_t*)0x20000679 = 0; > *(uint8_t*)0x2000067a = 0; > *(uint32_t*)0x2000067c = 0; > *(uint32_t*)0x20000680 = 0; > *(uint32_t*)0x20000684 = 0; > *(uint32_t*)0x20000688 = htobe32(0x7f000001); > *(uint32_t*)0x20000698 = htobe32(0); > *(uint8_t*)0x2000069c = 0x6c; > *(uint16_t*)0x200006a0 = 0xa; > *(uint8_t*)0x200006a4 = -1; > *(uint8_t*)0x200006a5 = 1; > memset((void*)0x200006a6, 0, 13); > *(uint8_t*)0x200006b3 = 1; > *(uint32_t*)0x200006b4 = 0; > *(uint8_t*)0x200006b8 = 0; > *(uint8_t*)0x200006b9 = 0; > *(uint8_t*)0x200006ba = 0; > *(uint32_t*)0x200006bc = 0; > *(uint32_t*)0x200006c0 = 0; > *(uint32_t*)0x200006c4 = -1; > *(uint64_t*)0x20000208 = 0x208; > *(uint64_t*)0x20000498 = 1; > *(uint64_t*)0x200004a0 = 0; > *(uint64_t*)0x200004a8 = 0; > *(uint32_t*)0x200004b0 = 0; > ret = syscall(__NR_sendmsg, r[1], 0x20000480ul, 0ul); > printf("sendmsg() 2 : %d\n", ret); > return 0; > } > ```
On Tue, Mar 21, 2023 at 04:19:25AM -0700, Eric Dumazet wrote: > On Tue, Mar 21, 2023 at 4:14 AM Hyunwoo Kim <v4bel@theori.io> wrote: > > > > I'm not sure what 'ip x p' means, as my understanding of XFRM is limited, sorry. > > Since your repro does not set up a private netns. > > Please install the iproute2 package (if not there already) and run the > following command > > sudo ip x p > > man ip > > IP(8) Linux > IP(8) > > NAME > ip - show / manipulate routing, network devices, interfaces and tunnels > > SYNOPSIS This is the result of creating a new netns, running repro, and then running the ip x p command: ``` src 255.1.0.0/0 dst 0.0.0.0/0 dir out priority 0 mark 0/0x6 tmpl src 0.0.0.0 dst 0.0.0.0 proto comp reqid 0 mode beet level 16 tmpl src fc00:: dst e000:2:: proto ah reqid 0 mode tunnel level 32 tmpl src ac14:14bb:: dst ac14:14fa:: proto route2 reqid 0 mode transport level 3 tmpl src :: dst 2001::1 proto ah reqid 0 mode in_trigger tmpl src ff01::1 dst 7f00:1:: proto comp reqid 0 mode transport ``` > > > > > Instead, here is the (dirty) code I used to trigger this: > > ``` > > #include <endian.h> > > #include <stdint.h> > > #include <stdio.h> > > #include <stdlib.h> > > #include <string.h> > > #include <sys/syscall.h> > > #include <sys/types.h> > > #include <unistd.h> > > #include <sched.h> > > #include <fcntl.h> > > > > > > uint64_t r[2] = {0xffffffffffffffff, 0xffffffffffffffff}; > > > > int main(void) > > { > > int ret; > > intptr_t res = 0; > > > > syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); > > syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul); > > syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); > > > > res = syscall(__NR_socket, 0x10ul, 3ul, 0); > > printf("socket() 1 : %ld\n", res); > > if (res != -1) > > r[0] = res; > > *(uint64_t*)0x20000000 = 0; > > *(uint32_t*)0x20000008 = 0; > > *(uint64_t*)0x20000010 = 0x20000140; > > *(uint64_t*)0x20000140 = 0x20000040; > > memcpy((void*)0x20000040, > > "\x3c\x00\x00\x00\x10\x00\x01\x04\x00\xee\xff\xff\xff\xff\xff\xff\x00" > > "\x00\x00\x00", > > 20); > > *(uint32_t*)0x20000054 = -1; > > memcpy((void*)0x20000058, > > "\x01\x00\x00\x00\x01\x00\x00\x00\x1c\x00\x12\x00\x0c\x00\x01\x00\x62" > > "\x72\x69\x64\x67\x65", > > 22); > > *(uint64_t*)0x20000148 = 0x3c; > > *(uint64_t*)0x20000018 = 1; > > *(uint64_t*)0x20000020 = 0; > > *(uint64_t*)0x20000028 = 0; > > *(uint32_t*)0x20000030 = 0; > > ret = syscall(__NR_sendmsg, r[0], 0x20000000ul, 0ul); > > printf("sendmsg() 1 : %d\n", ret); > > > > res = syscall(__NR_socket, 0x10ul, 3ul, 6); > > printf("socket() 2 : %ld\n", res); > > if (res != -1) > > r[1] = res; > > *(uint64_t*)0x20000480 = 0; > > *(uint32_t*)0x20000488 = 0; > > *(uint64_t*)0x20000490 = 0x20000200; > > *(uint64_t*)0x20000200 = 0x200004c0; > > *(uint32_t*)0x200004c0 = 0x208; > > *(uint16_t*)0x200004c4 = 0x19; > > *(uint16_t*)0x200004c6 = 1; > > *(uint32_t*)0x200004c8 = 0; > > *(uint32_t*)0x200004cc = 0; > > memset((void*)0x200004d0, 0, 16); > > *(uint8_t*)0x200004e0 = -1; > > *(uint8_t*)0x200004e1 = 1; > > memset((void*)0x200004e2, 0, 13); > > *(uint8_t*)0x200004ef = 1; > > *(uint16_t*)0x200004f0 = htobe16(0); > > *(uint16_t*)0x200004f2 = htobe16(0); > > *(uint16_t*)0x200004f4 = htobe16(0); > > *(uint16_t*)0x200004f6 = htobe16(0); > > *(uint16_t*)0x200004f8 = 2; > > *(uint8_t*)0x200004fa = 0; > > *(uint8_t*)0x200004fb = 0; > > *(uint8_t*)0x200004fc = 0; > > *(uint32_t*)0x20000500 = 0; > > *(uint32_t*)0x20000504 = -1; > > *(uint64_t*)0x20000508 = 0; > > *(uint64_t*)0x20000510 = 0; > > *(uint64_t*)0x20000518 = 0; > > *(uint64_t*)0x20000520 = 0; > > *(uint64_t*)0x20000528 = 0; > > *(uint64_t*)0x20000530 = 0; > > *(uint64_t*)0x20000538 = 0; > > *(uint64_t*)0x20000540 = 0; > > *(uint64_t*)0x20000548 = 0; > > *(uint64_t*)0x20000550 = 0; > > *(uint64_t*)0x20000558 = 0; > > *(uint64_t*)0x20000560 = 0; > > *(uint32_t*)0x20000568 = 0; > > *(uint32_t*)0x2000056c = 0; > > *(uint8_t*)0x20000570 = 1; > > *(uint8_t*)0x20000571 = 0; > > *(uint8_t*)0x20000572 = 0; > > *(uint8_t*)0x20000573 = 0; > > *(uint16_t*)0x20000578 = 0xc; > > *(uint16_t*)0x2000057a = 0x15; > > *(uint32_t*)0x2000057c = 0; > > *(uint32_t*)0x20000580 = 6; > > *(uint16_t*)0x20000584 = 0x144; > > *(uint16_t*)0x20000586 = 5; > > memset((void*)0x20000588, 0, 16); > > *(uint32_t*)0x20000598 = htobe32(0); > > *(uint8_t*)0x2000059c = 0x6c; > > *(uint16_t*)0x200005a0 = 0; > > *(uint32_t*)0x200005a4 = htobe32(0); > > *(uint32_t*)0x200005b4 = 0; > > *(uint8_t*)0x200005b8 = 4; > > *(uint8_t*)0x200005b9 = 0; > > *(uint8_t*)0x200005ba = 0x10; > > *(uint32_t*)0x200005bc = 0; > > *(uint32_t*)0x200005c0 = 0; > > *(uint32_t*)0x200005c4 = 0; > > *(uint32_t*)0x200005c8 = htobe32(0xe0000002); > > *(uint32_t*)0x200005d8 = htobe32(0); > > *(uint8_t*)0x200005dc = 0x33; > > *(uint16_t*)0x200005e0 = 0xa; > > *(uint8_t*)0x200005e4 = 0xfc; > > *(uint8_t*)0x200005e5 = 0; > > memset((void*)0x200005e6, 0, 13); > > *(uint8_t*)0x200005f3 = 0; > > *(uint32_t*)0x200005f4 = 0; > > *(uint8_t*)0x200005f8 = 1; > > *(uint8_t*)0x200005f9 = 0; > > *(uint8_t*)0x200005fa = 0x20; > > *(uint32_t*)0x200005fc = 0; > > *(uint32_t*)0x20000600 = 0; > > *(uint32_t*)0x20000604 = 0; > > *(uint8_t*)0x20000608 = 0xac; > > *(uint8_t*)0x20000609 = 0x14; > > *(uint8_t*)0x2000060a = 0x14; > > *(uint8_t*)0x2000060b = 0xfa; > > *(uint32_t*)0x20000618 = htobe32(0); > > *(uint8_t*)0x2000061c = 0x2b; > > *(uint16_t*)0x20000620 = 0xa; > > *(uint8_t*)0x20000624 = 0xac; > > *(uint8_t*)0x20000625 = 0x14; > > *(uint8_t*)0x20000626 = 0x14; > > *(uint8_t*)0x20000627 = 0xbb; > > *(uint32_t*)0x20000634 = 0; > > *(uint8_t*)0x20000638 = 0; > > *(uint8_t*)0x20000639 = 0; > > *(uint8_t*)0x2000063a = 3; > > *(uint32_t*)0x2000063c = 0; > > *(uint32_t*)0x20000640 = 0; > > *(uint32_t*)0x20000644 = 0; > > memcpy((void*)0x20000648, > > " \001\000\000\000\000\000\000\000\000\000\000\000\000\000\001", 16); > > *(uint32_t*)0x20000658 = htobe32(0); > > *(uint8_t*)0x2000065c = 0x33; > > *(uint16_t*)0x20000660 = 0xa; > > *(uint32_t*)0x20000664 = htobe32(0); > > *(uint32_t*)0x20000674 = 0; > > *(uint8_t*)0x20000678 = 3; > > *(uint8_t*)0x20000679 = 0; > > *(uint8_t*)0x2000067a = 0; > > *(uint32_t*)0x2000067c = 0; > > *(uint32_t*)0x20000680 = 0; > > *(uint32_t*)0x20000684 = 0; > > *(uint32_t*)0x20000688 = htobe32(0x7f000001); > > *(uint32_t*)0x20000698 = htobe32(0); > > *(uint8_t*)0x2000069c = 0x6c; > > *(uint16_t*)0x200006a0 = 0xa; > > *(uint8_t*)0x200006a4 = -1; > > *(uint8_t*)0x200006a5 = 1; > > memset((void*)0x200006a6, 0, 13); > > *(uint8_t*)0x200006b3 = 1; > > *(uint32_t*)0x200006b4 = 0; > > *(uint8_t*)0x200006b8 = 0; > > *(uint8_t*)0x200006b9 = 0; > > *(uint8_t*)0x200006ba = 0; > > *(uint32_t*)0x200006bc = 0; > > *(uint32_t*)0x200006c0 = 0; > > *(uint32_t*)0x200006c4 = -1; > > *(uint64_t*)0x20000208 = 0x208; > > *(uint64_t*)0x20000498 = 1; > > *(uint64_t*)0x200004a0 = 0; > > *(uint64_t*)0x200004a8 = 0; > > *(uint32_t*)0x200004b0 = 0; > > ret = syscall(__NR_sendmsg, r[1], 0x20000480ul, 0ul); > > printf("sendmsg() 2 : %d\n", ret); > > return 0; > > } > > ```
On Tue, Mar 21, 2023 at 04:14:30AM -0700, Hyunwoo Kim wrote: > On Tue, Mar 21, 2023 at 11:52:02AM +0100, Steffen Klassert wrote: > > On Mon, Mar 20, 2023 at 10:08:03PM -0700, Hyunwoo Kim wrote: > > > On Mon, Mar 20, 2023 at 08:17:15PM -0700, Eric Dumazet wrote: > > > > On Mon, Mar 20, 2023 at 7:49 PM Hyunwoo Kim <v4bel@theori.io> wrote: > > > > > > struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, > > > const struct sock *sk) > > > { > > > struct rtable *rt = __ip_route_output_key(net, flp4); > > > > > > if (IS_ERR(rt)) > > > return rt; > > > > > > if (flp4->flowi4_proto) { > > > flp4->flowi4_oif = rt->dst.dev->ifindex; > > > rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, > > > flowi4_to_flowi(flp4), // <===[4] > > > sk, 0); > > > } > > > > > > return rt; > > > } > > > EXPORT_SYMBOL_GPL(ip_route_output_flow); > > > ``` > > > This is the cause of the stack OOB. Because we calculated the struct flowi pointer address based on struct flowi4 declared as a stack variable, > > > if we accessed a member of flowi that exceeds the size of flowi4, we would get an OOB. > > > > > > > > > Finally, xfrm_state_find()[5] uses daddr, which is a pointer to `&fl->u.ip4.saddr`. > > > Here, the encap_family variable can be entered by the user using the netlink socket. > > > If the user chose AF_INET6 instead of AF_INET, the xfrm_dst_hash() function would be called on an AF_INET6 basis[6], > > > which could cause an OOB in the `struct flowi4 fl4` variable of igmpv3_newpack()[2]. > > > > Thanks for the great analysis! > > > > Looks like a missing sanity check when the policy gets inserted. > > Can you send the output of 'ip x p' for that policy? > > I'm not sure what 'ip x p' means, as my understanding of XFRM is limited, sorry. > > Instead, here is the (dirty) code I used to trigger this: Thanks, you code created a policy with IPv4 selectors and IPv6 transport mode templates: src 255.1.0.0/0 dst 0.0.0.0/0 dir out priority 0 ptype main mark 0/0x6 tmpl src 0.0.0.0 dst 0.0.0.0 proto comp reqid 0 mode beet level 16 tmpl src fc00:: dst e000:2:: proto ah reqid 0 mode tunnel level 32 tmpl src ac14:14bb:: dst ac14:14fa:: proto route2 reqid 0 mode transport level 3 tmpl src :: dst 2001::1 proto ah reqid 0 mode in_trigger tmpl src ff01::1 dst 7f00:1:: proto comp reqid 0 mode transport This is an invalid configuration. I'm working on a fix. Thanks again!
On Tue, Mar 21, 2023 at 04:35:09AM -0700, Hyunwoo Kim wrote: > On Tue, Mar 21, 2023 at 04:19:25AM -0700, Eric Dumazet wrote: > > On Tue, Mar 21, 2023 at 4:14 AM Hyunwoo Kim <v4bel@theori.io> wrote: > > > > > > I'm not sure what 'ip x p' means, as my understanding of XFRM is limited, sorry. > > > > Since your repro does not set up a private netns. > > > > Please install the iproute2 package (if not there already) and run the > > following command > > > > sudo ip x p > > > > man ip > > > > IP(8) Linux > > IP(8) > > > > NAME > > ip - show / manipulate routing, network devices, interfaces and tunnels > > > > SYNOPSIS > > This is the result of creating a new netns, running repro, and then running the ip x p command: > ``` > src 255.1.0.0/0 dst 0.0.0.0/0 > dir out priority 0 > mark 0/0x6 > tmpl src 0.0.0.0 dst 0.0.0.0 > proto comp reqid 0 mode beet > level 16 > tmpl src fc00:: dst e000:2:: > proto ah reqid 0 mode tunnel > level 32 > tmpl src ac14:14bb:: dst ac14:14fa:: > proto route2 reqid 0 mode transport > level 3 > tmpl src :: dst 2001::1 > proto ah reqid 0 mode in_trigger > tmpl src ff01::1 dst 7f00:1:: > proto comp reqid 0 mode transport > ``` I plan to fix this with the patch below. With this, the above policy should be rejected. It still needs a bit of testing to make sure that I prohibited no valid usecase with it. --- Subject: [PATCH RFC ipsec] xfrm: Don't allow optional intermediate templates that changes the address family When an optional intermediate template changes the address family, it is unclear which family the next template should have. This can lead to misinterpretations of IPv4/IPv6 addresses. So reject optional intermediate templates on insertion time. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Hyunwoo Kim <v4bel@theori.io> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_user.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 103af2b3e986..df4e840b630e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1770,6 +1770,7 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, struct netlink_ext_ack *extack) { + bool opt_family_change; u16 prev_family; int i; @@ -1778,6 +1779,7 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, return -EINVAL; } + opt_family_change = false; prev_family = family; for (i = 0; i < nr; i++) { @@ -1791,9 +1793,16 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, if (!ut[i].family) ut[i].family = family; + if (opt_family_change) { + NL_SET_ERR_MSG(extack, "Optional intermediate templates don't support family changes"); + return -EINVAL; + } + switch (ut[i].mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + if (ut[i].optional && ut[i].family != prev_family) + opt_family_change = true; break; default: if (ut[i].family != prev_family) {
Hi, Steffen! On 3/24/23 09:57, Steffen Klassert wrote: > On Tue, Mar 21, 2023 at 04:35:09AM -0700, Hyunwoo Kim wrote: >> On Tue, Mar 21, 2023 at 04:19:25AM -0700, Eric Dumazet wrote: >>> On Tue, Mar 21, 2023 at 4:14 AM Hyunwoo Kim <v4bel@theori.io> wrote: >>>> >>>> I'm not sure what 'ip x p' means, as my understanding of XFRM is limited, sorry. >>> >>> Since your repro does not set up a private netns. >>> >>> Please install the iproute2 package (if not there already) and run the >>> following command >>> >>> sudo ip x p >>> >>> man ip >>> >>> IP(8) Linux >>> IP(8) >>> >>> NAME >>> ip - show / manipulate routing, network devices, interfaces and tunnels >>> >>> SYNOPSIS >> >> This is the result of creating a new netns, running repro, and then running the ip x p command: >> ``` >> src 255.1.0.0/0 dst 0.0.0.0/0 >> dir out priority 0 >> mark 0/0x6 >> tmpl src 0.0.0.0 dst 0.0.0.0 >> proto comp reqid 0 mode beet >> level 16 >> tmpl src fc00:: dst e000:2:: >> proto ah reqid 0 mode tunnel >> level 32 >> tmpl src ac14:14bb:: dst ac14:14fa:: >> proto route2 reqid 0 mode transport >> level 3 >> tmpl src :: dst 2001::1 >> proto ah reqid 0 mode in_trigger >> tmpl src ff01::1 dst 7f00:1:: >> proto comp reqid 0 mode transport >> ``` > > I plan to fix this with the patch below. With this, the above policy > should be rejected. It still needs a bit of testing to make sure that > I prohibited no valid usecase with it. > > --- > Subject: [PATCH RFC ipsec] xfrm: Don't allow optional intermediate templates that > changes the address family > > When an optional intermediate template changes the address family, > it is unclear which family the next template should have. This can > lead to misinterpretations of IPv4/IPv6 addresses. So reject > optional intermediate templates on insertion time. > > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") > Reported-by: Hyunwoo Kim <v4bel@theori.io> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> > --- > net/xfrm/xfrm_user.c | 9 +++++++++ > 1 file changed, 9 insertions(+) > What's the status of this patch? I'm asking because LTS kernels are affected and we'll have to fix them too. I tried searching for a related patch on public ml archives and into the IPSEC repositories and I couldn't find one. Thanks! ta
On Thu, Mar 30, 2023 at 08:42:56AM +0100, Tudor Ambarus wrote: > Hi, Steffen! > > On 3/24/23 09:57, Steffen Klassert wrote: > > > > I plan to fix this with the patch below. With this, the above policy > > should be rejected. It still needs a bit of testing to make sure that > > I prohibited no valid usecase with it. > > > > --- > > Subject: [PATCH RFC ipsec] xfrm: Don't allow optional intermediate templates that > > changes the address family > > > > When an optional intermediate template changes the address family, > > it is unclear which family the next template should have. This can > > lead to misinterpretations of IPv4/IPv6 addresses. So reject > > optional intermediate templates on insertion time. > > > > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") > > Reported-by: Hyunwoo Kim <v4bel@theori.io> > > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> > > --- > > net/xfrm/xfrm_user.c | 9 +++++++++ > > 1 file changed, 9 insertions(+) > > > > What's the status of this patch? I'm asking because LTS kernels are > affected and we'll have to fix them too. I tried searching for a related > patch on public ml archives and into the IPSEC repositories and I > couldn't find one. I'll likely submit that patch tomorrow. I will go to the ipsec tree soon, if there are no objections on the list.
diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 2d20be6ffb7e..4d8caaedc9d4 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -617,7 +617,8 @@ static void amt_send_discovery(struct amt_dev *amt) struct sk_buff *skb; struct iphdr *iph; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; u32 len; int err; @@ -629,7 +630,7 @@ static void amt_send_discovery(struct amt_dev *amt) if (!netif_running(amt->stream_dev) || !netif_running(amt->dev)) goto out; - rt = ip_route_output_ports(amt->net, &fl4, sock->sk, + rt = ip_route_output_ports(amt->net, fl4, sock->sk, amt->discovery_ip, amt->local_ip, amt->gw_port, amt->relay_port, IPPROTO_UDP, 0, @@ -706,7 +707,8 @@ static void amt_send_request(struct amt_dev *amt, bool v6) struct sk_buff *skb; struct iphdr *iph; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; u32 len; int err; @@ -718,7 +720,7 @@ static void amt_send_request(struct amt_dev *amt, bool v6) if (!netif_running(amt->stream_dev) || !netif_running(amt->dev)) goto out; - rt = ip_route_output_ports(amt->net, &fl4, sock->sk, + rt = ip_route_output_ports(amt->net, fl4, sock->sk, amt->remote_ip, amt->local_ip, amt->gw_port, amt->relay_port, IPPROTO_UDP, 0, @@ -2554,7 +2556,8 @@ static void amt_send_advertisement(struct amt_dev *amt, __be32 nonce, struct sk_buff *skb; struct iphdr *iph; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; u32 len; int err; @@ -2566,7 +2569,7 @@ static void amt_send_advertisement(struct amt_dev *amt, __be32 nonce, if (!netif_running(amt->stream_dev) || !netif_running(amt->dev)) goto out; - rt = ip_route_output_ports(amt->net, &fl4, sock->sk, + rt = ip_route_output_ports(amt->net, fl4, sock->sk, daddr, amt->local_ip, dport, amt->relay_port, IPPROTO_UDP, 0, diff --git a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c index da8d10475a08..e06a6d0a3595 100644 --- a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c +++ b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c @@ -95,10 +95,11 @@ cxgb_find_route(struct cxgb4_lld_info *lldi, __be16 peer_port, u8 tos) { struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; struct neighbour *n; - rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, + rt = ip_route_output_ports(&init_net, fl4, NULL, peer_ip, local_ip, peer_port, local_port, IPPROTO_TCP, tos & ~INET_ECN_MASK, 0); if (IS_ERR(rt)) diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 0fe78826c8fa..6e4422ce89b5 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -136,7 +136,8 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) struct pptp_opt *opt = &po->proto.pptp; struct pptp_gre_header *hdr; unsigned int header_len = sizeof(*hdr); - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; int islcp; int len; unsigned char *data; @@ -151,7 +152,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) if (sk_pppox(po)->sk_state & PPPOX_DEAD) goto tx_error; - rt = ip_route_output_ports(net, &fl4, NULL, + rt = ip_route_output_ports(net, fl4, NULL, opt->dst_addr.sin_addr.s_addr, opt->src_addr.sin_addr.s_addr, 0, 0, IPPROTO_GRE, @@ -230,8 +231,8 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) iph->frag_off = 0; iph->protocol = IPPROTO_GRE; iph->tos = 0; - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; + iph->daddr = fl4->daddr; + iph->saddr = fl4->saddr; iph->ttl = ip4_dst_hoplimit(&rt->dst); iph->tot_len = htons(skb->len); @@ -405,7 +406,8 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) @@ -438,7 +440,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, po->chan.private = sk; po->chan.ops = &pptp_chan_ops; - rt = ip_route_output_ports(sock_net(sk), &fl4, sk, + rt = ip_route_output_ports(sock_net(sk), fl4, sk, opt->dst_addr.sin_addr.s_addr, opt->src_addr.sin_addr.s_addr, 0, 0, diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4d1af0cd7d99..12656eb49e50 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -103,7 +103,8 @@ void ip4_datagram_release_cb(struct sock *sk) const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct dst_entry *dst; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; struct rtable *rt; rcu_read_lock(); @@ -116,7 +117,7 @@ void ip4_datagram_release_cb(struct sock *sk) inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; - rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c920aa9a62a9..b7de65708b37 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -350,7 +350,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size = mtu; @@ -366,7 +367,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) } skb->priority = TC_PRIO_CONTROL; - rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, + rt = ip_route_output_ports(net, fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { @@ -389,10 +390,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; - pip->daddr = fl4.daddr; + pip->daddr = fl4->daddr; rcu_read_lock(); - pip->saddr = igmpv3_get_srcaddr(dev, &fl4); + pip->saddr = igmpv3_get_srcaddr(dev, fl4); rcu_read_unlock(); pip->protocol = IPPROTO_IGMP; @@ -730,7 +731,8 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; __be32 dst; int hlen, tlen; @@ -746,7 +748,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, else dst = group; - rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, + rt = ip_route_output_ports(net, fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) @@ -775,7 +777,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; - iph->saddr = fl4.saddr; + iph->saddr = fl4->saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index eec1f6df80d8..994dd589835c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1829,7 +1829,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct net_device *vif_dev; struct net_device *dev; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; int encap = 0; vif_dev = vif_dev_read(vif); @@ -1849,7 +1850,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; if (vif->flags & VIFF_TUNNEL) { - rt = ip_route_output_ports(net, &fl4, NULL, + rt = ip_route_output_ports(net, fl4, NULL, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, @@ -1858,7 +1859,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; encap = sizeof(struct iphdr); } else { - rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, + rt = ip_route_output_ports(net, fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, RT_TOS(iph->tos), vif->link); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 47b6607a1370..078a97742b7d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -567,7 +567,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 rel_type = type; u8 rel_code = code; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); @@ -608,7 +609,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, eiph = ip_hdr(skb2); /* Try to guess incoming interface */ - rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, + rt = ip_route_output_ports(dev_net(skb->dev), fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); if (IS_ERR(rt)) goto out; @@ -618,7 +619,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { - rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + rt = ip_route_output_ports(dev_net(skb->dev), fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 70d81bba5093..fc2fb6bae588 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1098,13 +1098,14 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) struct net_device *tdev = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { - struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4, + struct rtable *rt = ip_route_output_ports(tunnel->net, fl4, NULL, iph->daddr, iph->saddr, 0, 0,
If you pass the address of the struct flowi4 you declared as a local variable as the fl4 argument to ip_route_output_ports(), the subsequent call to xfrm_state_find() will read the local variable by AF_INET6 rather than AF_INET as per policy, which could cause it to go out of scope on the kernel stack. Reported-by: syzbot+ada7c035554bcee65580@syzkaller.appspotmail.com Signed-off-by: Hyunwoo Kim <v4bel@theori.io> --- drivers/net/amt.c | 15 +++++++++------ .../net/ethernet/chelsio/libcxgb/libcxgb_cm.c | 5 +++-- drivers/net/ppp/pptp.c | 14 ++++++++------ net/ipv4/datagram.c | 5 +++-- net/ipv4/igmp.c | 16 +++++++++------- net/ipv4/ipmr.c | 7 ++++--- net/ipv6/ip6_tunnel.c | 7 ++++--- net/ipv6/sit.c | 5 +++-- 8 files changed, 43 insertions(+), 31 deletions(-)