Message ID | 20230912013332.2048422-1-jrife@google.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net] net: prevent address overwrite in connect() and sendmsg() | expand |
Jordan Rife wrote: > commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") > ensured that kernel_connect() will not overwrite the address parameter > in cases where BPF connect hooks perform an address rewrite. However, > there remain other cases where BPF hooks can overwrite an address held > by a kernel client. > > ==Scenarios Tested== > > * Code in the SMB and Ceph modules calls sock->ops->connect() directly, > allowing the address overwrite to occur. In the case of SMB, this can > lead to broken mounts. These should probably call kernel_connect instead. > * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, > passing a pointer to the mount address in msg->msg_name which is > later overwritten by a BPF sendmsg hook. This can lead to broken NFS > mounts. Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. The arguments are not exacty the same, so not 100% this is feasible. But it's preferable if in-kernel callers use the kernel_.. API rather than bypass it. Exactly for issues like the one you report. > In order to more comprehensively fix this class of problems, this patch > pushes the address copy deeper into the stack and introduces an address > copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers > from address rewrites. > > Signed-off-by: Jordan Rife <jrife@google.com> > --- > net/ipv4/af_inet.c | 18 ++++++++++++++++++ > net/ipv4/udp.c | 21 ++++++++++++++++----- > net/ipv6/udp.c | 23 +++++++++++++++++------ > net/socket.c | 7 +------ > 4 files changed, 52 insertions(+), 17 deletions(-) > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index 3d2e30e204735..c37d484fbee34 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -568,6 +568,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, > { > struct sock *sk = sock->sk; > const struct proto *prot; > + struct sockaddr_storage addr; > int err; > > if (addr_len < sizeof(uaddr->sa_family)) > @@ -580,6 +581,14 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, > return prot->disconnect(sk, flags); > > if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { > + if (uaddr && addr_len <= sizeof(addr)) { > + /* pre_connect can rewrite uaddr, so make a copy to > + * insulate the caller. > + */ > + memcpy(&addr, uaddr, addr_len); > + uaddr = (struct sockaddr *)&addr; > + } > + > err = prot->pre_connect(sk, uaddr, addr_len); > if (err) > return err; > @@ -625,6 +634,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, > int addr_len, int flags, int is_sendmsg) > { > struct sock *sk = sock->sk; > + struct sockaddr_storage addr; > int err; > long timeo; > > @@ -668,6 +678,14 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, > goto out; > > if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { > + if (uaddr && addr_len <= sizeof(addr)) { > + /* pre_connect can rewrite uaddr, so make a copy to > + * insulate the caller. > + */ > + memcpy(&addr, uaddr, addr_len); > + uaddr = (struct sockaddr *)&addr; > + } > + > err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); > if (err) > goto out; > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index f39b9c8445808..5f5ee2752eeb7 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -1142,18 +1142,29 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) > } > > if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { > + struct sockaddr_in tmp_addr; > + struct sockaddr_in *addr = usin; > + > + /* BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK can rewrite usin, so make a > + * copy to insulate the caller. > + */ > + if (usin && msg->msg_namelen <= sizeof(tmp_addr)) { > + memcpy(&tmp_addr, usin, msg->msg_namelen); > + addr = &tmp_addr; > + } > + > err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, > - (struct sockaddr *)usin, &ipc.addr); > + (struct sockaddr *)addr, &ipc.addr); > if (err) > goto out_free; > - if (usin) { > - if (usin->sin_port == 0) { > + if (addr) { > + if (addr->sin_port == 0) { > /* BPF program set invalid port. Reject it. */ > err = -EINVAL; > goto out_free; > } > - daddr = usin->sin_addr.s_addr; > - dport = usin->sin_port; > + daddr = addr->sin_addr.s_addr; > + dport = addr->sin_port; > } > } > > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c > index 86b5d509a4688..cbc1917fad629 100644 > --- a/net/ipv6/udp.c > +++ b/net/ipv6/udp.c > @@ -1506,26 +1506,37 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) > fl6->fl6_sport = inet->inet_sport; > > if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { > + struct sockaddr_in6 tmp_addr; > + struct sockaddr_in6 *addr = sin6; > + > + /* BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK can rewrite sin6, so make a > + * copy to insulate the caller. > + */ > + if (sin6 && addr_len <= sizeof(tmp_addr)) { > + memcpy(&tmp_addr, sin6, addr_len); > + addr = &tmp_addr; > + } > + > err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, > - (struct sockaddr *)sin6, > + (struct sockaddr *)addr, > &fl6->saddr); > if (err) > goto out_no_dst; > - if (sin6) { > - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { > + if (addr) { > + if (ipv6_addr_v4mapped(&addr->sin6_addr)) { > /* BPF program rewrote IPv6-only by IPv4-mapped > * IPv6. It's currently unsupported. > */ > err = -ENOTSUPP; > goto out_no_dst; > } > - if (sin6->sin6_port == 0) { > + if (addr->sin6_port == 0) { > /* BPF program set invalid port. Reject it. */ > err = -EINVAL; > goto out_no_dst; > } > - fl6->fl6_dport = sin6->sin6_port; > - fl6->daddr = sin6->sin6_addr; > + fl6->fl6_dport = addr->sin6_port; > + fl6->daddr = addr->sin6_addr; > } > } > > diff --git a/net/socket.c b/net/socket.c > index c8b08b32f097e..39794d026fa11 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -3570,12 +3570,7 @@ EXPORT_SYMBOL(kernel_accept); > int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, > int flags) > { > - struct sockaddr_storage address; > - > - memcpy(&address, addr, addrlen); > - > - return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, > - addrlen, flags); > + return READ_ONCE(sock->ops)->connect(sock, addr, addrlen, flags); > } > EXPORT_SYMBOL(kernel_connect); > > -- > 2.42.0.283.g2d96d420d3-goog >
On 9/12/23 3:33 PM, Willem de Bruijn wrote: > Jordan Rife wrote: >> commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") >> ensured that kernel_connect() will not overwrite the address parameter >> in cases where BPF connect hooks perform an address rewrite. However, >> there remain other cases where BPF hooks can overwrite an address held >> by a kernel client. >> >> ==Scenarios Tested== >> >> * Code in the SMB and Ceph modules calls sock->ops->connect() directly, >> allowing the address overwrite to occur. In the case of SMB, this can >> lead to broken mounts. > > These should probably call kernel_connect instead. > >> * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, >> passing a pointer to the mount address in msg->msg_name which is >> later overwritten by a BPF sendmsg hook. This can lead to broken NFS >> mounts. > > Similarly, this could call kernel_sendmsg, and the extra copy handled > in that wrapper. The arguments are not exacty the same, so not 100% > this is feasible. > > But it's preferable if in-kernel callers use the kernel_.. API rather > than bypass it. Exactly for issues like the one you report. Fully agree, if it's feasible it would be better to convert them over to in-kernel API. >> In order to more comprehensively fix this class of problems, this patch >> pushes the address copy deeper into the stack and introduces an address >> copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers >> from address rewrites. >> >> Signed-off-by: Jordan Rife <jrife@google.com>
> These should probably call kernel_connect instead. > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. I was considering this approach initially, but one concern I had was that there are other instances I see of modules calling ops->connect() directly (bypassing kernel_connect()): - net/netfilter/ipvs/ip_vs_sync.c: make_send_sock() - drivers/block/drbd/drbd_receiver.c: drbd_try_connect() - drivers/infiniband/hw/erdma/erdma_cm.c: kernel_bindconnect() - drivers/infiniband/sw/siw/siw_cm.c: kernel_bindconnect() - fs/ocfs2/cluster/tcp.c: o2net_start_connect() - net/rds/tcp_connect.c: rds_tcp_conn_path_connect() and ops->sendmsg(): - net/smc/af_smc.c: smc_sendmsg() - drivers/vhost/net.c: vhost_tx_batch(), handle_tx_copy(), handle_tx_zerocopy() - drivers/target/iscsi/iscsi_target_util.c: iscsit_fe_sendpage_sg() which (at least in theory) leaves them open to the same problem I'm seeing with NFS/SMB right now. I worry that even if all these instances were swapped out with kernel_sendmsg() and kernel_connect(), it would turn into a game of whac-a-mole in the future as new changes or new modules may reintroduce direct calls to sock->ops->connect() or sock->ops->sendmsg(). -Jordan On Tue, Sep 12, 2023 at 7:22 AM Daniel Borkmann <daniel@iogearbox.net> wrote: > > On 9/12/23 3:33 PM, Willem de Bruijn wrote: > > Jordan Rife wrote: > >> commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") > >> ensured that kernel_connect() will not overwrite the address parameter > >> in cases where BPF connect hooks perform an address rewrite. However, > >> there remain other cases where BPF hooks can overwrite an address held > >> by a kernel client. > >> > >> ==Scenarios Tested== > >> > >> * Code in the SMB and Ceph modules calls sock->ops->connect() directly, > >> allowing the address overwrite to occur. In the case of SMB, this can > >> lead to broken mounts. > > > > These should probably call kernel_connect instead. > > > >> * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, > >> passing a pointer to the mount address in msg->msg_name which is > >> later overwritten by a BPF sendmsg hook. This can lead to broken NFS > >> mounts. > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled > > in that wrapper. The arguments are not exacty the same, so not 100% > > this is feasible. > > > > But it's preferable if in-kernel callers use the kernel_.. API rather > > than bypass it. Exactly for issues like the one you report. > > Fully agree, if it's feasible it would be better to convert them over to > in-kernel API. > > >> In order to more comprehensively fix this class of problems, this patch > >> pushes the address copy deeper into the stack and introduces an address > >> copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers > >> from address rewrites. > >> > >> Signed-off-by: Jordan Rife <jrife@google.com>
On Tue, Sep 12, 2023 at 2:31 PM Jordan Rife <jrife@google.com> wrote: > > > These should probably call kernel_connect instead. > > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. > > I was considering this approach initially, but one concern I had was > that there are other instances I see of modules calling ops->connect() > directly (bypassing kernel_connect()): > > - net/netfilter/ipvs/ip_vs_sync.c: make_send_sock() > - drivers/block/drbd/drbd_receiver.c: drbd_try_connect() > - drivers/infiniband/hw/erdma/erdma_cm.c: kernel_bindconnect() > - drivers/infiniband/sw/siw/siw_cm.c: kernel_bindconnect() > - fs/ocfs2/cluster/tcp.c: o2net_start_connect() > - net/rds/tcp_connect.c: rds_tcp_conn_path_connect() > > and ops->sendmsg(): > > - net/smc/af_smc.c: smc_sendmsg() > - drivers/vhost/net.c: vhost_tx_batch(), handle_tx_copy(), handle_tx_zerocopy() > - drivers/target/iscsi/iscsi_target_util.c: iscsit_fe_sendpage_sg() > > which (at least in theory) leaves them open to the same problem I'm > seeing with NFS/SMB right now. I worry that even if all these > instances were swapped out with kernel_sendmsg() and kernel_connect(), > it would turn into a game of whac-a-mole in the future as new changes > or new modules may reintroduce direct calls to sock->ops->connect() or > sock->ops->sendmsg(). Ideally these all would use the proper kernel-internal APIs. I care less about new code. If all examples are clear, that will do the right thing, or is a simple fix-up worst case. Question is if changing all these callers is suitable for a patch targeting net. The changes do seem like trivial one liners? > -Jordan > > > > On Tue, Sep 12, 2023 at 7:22 AM Daniel Borkmann <daniel@iogearbox.net> wrote: > > > > On 9/12/23 3:33 PM, Willem de Bruijn wrote: > > > Jordan Rife wrote: > > >> commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") > > >> ensured that kernel_connect() will not overwrite the address parameter > > >> in cases where BPF connect hooks perform an address rewrite. However, > > >> there remain other cases where BPF hooks can overwrite an address held > > >> by a kernel client. > > >> > > >> ==Scenarios Tested== > > >> > > >> * Code in the SMB and Ceph modules calls sock->ops->connect() directly, > > >> allowing the address overwrite to occur. In the case of SMB, this can > > >> lead to broken mounts. > > > > > > These should probably call kernel_connect instead. > > > > > >> * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, > > >> passing a pointer to the mount address in msg->msg_name which is > > >> later overwritten by a BPF sendmsg hook. This can lead to broken NFS > > >> mounts. > > > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled > > > in that wrapper. The arguments are not exacty the same, so not 100% > > > this is feasible. > > > > > > But it's preferable if in-kernel callers use the kernel_.. API rather > > > than bypass it. Exactly for issues like the one you report. > > > > Fully agree, if it's feasible it would be better to convert them over to > > in-kernel API. > > > > >> In order to more comprehensively fix this class of problems, this patch > > >> pushes the address copy deeper into the stack and introduces an address > > >> copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers > > >> from address rewrites. > > >> > > >> Signed-off-by: Jordan Rife <jrife@google.com>
> Ideally these all would use the proper kernel-internal APIs. > I care less about new code. If all examples are clear, that will do the right thing, or is a simple fix-up worst case. Fair enough. > The changes do seem like trivial one liners? Looks like it. Still, it's going to take some time to patch+test each of these. I'll start with NFS (SUNRPC), SMB, and Ceph, since I can easily test them. > Question is if changing all these callers is suitable for a patch targeting net. One patch to net will be needed to add an address copy to kernel_sendmsg(), but I guess I'll need to send some other patches to the appropriate subsystem maintainers for SUNRPC, SMB, and Ceph to swap out calls. > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. Would it make more sense to do the address copy in sock_sendmsg() instead? Are kernel callers "supposed" to use kernel_sendmsg() or is it valid for them to use sock_sendmsg() as many of them seem to want to do today. Seeing as sock_sendmsg() is called by kernel_sendmsg(), adding the copy in sock_sendmsg() fixes this problem for callers to both of these and avoids the need for patching all modules that call sock_sendmsg(). -Jordan On Tue, Sep 12, 2023 at 12:36 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > On Tue, Sep 12, 2023 at 2:31 PM Jordan Rife <jrife@google.com> wrote: > > > > > These should probably call kernel_connect instead. > > > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. > > > > I was considering this approach initially, but one concern I had was > > that there are other instances I see of modules calling ops->connect() > > directly (bypassing kernel_connect()): > > > > - net/netfilter/ipvs/ip_vs_sync.c: make_send_sock() > > - drivers/block/drbd/drbd_receiver.c: drbd_try_connect() > > - drivers/infiniband/hw/erdma/erdma_cm.c: kernel_bindconnect() > > - drivers/infiniband/sw/siw/siw_cm.c: kernel_bindconnect() > > - fs/ocfs2/cluster/tcp.c: o2net_start_connect() > > - net/rds/tcp_connect.c: rds_tcp_conn_path_connect() > > > > and ops->sendmsg(): > > > > - net/smc/af_smc.c: smc_sendmsg() > > - drivers/vhost/net.c: vhost_tx_batch(), handle_tx_copy(), handle_tx_zerocopy() > > - drivers/target/iscsi/iscsi_target_util.c: iscsit_fe_sendpage_sg() > > > > which (at least in theory) leaves them open to the same problem I'm > > seeing with NFS/SMB right now. I worry that even if all these > > instances were swapped out with kernel_sendmsg() and kernel_connect(), > > it would turn into a game of whac-a-mole in the future as new changes > > or new modules may reintroduce direct calls to sock->ops->connect() or > > sock->ops->sendmsg(). > > Ideally these all would use the proper kernel-internal APIs. > > I care less about new code. If all examples are clear, that > will do the right thing, or is a simple fix-up worst case. > > Question is if changing all these callers is suitable for a patch > targeting net. > > The changes do seem like trivial one liners? > > > -Jordan > > > > > > > > On Tue, Sep 12, 2023 at 7:22 AM Daniel Borkmann <daniel@iogearbox.net> wrote: > > > > > > On 9/12/23 3:33 PM, Willem de Bruijn wrote: > > > > Jordan Rife wrote: > > > >> commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") > > > >> ensured that kernel_connect() will not overwrite the address parameter > > > >> in cases where BPF connect hooks perform an address rewrite. However, > > > >> there remain other cases where BPF hooks can overwrite an address held > > > >> by a kernel client. > > > >> > > > >> ==Scenarios Tested== > > > >> > > > >> * Code in the SMB and Ceph modules calls sock->ops->connect() directly, > > > >> allowing the address overwrite to occur. In the case of SMB, this can > > > >> lead to broken mounts. > > > > > > > > These should probably call kernel_connect instead. > > > > > > > >> * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, > > > >> passing a pointer to the mount address in msg->msg_name which is > > > >> later overwritten by a BPF sendmsg hook. This can lead to broken NFS > > > >> mounts. > > > > > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled > > > > in that wrapper. The arguments are not exacty the same, so not 100% > > > > this is feasible. > > > > > > > > But it's preferable if in-kernel callers use the kernel_.. API rather > > > > than bypass it. Exactly for issues like the one you report. > > > > > > Fully agree, if it's feasible it would be better to convert them over to > > > in-kernel API. > > > > > > >> In order to more comprehensively fix this class of problems, this patch > > > >> pushes the address copy deeper into the stack and introduces an address > > > >> copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers > > > >> from address rewrites. > > > >> > > > >> Signed-off-by: Jordan Rife <jrife@google.com>
On Tue, Sep 12, 2023 at 4:07 PM Jordan Rife <jrife@google.com> wrote: > > > Ideally these all would use the proper kernel-internal APIs. > > I care less about new code. If all examples are clear, that will do the right thing, or is a simple fix-up worst case. > > Fair enough. > > > The changes do seem like trivial one liners? > > Looks like it. Still, it's going to take some time to patch+test each > of these. I'll start with NFS (SUNRPC), SMB, and Ceph, since I can > easily test them. > > > Question is if changing all these callers is suitable for a patch targeting net. > One patch to net will be needed to add an address copy to > kernel_sendmsg(), but I guess I'll need to send some other patches to > the appropriate subsystem maintainers for SUNRPC, SMB, and Ceph to > swap out calls. If we take this path, it could be a single patch. The subsystem maintainers should be CC:ed so that they can (N)ACK it. But I do not mean to ask to split it up and test each one separately. The change from sock->ops->connect to kernel_connect is certainly trivial enough that compile testing should suffice. Note that kernel_connect actually uses READ_ONCE(sock->ops) because of a data race. All callers that call a socket that may be subject to IPV6_ADDRFORM should do that. Likely some of those open coded examples are affected, and do not. This is another example why using a single interface is preferable. > > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. > > Would it make more sense to do the address copy in sock_sendmsg() > instead? Are kernel callers "supposed" to use kernel_sendmsg() or is > it valid for them to use sock_sendmsg() as many of them seem to want > to do today. Seeing as sock_sendmsg() is called by kernel_sendmsg(), > adding the copy in sock_sendmsg() fixes this problem for callers to > both of these and avoids the need for patching all modules that call > sock_sendmsg(). I think it is "supposed" to be the API for these cases. But as you show, clearly it isn't today. Practice trumps theory. The only question is whether we should pursue your original patch and accept that this will continue, or one that improves the situation, but touches more files and thus has a higher risk of merge conflicts. I'd like to give others some time to chime in. I've given my opinion, but it's only one. > -Jordan > > On Tue, Sep 12, 2023 at 12:36 PM Willem de Bruijn > <willemdebruijn.kernel@gmail.com> wrote: > > > > On Tue, Sep 12, 2023 at 2:31 PM Jordan Rife <jrife@google.com> wrote: > > > > > > > These should probably call kernel_connect instead. > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. > > > > > > I was considering this approach initially, but one concern I had was > > > that there are other instances I see of modules calling ops->connect() > > > directly (bypassing kernel_connect()): > > > > > > - net/netfilter/ipvs/ip_vs_sync.c: make_send_sock() > > > - drivers/block/drbd/drbd_receiver.c: drbd_try_connect() > > > - drivers/infiniband/hw/erdma/erdma_cm.c: kernel_bindconnect() > > > - drivers/infiniband/sw/siw/siw_cm.c: kernel_bindconnect() > > > - fs/ocfs2/cluster/tcp.c: o2net_start_connect() > > > - net/rds/tcp_connect.c: rds_tcp_conn_path_connect() > > > > > > and ops->sendmsg(): > > > > > > - net/smc/af_smc.c: smc_sendmsg() > > > - drivers/vhost/net.c: vhost_tx_batch(), handle_tx_copy(), handle_tx_zerocopy() > > > - drivers/target/iscsi/iscsi_target_util.c: iscsit_fe_sendpage_sg() > > > > > > which (at least in theory) leaves them open to the same problem I'm > > > seeing with NFS/SMB right now. I worry that even if all these > > > instances were swapped out with kernel_sendmsg() and kernel_connect(), > > > it would turn into a game of whac-a-mole in the future as new changes > > > or new modules may reintroduce direct calls to sock->ops->connect() or > > > sock->ops->sendmsg(). > > > > Ideally these all would use the proper kernel-internal APIs. > > > > I care less about new code. If all examples are clear, that > > will do the right thing, or is a simple fix-up worst case. > > > > Question is if changing all these callers is suitable for a patch > > targeting net. > > > > The changes do seem like trivial one liners? > > > > > -Jordan > > > > > > > > > > > > On Tue, Sep 12, 2023 at 7:22 AM Daniel Borkmann <daniel@iogearbox.net> wrote: > > > > > > > > On 9/12/23 3:33 PM, Willem de Bruijn wrote: > > > > > Jordan Rife wrote: > > > > >> commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") > > > > >> ensured that kernel_connect() will not overwrite the address parameter > > > > >> in cases where BPF connect hooks perform an address rewrite. However, > > > > >> there remain other cases where BPF hooks can overwrite an address held > > > > >> by a kernel client. > > > > >> > > > > >> ==Scenarios Tested== > > > > >> > > > > >> * Code in the SMB and Ceph modules calls sock->ops->connect() directly, > > > > >> allowing the address overwrite to occur. In the case of SMB, this can > > > > >> lead to broken mounts. > > > > > > > > > > These should probably call kernel_connect instead. > > > > > > > > > >> * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, > > > > >> passing a pointer to the mount address in msg->msg_name which is > > > > >> later overwritten by a BPF sendmsg hook. This can lead to broken NFS > > > > >> mounts. > > > > > > > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled > > > > > in that wrapper. The arguments are not exacty the same, so not 100% > > > > > this is feasible. > > > > > > > > > > But it's preferable if in-kernel callers use the kernel_.. API rather > > > > > than bypass it. Exactly for issues like the one you report. > > > > > > > > Fully agree, if it's feasible it would be better to convert them over to > > > > in-kernel API. > > > > > > > > >> In order to more comprehensively fix this class of problems, this patch > > > > >> pushes the address copy deeper into the stack and introduces an address > > > > >> copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers > > > > >> from address rewrites. > > > > >> > > > > >> Signed-off-by: Jordan Rife <jrife@google.com>
> If we take this path, it could be a single patch. The subsystem > maintainers should be CC:ed so that they can (N)ACK it. > > But I do not mean to ask to split it up and test each one separately. > > The change from sock->ops->connect to kernel_connect is certainly > trivial enough that compile testing should suffice. Ack. Thanks for clarifying. > The only question is whether we should pursue your original patch and > accept that this will continue, or one that improves the situation, > but touches more files and thus has a higher risk of merge conflicts. > > I'd like to give others some time to chime in. I've given my opinion, > but it's only one. > > I'd like to give others some time to chime in. I've given my opinion, > but it's only one. Sounds good. I'll wait to hear others' opinions on the best path forward. -Jordan On Tue, Sep 12, 2023 at 1:49 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > On Tue, Sep 12, 2023 at 4:07 PM Jordan Rife <jrife@google.com> wrote: > > > > > Ideally these all would use the proper kernel-internal APIs. > > > I care less about new code. If all examples are clear, that will do the right thing, or is a simple fix-up worst case. > > > > Fair enough. > > > > > The changes do seem like trivial one liners? > > > > Looks like it. Still, it's going to take some time to patch+test each > > of these. I'll start with NFS (SUNRPC), SMB, and Ceph, since I can > > easily test them. > > > > > Question is if changing all these callers is suitable for a patch targeting net. > > One patch to net will be needed to add an address copy to > > kernel_sendmsg(), but I guess I'll need to send some other patches to > > the appropriate subsystem maintainers for SUNRPC, SMB, and Ceph to > > swap out calls. > > If we take this path, it could be a single patch. The subsystem > maintainers should be CC:ed so that they can (N)ACK it. > > But I do not mean to ask to split it up and test each one separately. > > The change from sock->ops->connect to kernel_connect is certainly > trivial enough that compile testing should suffice. > > Note that kernel_connect actually uses READ_ONCE(sock->ops) because of > a data race. All callers that call a socket that may be subject to > IPV6_ADDRFORM should do that. Likely some of those open coded examples > are affected, and do not. This is another example why using a single > interface is preferable. > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. > > > > Would it make more sense to do the address copy in sock_sendmsg() > > instead? Are kernel callers "supposed" to use kernel_sendmsg() or is > > it valid for them to use sock_sendmsg() as many of them seem to want > > to do today. Seeing as sock_sendmsg() is called by kernel_sendmsg(), > > adding the copy in sock_sendmsg() fixes this problem for callers to > > both of these and avoids the need for patching all modules that call > > sock_sendmsg(). > > I think it is "supposed" to be the API for these cases. But as you > show, clearly it isn't today. Practice trumps theory. > > The only question is whether we should pursue your original patch and > accept that this will continue, or one that improves the situation, > but touches more files and thus has a higher risk of merge conflicts. > > I'd like to give others some time to chime in. I've given my opinion, > but it's only one. > > > > > -Jordan > > > > On Tue, Sep 12, 2023 at 12:36 PM Willem de Bruijn > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > On Tue, Sep 12, 2023 at 2:31 PM Jordan Rife <jrife@google.com> wrote: > > > > > > > > > These should probably call kernel_connect instead. > > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled in that wrapper. > > > > > > > > I was considering this approach initially, but one concern I had was > > > > that there are other instances I see of modules calling ops->connect() > > > > directly (bypassing kernel_connect()): > > > > > > > > - net/netfilter/ipvs/ip_vs_sync.c: make_send_sock() > > > > - drivers/block/drbd/drbd_receiver.c: drbd_try_connect() > > > > - drivers/infiniband/hw/erdma/erdma_cm.c: kernel_bindconnect() > > > > - drivers/infiniband/sw/siw/siw_cm.c: kernel_bindconnect() > > > > - fs/ocfs2/cluster/tcp.c: o2net_start_connect() > > > > - net/rds/tcp_connect.c: rds_tcp_conn_path_connect() > > > > > > > > and ops->sendmsg(): > > > > > > > > - net/smc/af_smc.c: smc_sendmsg() > > > > - drivers/vhost/net.c: vhost_tx_batch(), handle_tx_copy(), handle_tx_zerocopy() > > > > - drivers/target/iscsi/iscsi_target_util.c: iscsit_fe_sendpage_sg() > > > > > > > > which (at least in theory) leaves them open to the same problem I'm > > > > seeing with NFS/SMB right now. I worry that even if all these > > > > instances were swapped out with kernel_sendmsg() and kernel_connect(), > > > > it would turn into a game of whac-a-mole in the future as new changes > > > > or new modules may reintroduce direct calls to sock->ops->connect() or > > > > sock->ops->sendmsg(). > > > > > > Ideally these all would use the proper kernel-internal APIs. > > > > > > I care less about new code. If all examples are clear, that > > > will do the right thing, or is a simple fix-up worst case. > > > > > > Question is if changing all these callers is suitable for a patch > > > targeting net. > > > > > > The changes do seem like trivial one liners? > > > > > > > -Jordan > > > > > > > > > > > > > > > > On Tue, Sep 12, 2023 at 7:22 AM Daniel Borkmann <daniel@iogearbox.net> wrote: > > > > > > > > > > On 9/12/23 3:33 PM, Willem de Bruijn wrote: > > > > > > Jordan Rife wrote: > > > > > >> commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") > > > > > >> ensured that kernel_connect() will not overwrite the address parameter > > > > > >> in cases where BPF connect hooks perform an address rewrite. However, > > > > > >> there remain other cases where BPF hooks can overwrite an address held > > > > > >> by a kernel client. > > > > > >> > > > > > >> ==Scenarios Tested== > > > > > >> > > > > > >> * Code in the SMB and Ceph modules calls sock->ops->connect() directly, > > > > > >> allowing the address overwrite to occur. In the case of SMB, this can > > > > > >> lead to broken mounts. > > > > > > > > > > > > These should probably call kernel_connect instead. > > > > > > > > > > > >> * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, > > > > > >> passing a pointer to the mount address in msg->msg_name which is > > > > > >> later overwritten by a BPF sendmsg hook. This can lead to broken NFS > > > > > >> mounts. > > > > > > > > > > > > Similarly, this could call kernel_sendmsg, and the extra copy handled > > > > > > in that wrapper. The arguments are not exacty the same, so not 100% > > > > > > this is feasible. > > > > > > > > > > > > But it's preferable if in-kernel callers use the kernel_.. API rather > > > > > > than bypass it. Exactly for issues like the one you report. > > > > > > > > > > Fully agree, if it's feasible it would be better to convert them over to > > > > > in-kernel API. > > > > > > > > > > >> In order to more comprehensively fix this class of problems, this patch > > > > > >> pushes the address copy deeper into the stack and introduces an address > > > > > >> copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers > > > > > >> from address rewrites. > > > > > >> > > > > > >> Signed-off-by: Jordan Rife <jrife@google.com>
Hi Jordan, kernel test robot noticed the following build warnings: url: https://github.com/intel-lab-lkp/linux/commits/Jordan-Rife/net-prevent-address-overwrite-in-connect-and-sendmsg/20230912-093550 base: net/main patch link: https://lore.kernel.org/r/20230912013332.2048422-1-jrife%40google.com patch subject: [PATCH net] net: prevent address overwrite in connect() and sendmsg() config: x86_64-randconfig-161-20230912 (https://download.01.org/0day-ci/archive/20230913/202309131155.MonA0VTS-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14) 12.2.0 reproduce: (https://download.01.org/0day-ci/archive/20230913/202309131155.MonA0VTS-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Reported-by: Dan Carpenter <dan.carpenter@linaro.org> | Closes: https://lore.kernel.org/r/202309131155.MonA0VTS-lkp@intel.com/ smatch warnings: net/ipv4/af_inet.c:584 inet_dgram_connect() warn: variable dereferenced before check 'uaddr' (see line 580) vim +/uaddr +584 net/ipv4/af_inet.c ^1da177e4c3f41 Linus Torvalds 2005-04-16 566 int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, ^1da177e4c3f41 Linus Torvalds 2005-04-16 567 int addr_len, int flags) ^1da177e4c3f41 Linus Torvalds 2005-04-16 568 { ^1da177e4c3f41 Linus Torvalds 2005-04-16 569 struct sock *sk = sock->sk; 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 570 const struct proto *prot; 6113a07e1ad512 Jordan Rife 2023-09-11 571 struct sockaddr_storage addr; d74bad4e74ee37 Andrey Ignatov 2018-03-30 572 int err; ^1da177e4c3f41 Linus Torvalds 2005-04-16 573 6503d96168f891 Changli Gao 2010-03-31 574 if (addr_len < sizeof(uaddr->sa_family)) 6503d96168f891 Changli Gao 2010-03-31 575 return -EINVAL; 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 576 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 577 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 578 prot = READ_ONCE(sk->sk_prot); 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 579 ^1da177e4c3f41 Linus Torvalds 2005-04-16 @580 if (uaddr->sa_family == AF_UNSPEC) ^^^^^^^^^^^^^^^^ "uaddr" better not be NULL 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 581 return prot->disconnect(sk, flags); ^1da177e4c3f41 Linus Torvalds 2005-04-16 582 d74bad4e74ee37 Andrey Ignatov 2018-03-30 583 if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { 6113a07e1ad512 Jordan Rife 2023-09-11 @584 if (uaddr && addr_len <= sizeof(addr)) { ^^^^^ Remove this check? 6113a07e1ad512 Jordan Rife 2023-09-11 585 /* pre_connect can rewrite uaddr, so make a copy to 6113a07e1ad512 Jordan Rife 2023-09-11 586 * insulate the caller. 6113a07e1ad512 Jordan Rife 2023-09-11 587 */ 6113a07e1ad512 Jordan Rife 2023-09-11 588 memcpy(&addr, uaddr, addr_len); 6113a07e1ad512 Jordan Rife 2023-09-11 589 uaddr = (struct sockaddr *)&addr; 6113a07e1ad512 Jordan Rife 2023-09-11 590 } 6113a07e1ad512 Jordan Rife 2023-09-11 591 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 592 err = prot->pre_connect(sk, uaddr, addr_len); d74bad4e74ee37 Andrey Ignatov 2018-03-30 593 if (err) d74bad4e74ee37 Andrey Ignatov 2018-03-30 594 return err; d74bad4e74ee37 Andrey Ignatov 2018-03-30 595 } d74bad4e74ee37 Andrey Ignatov 2018-03-30 596 dcd01eeac14486 Eric Dumazet 2021-06-09 597 if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) ^1da177e4c3f41 Linus Torvalds 2005-04-16 598 return -EAGAIN; 364f997b5cfe1d Kuniyuki Iwashima 2022-10-06 599 return prot->connect(sk, uaddr, addr_len); ^1da177e4c3f41 Linus Torvalds 2005-04-16 600 }
On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > If we take this path, it could be a single patch. The subsystem > > maintainers should be CC:ed so that they can (N)ACK it. > > > > But I do not mean to ask to split it up and test each one separately. > > > > The change from sock->ops->connect to kernel_connect is certainly > > trivial enough that compile testing should suffice. > > Ack. Thanks for clarifying. > > > The only question is whether we should pursue your original patch and > > accept that this will continue, or one that improves the situation, > > but touches more files and thus has a higher risk of merge conflicts. > > > > I'd like to give others some time to chime in. I've given my opinion, > > but it's only one. > > > > I'd like to give others some time to chime in. I've given my opinion, > > but it's only one. > > Sounds good. I'll wait to hear others' opinions on the best path forward. No other comments so far. My hunch is that a short list of these changes ``` @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, if (rv < 0) return rv; - rv = s->ops->connect(s, raddr, size, flags); + rv = kernel_connect(s, raddr, size, flags); ``` is no more invasive than your proposed patch, and gives a more robust outcome. Please take a stab. If it proves at all non-trivial, e.g., in the conversion of arguments between kernel_sendmsg and sock_sendmsg/sock->ops->sendmsg, then let's submit your original patch. And I will do the conversion in net-next instead.
> Please take a stab. To clarify, the plan is to: 1) Swap out calls to sock->ops->connect() with kernel_connect() 2) Move the address copy to kernel_sendmsg() 3) Swap out calls to sock_sendmsg()/sock->ops->sendmsg() with kernel_sendmsg() > If it proves at all non-trivial, e.g., in the conversion of arguments > between kernel_sendmsg and sock_sendmsg/sock->ops->sendmsg, then let's > submit your original patch. And I will do the conversion in net-next > instead. I'll give it a try and see how trivial the changes are. -Jordan On Wed, Sep 13, 2023 at 7:03 AM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > > > If we take this path, it could be a single patch. The subsystem > > > maintainers should be CC:ed so that they can (N)ACK it. > > > > > > But I do not mean to ask to split it up and test each one separately. > > > > > > The change from sock->ops->connect to kernel_connect is certainly > > > trivial enough that compile testing should suffice. > > > > Ack. Thanks for clarifying. > > > > > The only question is whether we should pursue your original patch and > > > accept that this will continue, or one that improves the situation, > > > but touches more files and thus has a higher risk of merge conflicts. > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > but it's only one. > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > but it's only one. > > > > Sounds good. I'll wait to hear others' opinions on the best path forward. > > No other comments so far. > > My hunch is that a short list of these changes > > ``` > @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, > struct sockaddr *laddr, > if (rv < 0) > return rv; > > - rv = s->ops->connect(s, raddr, size, flags); > + rv = kernel_connect(s, raddr, size, flags); > ``` > > is no more invasive than your proposed patch, and gives a more robust outcome. > > Please take a stab. > > If it proves at all non-trivial, e.g., in the conversion of arguments > between kernel_sendmsg and sock_sendmsg/sock->ops->sendmsg, then let's > submit your original patch. And I will do the conversion in net-next > instead.
On Wed, Sep 13, 2023 at 2:04 PM Jordan Rife <jrife@google.com> wrote: > > > Please take a stab. > > To clarify, the plan is to: > > 1) Swap out calls to sock->ops->connect() with kernel_connect() > 2) Move the address copy to kernel_sendmsg() > 3) Swap out calls to sock_sendmsg()/sock->ops->sendmsg() with kernel_sendmsg() Exactly. > > If it proves at all non-trivial, e.g., in the conversion of arguments > > between kernel_sendmsg and sock_sendmsg/sock->ops->sendmsg, then let's > > submit your original patch. And I will do the conversion in net-next > > instead. > > I'll give it a try and see how trivial the changes are. Thanks! > -Jordan > > On Wed, Sep 13, 2023 at 7:03 AM Willem de Bruijn > <willemdebruijn.kernel@gmail.com> wrote: > > > > On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > > > > > If we take this path, it could be a single patch. The subsystem > > > > maintainers should be CC:ed so that they can (N)ACK it. > > > > > > > > But I do not mean to ask to split it up and test each one separately. > > > > > > > > The change from sock->ops->connect to kernel_connect is certainly > > > > trivial enough that compile testing should suffice. > > > > > > Ack. Thanks for clarifying. > > > > > > > The only question is whether we should pursue your original patch and > > > > accept that this will continue, or one that improves the situation, > > > > but touches more files and thus has a higher risk of merge conflicts. > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > but it's only one. > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > but it's only one. > > > > > > Sounds good. I'll wait to hear others' opinions on the best path forward. > > > > No other comments so far. > > > > My hunch is that a short list of these changes > > > > ``` > > @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, > > struct sockaddr *laddr, > > if (rv < 0) > > return rv; > > > > - rv = s->ops->connect(s, raddr, size, flags); > > + rv = kernel_connect(s, raddr, size, flags); > > ``` > > > > is no more invasive than your proposed patch, and gives a more robust outcome. > > > > Please take a stab. > > > > If it proves at all non-trivial, e.g., in the conversion of arguments > > between kernel_sendmsg and sock_sendmsg/sock->ops->sendmsg, then let's > > submit your original patch. And I will do the conversion in net-next > > instead.
On Wed, 2023-09-13 at 10:02 -0400, Willem de Bruijn wrote: > On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > > > If we take this path, it could be a single patch. The subsystem > > > maintainers should be CC:ed so that they can (N)ACK it. > > > > > > But I do not mean to ask to split it up and test each one separately. > > > > > > The change from sock->ops->connect to kernel_connect is certainly > > > trivial enough that compile testing should suffice. > > > > Ack. Thanks for clarifying. > > > > > The only question is whether we should pursue your original patch and > > > accept that this will continue, or one that improves the situation, > > > but touches more files and thus has a higher risk of merge conflicts. > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > but it's only one. > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > but it's only one. > > > > Sounds good. I'll wait to hear others' opinions on the best path forward. > > No other comments so far. > > My hunch is that a short list of these changes > > ``` > @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, > struct sockaddr *laddr, > if (rv < 0) > return rv; > > - rv = s->ops->connect(s, raddr, size, flags); > + rv = kernel_connect(s, raddr, size, flags); > ``` > > is no more invasive than your proposed patch, and gives a more robust outcome. > > Please take a stab. I'm sorry for the late feedback. For the records, I agree the cleanest fix described above should be attempted first. Thanks, Paolo
> 1) Swap out calls to sock->ops->connect() with kernel_connect() This is trivial, as expected. I have a patch ready that swaps out all occurrences of sock->ops->connect(). > 2) Move the address copy to kernel_sendmsg() > 3) Swap out calls to sock_sendmsg()/sock->ops->sendmsg() with kernel_sendmsg() This turns out to be less trivial. kernel_sensmsg() looks to be a special case of sock_sendmsg() with sock_sendmsg() being the more generic of the two: int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock_sendmsg(sock, msg); } It populates msg->msg_iter with a kvec whereas most cases I could find where sock_sendmsg() is used are using a bio_vec. Some examples: ==drivers/iscsi/iscsi_tcp.c: iscsi_sw_tcp_xmit_segment()== iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, copy); r = sock_sendmsg(sk, &msg); ==fs/ocfs2/cluster: o2net_sendpage()== bvec_set_virt(&bv, virt, size); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); while (1) { msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; mutex_lock(&sc->sc_send_lock); ret = sock_sendmsg(sc->sc_sock, &msg); ==net/sunrpc/svcsock.c: svc_udp_sendto()== iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, count, 0); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, count, 0); err = sock_sendmsg(svsk->sk_sock, &msg); } Maybe these two types are more interchangeable than I'm thinking, but it seems like it might be simpler to just do the address copy inside sock_sendmsg(). Does this revised plan sound reasonable: 1) Swap out calls to sock->ops->connect() with kernel_connect() 2) Move the address copy to sock_sendmsg() I also noticed that BPF hooks inside bind() can rewrite the bind address. Should we do something similar for kernel_bind: 1) Add an address copy to kernel_bind() 2) Swap out direct calls to ops->bind() with kernel_bind() -Jordan On Thu, Sep 14, 2023 at 1:24 AM Paolo Abeni <pabeni@redhat.com> wrote: > > On Wed, 2023-09-13 at 10:02 -0400, Willem de Bruijn wrote: > > On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > > > > > If we take this path, it could be a single patch. The subsystem > > > > maintainers should be CC:ed so that they can (N)ACK it. > > > > > > > > But I do not mean to ask to split it up and test each one separately. > > > > > > > > The change from sock->ops->connect to kernel_connect is certainly > > > > trivial enough that compile testing should suffice. > > > > > > Ack. Thanks for clarifying. > > > > > > > The only question is whether we should pursue your original patch and > > > > accept that this will continue, or one that improves the situation, > > > > but touches more files and thus has a higher risk of merge conflicts. > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > but it's only one. > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > but it's only one. > > > > > > Sounds good. I'll wait to hear others' opinions on the best path forward. > > > > No other comments so far. > > > > My hunch is that a short list of these changes > > > > ``` > > @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, > > struct sockaddr *laddr, > > if (rv < 0) > > return rv; > > > > - rv = s->ops->connect(s, raddr, size, flags); > > + rv = kernel_connect(s, raddr, size, flags); > > ``` > > > > is no more invasive than your proposed patch, and gives a more robust outcome. > > > > Please take a stab. > > I'm sorry for the late feedback. For the records, I agree the cleanest > fix described above should be attempted first. > > Thanks, > > Paolo >
On Thu, Sep 14, 2023 at 2:41 PM Jordan Rife <jrife@google.com> wrote: > > > 1) Swap out calls to sock->ops->connect() with kernel_connect() > > This is trivial, as expected. I have a patch ready that swaps out all > occurrences of sock->ops->connect(). > > > 2) Move the address copy to kernel_sendmsg() > > 3) Swap out calls to sock_sendmsg()/sock->ops->sendmsg() with kernel_sendmsg() > > This turns out to be less trivial. kernel_sensmsg() looks to be a > special case of sock_sendmsg() with sock_sendmsg() being the more > generic of the two: > > int kernel_sendmsg(struct socket *sock, struct msghdr *msg, > struct kvec *vec, size_t num, size_t size) > { > iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); > return sock_sendmsg(sock, msg); > } > > It populates msg->msg_iter with a kvec whereas most cases I could find > where sock_sendmsg() is used are using a bio_vec. Some examples: > > ==drivers/iscsi/iscsi_tcp.c: iscsi_sw_tcp_xmit_segment()== > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, copy); > > r = sock_sendmsg(sk, &msg); > > ==fs/ocfs2/cluster: o2net_sendpage()== > bvec_set_virt(&bv, virt, size); > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); > > while (1) { > msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; > mutex_lock(&sc->sc_send_lock); > ret = sock_sendmsg(sc->sc_sock, &msg); > > ==net/sunrpc/svcsock.c: svc_udp_sendto()== > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, > count, 0); > err = sock_sendmsg(svsk->sk_sock, &msg); > if (err == -ECONNREFUSED) { > /* ICMP error on earlier request. */ > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, > count, 0); > err = sock_sendmsg(svsk->sk_sock, &msg); > } > > Maybe these two types are more interchangeable than I'm thinking, but > it seems like it might be simpler to just do the address copy inside > sock_sendmsg(). Does this revised plan sound reasonable: > > 1) Swap out calls to sock->ops->connect() with kernel_connect() > 2) Move the address copy to sock_sendmsg() That also covers normal user code that is already protected. How about a __kernel_sendmsg that follows the sock_sendmsg API and is a thin wrapper exactly for in-kernel callers. > I also noticed that BPF hooks inside bind() can rewrite the bind > address. Should we do something similar for kernel_bind: > > 1) Add an address copy to kernel_bind() > 2) Swap out direct calls to ops->bind() with kernel_bind() Good catch. Sounds like it. At that point it might be worth splitting into three patches. > -Jordan > > On Thu, Sep 14, 2023 at 1:24 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > On Wed, 2023-09-13 at 10:02 -0400, Willem de Bruijn wrote: > > > On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > > > > > > > If we take this path, it could be a single patch. The subsystem > > > > > maintainers should be CC:ed so that they can (N)ACK it. > > > > > > > > > > But I do not mean to ask to split it up and test each one separately. > > > > > > > > > > The change from sock->ops->connect to kernel_connect is certainly > > > > > trivial enough that compile testing should suffice. > > > > > > > > Ack. Thanks for clarifying. > > > > > > > > > The only question is whether we should pursue your original patch and > > > > > accept that this will continue, or one that improves the situation, > > > > > but touches more files and thus has a higher risk of merge conflicts. > > > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > > but it's only one. > > > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > > but it's only one. > > > > > > > > Sounds good. I'll wait to hear others' opinions on the best path forward. > > > > > > No other comments so far. > > > > > > My hunch is that a short list of these changes > > > > > > ``` > > > @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, > > > struct sockaddr *laddr, > > > if (rv < 0) > > > return rv; > > > > > > - rv = s->ops->connect(s, raddr, size, flags); > > > + rv = kernel_connect(s, raddr, size, flags); > > > ``` > > > > > > is no more invasive than your proposed patch, and gives a more robust outcome. > > > > > > Please take a stab. > > > > I'm sorry for the late feedback. For the records, I agree the cleanest > > fix described above should be attempted first. > > > > Thanks, > > > > Paolo > >
> How about a __kernel_sendmsg that follows the sock_sendmsg API and is > a thin wrapper exactly for in-kernel callers. Makes sense. That works for me. > Good catch. Sounds like it. > At that point it might be worth splitting into three patches. Sounds good. I can do Patch 1: kernel_connect() changes (ready to go) Patch 2: sock_sendmsg() changes Patch 3: kernel_bind() changes -Jordan On Thu, Sep 14, 2023 at 5:17 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > On Thu, Sep 14, 2023 at 2:41 PM Jordan Rife <jrife@google.com> wrote: > > > > > 1) Swap out calls to sock->ops->connect() with kernel_connect() > > > > This is trivial, as expected. I have a patch ready that swaps out all > > occurrences of sock->ops->connect(). > > > > > 2) Move the address copy to kernel_sendmsg() > > > 3) Swap out calls to sock_sendmsg()/sock->ops->sendmsg() with kernel_sendmsg() > > > > This turns out to be less trivial. kernel_sensmsg() looks to be a > > special case of sock_sendmsg() with sock_sendmsg() being the more > > generic of the two: > > > > int kernel_sendmsg(struct socket *sock, struct msghdr *msg, > > struct kvec *vec, size_t num, size_t size) > > { > > iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); > > return sock_sendmsg(sock, msg); > > } > > > > It populates msg->msg_iter with a kvec whereas most cases I could find > > where sock_sendmsg() is used are using a bio_vec. Some examples: > > > > ==drivers/iscsi/iscsi_tcp.c: iscsi_sw_tcp_xmit_segment()== > > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, copy); > > > > r = sock_sendmsg(sk, &msg); > > > > ==fs/ocfs2/cluster: o2net_sendpage()== > > bvec_set_virt(&bv, virt, size); > > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); > > > > while (1) { > > msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; > > mutex_lock(&sc->sc_send_lock); > > ret = sock_sendmsg(sc->sc_sock, &msg); > > > > ==net/sunrpc/svcsock.c: svc_udp_sendto()== > > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, > > count, 0); > > err = sock_sendmsg(svsk->sk_sock, &msg); > > if (err == -ECONNREFUSED) { > > /* ICMP error on earlier request. */ > > iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, > > count, 0); > > err = sock_sendmsg(svsk->sk_sock, &msg); > > } > > > > Maybe these two types are more interchangeable than I'm thinking, but > > it seems like it might be simpler to just do the address copy inside > > sock_sendmsg(). Does this revised plan sound reasonable: > > > > 1) Swap out calls to sock->ops->connect() with kernel_connect() > > 2) Move the address copy to sock_sendmsg() > > That also covers normal user code that is already protected. > > How about a __kernel_sendmsg that follows the sock_sendmsg API and is > a thin wrapper exactly for in-kernel callers. > > > I also noticed that BPF hooks inside bind() can rewrite the bind > > address. Should we do something similar for kernel_bind: > > > > 1) Add an address copy to kernel_bind() > > 2) Swap out direct calls to ops->bind() with kernel_bind() > > Good catch. Sounds like it. > > At that point it might be worth splitting into three patches. > > > -Jordan > > > > On Thu, Sep 14, 2023 at 1:24 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > > > On Wed, 2023-09-13 at 10:02 -0400, Willem de Bruijn wrote: > > > > On Tue, Sep 12, 2023 at 5:09 PM Jordan Rife <jrife@google.com> wrote: > > > > > > > > > > > If we take this path, it could be a single patch. The subsystem > > > > > > maintainers should be CC:ed so that they can (N)ACK it. > > > > > > > > > > > > But I do not mean to ask to split it up and test each one separately. > > > > > > > > > > > > The change from sock->ops->connect to kernel_connect is certainly > > > > > > trivial enough that compile testing should suffice. > > > > > > > > > > Ack. Thanks for clarifying. > > > > > > > > > > > The only question is whether we should pursue your original patch and > > > > > > accept that this will continue, or one that improves the situation, > > > > > > but touches more files and thus has a higher risk of merge conflicts. > > > > > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > > > but it's only one. > > > > > > > > > > > > I'd like to give others some time to chime in. I've given my opinion, > > > > > > but it's only one. > > > > > > > > > > Sounds good. I'll wait to hear others' opinions on the best path forward. > > > > > > > > No other comments so far. > > > > > > > > My hunch is that a short list of these changes > > > > > > > > ``` > > > > @@ -1328,7 +1328,7 @@ static int kernel_bindconnect(struct socket *s, > > > > struct sockaddr *laddr, > > > > if (rv < 0) > > > > return rv; > > > > > > > > - rv = s->ops->connect(s, raddr, size, flags); > > > > + rv = kernel_connect(s, raddr, size, flags); > > > > ``` > > > > > > > > is no more invasive than your proposed patch, and gives a more robust outcome. > > > > > > > > Please take a stab. > > > > > > I'm sorry for the late feedback. For the records, I agree the cleanest > > > fix described above should be attempted first. > > > > > > Thanks, > > > > > > Paolo > > >
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3d2e30e204735..c37d484fbee34 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -568,6 +568,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; const struct proto *prot; + struct sockaddr_storage addr; int err; if (addr_len < sizeof(uaddr->sa_family)) @@ -580,6 +581,14 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + if (uaddr && addr_len <= sizeof(addr)) { + /* pre_connect can rewrite uaddr, so make a copy to + * insulate the caller. + */ + memcpy(&addr, uaddr, addr_len); + uaddr = (struct sockaddr *)&addr; + } + err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; @@ -625,6 +634,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; + struct sockaddr_storage addr; int err; long timeo; @@ -668,6 +678,14 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + if (uaddr && addr_len <= sizeof(addr)) { + /* pre_connect can rewrite uaddr, so make a copy to + * insulate the caller. + */ + memcpy(&addr, uaddr, addr_len); + uaddr = (struct sockaddr *)&addr; + } + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); if (err) goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f39b9c8445808..5f5ee2752eeb7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1142,18 +1142,29 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { + struct sockaddr_in tmp_addr; + struct sockaddr_in *addr = usin; + + /* BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK can rewrite usin, so make a + * copy to insulate the caller. + */ + if (usin && msg->msg_namelen <= sizeof(tmp_addr)) { + memcpy(&tmp_addr, usin, msg->msg_namelen); + addr = &tmp_addr; + } + err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, - (struct sockaddr *)usin, &ipc.addr); + (struct sockaddr *)addr, &ipc.addr); if (err) goto out_free; - if (usin) { - if (usin->sin_port == 0) { + if (addr) { + if (addr->sin_port == 0) { /* BPF program set invalid port. Reject it. */ err = -EINVAL; goto out_free; } - daddr = usin->sin_addr.s_addr; - dport = usin->sin_port; + daddr = addr->sin_addr.s_addr; + dport = addr->sin_port; } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 86b5d509a4688..cbc1917fad629 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1506,26 +1506,37 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6->fl6_sport = inet->inet_sport; if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { + struct sockaddr_in6 tmp_addr; + struct sockaddr_in6 *addr = sin6; + + /* BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK can rewrite sin6, so make a + * copy to insulate the caller. + */ + if (sin6 && addr_len <= sizeof(tmp_addr)) { + memcpy(&tmp_addr, sin6, addr_len); + addr = &tmp_addr; + } + err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, - (struct sockaddr *)sin6, + (struct sockaddr *)addr, &fl6->saddr); if (err) goto out_no_dst; - if (sin6) { - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { + if (addr) { + if (ipv6_addr_v4mapped(&addr->sin6_addr)) { /* BPF program rewrote IPv6-only by IPv4-mapped * IPv6. It's currently unsupported. */ err = -ENOTSUPP; goto out_no_dst; } - if (sin6->sin6_port == 0) { + if (addr->sin6_port == 0) { /* BPF program set invalid port. Reject it. */ err = -EINVAL; goto out_no_dst; } - fl6->fl6_dport = sin6->sin6_port; - fl6->daddr = sin6->sin6_addr; + fl6->fl6_dport = addr->sin6_port; + fl6->daddr = addr->sin6_addr; } } diff --git a/net/socket.c b/net/socket.c index c8b08b32f097e..39794d026fa11 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3570,12 +3570,7 @@ EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { - struct sockaddr_storage address; - - memcpy(&address, addr, addrlen); - - return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, - addrlen, flags); + return READ_ONCE(sock->ops)->connect(sock, addr, addrlen, flags); } EXPORT_SYMBOL(kernel_connect);
commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") ensured that kernel_connect() will not overwrite the address parameter in cases where BPF connect hooks perform an address rewrite. However, there remain other cases where BPF hooks can overwrite an address held by a kernel client. ==Scenarios Tested== * Code in the SMB and Ceph modules calls sock->ops->connect() directly, allowing the address overwrite to occur. In the case of SMB, this can lead to broken mounts. * NFS v3 mounts with proto=udp call sock_sendmsg() for each RPC call, passing a pointer to the mount address in msg->msg_name which is later overwritten by a BPF sendmsg hook. This can lead to broken NFS mounts. In order to more comprehensively fix this class of problems, this patch pushes the address copy deeper into the stack and introduces an address copy to both udp_sendmsg() and udpv6_sendmsg() to insulate all callers from address rewrites. Signed-off-by: Jordan Rife <jrife@google.com> --- net/ipv4/af_inet.c | 18 ++++++++++++++++++ net/ipv4/udp.c | 21 ++++++++++++++++----- net/ipv6/udp.c | 23 +++++++++++++++++------ net/socket.c | 7 +------ 4 files changed, 52 insertions(+), 17 deletions(-)