diff mbox series

[bpf-next,01/11] bpf: make bpf_d_path() helper use probe-read semantics

Message ID 5643840bd57d0c2345635552ae228dfb2ed3428c.1708377880.git.mattbobrowski@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: probe-read bpf_d_path() and add new acquire/release BPF kfuncs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-29 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1065 this patch: 1065
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 14 maintainers not CCed: mhiramat@kernel.org linux-trace-kernel@vger.kernel.org john.fastabend@gmail.com rostedt@goodmis.org yonghong.song@linux.dev martin.lau@linux.dev jack@suse.cz song@kernel.org sdf@google.com mathieu.desnoyers@efficios.com eddyz87@gmail.com kpsingh@kernel.org viro@zeniv.linux.org.uk haoluo@google.com
netdev/build_clang success Errors and warnings before: 1066 this patch: 1066
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1082 this patch: 1082
netdev/checkpatch warning CHECK: Macro argument 'src' may be better as '(src)' to avoid precedence issues CHECK: Unnecessary parentheses around '&__r' CHECK: Unnecessary parentheses around '&src' CHECK: extern prototypes should be avoided in .h files WARNING: Improper SPDX comment style for 'include/linux/probe_read_d_path.h', please use '/*' instead WARNING: Missing a blank line after declarations WARNING: Missing or malformed SPDX-License-Identifier tag in line 1 WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: function definition argument 'char *' should also have an identifier name WARNING: function definition argument 'int' should also have an identifier name WARNING: function definition argument 'struct dentry *' should also have an identifier name
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 1

Commit Message

Matt Bobrowski Feb. 20, 2024, 9:27 a.m. UTC
There has now been several reported instances [0, 1, 2] where the
usage of the BPF helper bpf_d_path() has led to some form of memory
corruption issue.

The fundamental reason behind why we repeatedly see bpf_d_path() being
susceptible to such memory corruption issues is because it only
enforces ARG_PTR_TO_BTF_ID constraints onto it's struct path
argument. This essentially means that it only requires an in-kernel
pointer of type struct path to be provided to it. Depending on the
underlying context and where the supplied struct path was obtained
from and when, depends on whether the struct path is fully intact or
not when calling bpf_d_path(). It's certainly possible to call
bpf_d_path() and subsequently d_path() from contexts where the
supplied struct path to bpf_d_path() has already started being torn
down by __fput() and such. An example of this is perfectly illustrated
in [0].

Moving forward, we simply cannot enforce KF_TRUSTED_ARGS semantics
onto struct path of bpf_d_path(), as this approach would presumably
lead to some pretty wide scale and highly undesirable BPF program
breakage. To avoid breaking any pre-existing BPF program that is
dependent on bpf_d_path(), I propose that we take a different path and
re-implement an incredibly minimalistic and bare bone version of
d_path() which is entirely backed by kernel probe-read semantics. IOW,
a version of d_path() that is backed by
copy_from_kernel_nofault(). This ensures that any reads performed
against the supplied struct path to bpf_d_path() which may end up
faulting for whatever reason end up being gracefully handled and fixed
up.

The caveats with such an approach is that we can't fully uphold all of
d_path()'s path resolution capabilities. Resolving a path which is
comprised of a dentry that make use of dynamic names via isn't
possible as we can't enforce probe-read semantics onto indirect
function calls performed via d_op as they're implementation
dependent. For such cases, we just return -EOPNOTSUPP. This might be a
little surprising to some users, especially those that are interested
in resolving paths that involve a dentry that resides on some
non-mountable pseudo-filesystem, being pipefs/sockfs/nsfs, but it's
arguably better than enforcing KF_TRUSTED_ARGS onto bpf_d_path() and
causing an unnecessary shemozzle for users. Additionally, we don't
make use of all the locking semantics, or handle all the erroneous
cases in which d_path() naturally would. This is fine however, as
we're only looking to provide users with a rather acceptable version
of a reconstructed path, whilst they eventually migrate over to the
trusted bpf_path_d_path() BPF kfunc variant.

Note that the selftests that go with this change to bpf_d_path() have
been purposely split out into a completely separate patch. This is so
that the reviewers attention is not torn by noise and can remain
focused on reviewing the implementation details contained within this
patch.

[0] https://lore.kernel.org/bpf/CAG48ez0ppjcT=QxU-jtCUfb5xQb3mLr=5FcwddF_VKfEBPs_Dg@mail.gmail.com/
[1] https://lore.kernel.org/bpf/20230606181714.532998-1-jolsa@kernel.org/
[2] https://lore.kernel.org/bpf/20220219113744.1852259-1-memxor@gmail.com/

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
---
 fs/Makefile                       |   6 +-
 fs/probe_read_d_path.c            | 150 ++++++++++++++++++++++++++++++
 include/linux/probe_read_d_path.h |  13 +++
 kernel/trace/bpf_trace.c          |  13 ++-
 4 files changed, 172 insertions(+), 10 deletions(-)
 create mode 100644 fs/probe_read_d_path.c
 create mode 100644 include/linux/probe_read_d_path.h

Comments

Christian Brauner Feb. 20, 2024, 9:48 a.m. UTC | #1
On Tue, Feb 20, 2024 at 09:27:23AM +0000, Matt Bobrowski wrote:
> There has now been several reported instances [0, 1, 2] where the
> usage of the BPF helper bpf_d_path() has led to some form of memory
> corruption issue.
> 
> The fundamental reason behind why we repeatedly see bpf_d_path() being
> susceptible to such memory corruption issues is because it only
> enforces ARG_PTR_TO_BTF_ID constraints onto it's struct path
> argument. This essentially means that it only requires an in-kernel
> pointer of type struct path to be provided to it. Depending on the
> underlying context and where the supplied struct path was obtained
> from and when, depends on whether the struct path is fully intact or
> not when calling bpf_d_path(). It's certainly possible to call
> bpf_d_path() and subsequently d_path() from contexts where the
> supplied struct path to bpf_d_path() has already started being torn
> down by __fput() and such. An example of this is perfectly illustrated
> in [0].
> 
> Moving forward, we simply cannot enforce KF_TRUSTED_ARGS semantics
> onto struct path of bpf_d_path(), as this approach would presumably
> lead to some pretty wide scale and highly undesirable BPF program
> breakage. To avoid breaking any pre-existing BPF program that is
> dependent on bpf_d_path(), I propose that we take a different path and
> re-implement an incredibly minimalistic and bare bone version of
> d_path() which is entirely backed by kernel probe-read semantics. IOW,
> a version of d_path() that is backed by
> copy_from_kernel_nofault(). This ensures that any reads performed
> against the supplied struct path to bpf_d_path() which may end up
> faulting for whatever reason end up being gracefully handled and fixed
> up.
> 
> The caveats with such an approach is that we can't fully uphold all of
> d_path()'s path resolution capabilities. Resolving a path which is
> comprised of a dentry that make use of dynamic names via isn't
> possible as we can't enforce probe-read semantics onto indirect
> function calls performed via d_op as they're implementation
> dependent. For such cases, we just return -EOPNOTSUPP. This might be a
> little surprising to some users, especially those that are interested
> in resolving paths that involve a dentry that resides on some
> non-mountable pseudo-filesystem, being pipefs/sockfs/nsfs, but it's
> arguably better than enforcing KF_TRUSTED_ARGS onto bpf_d_path() and
> causing an unnecessary shemozzle for users. Additionally, we don't

NAK. We're not going to add a semi-functional reimplementation of
d_path() for bpf. This relied on VFS internals and guarantees that were
never given. Restrict it to KF_TRUSTED_ARGS as it was suggested when
this originally came up or fix it another way. But we're not adding a
bunch of kfuncs to even more sensitive VFS machinery and then build a
d_path() clone just so we can retroactively justify broken behavior.

> make use of all the locking semantics, or handle all the erroneous
> cases in which d_path() naturally would. This is fine however, as
> we're only looking to provide users with a rather acceptable version
> of a reconstructed path, whilst they eventually migrate over to the
> trusted bpf_path_d_path() BPF kfunc variant.
> 
> Note that the selftests that go with this change to bpf_d_path() have
> been purposely split out into a completely separate patch. This is so
> that the reviewers attention is not torn by noise and can remain
> focused on reviewing the implementation details contained within this
> patch.
> 
> [0] https://lore.kernel.org/bpf/CAG48ez0ppjcT=QxU-jtCUfb5xQb3mLr=5FcwddF_VKfEBPs_Dg@mail.gmail.com/
> [1] https://lore.kernel.org/bpf/20230606181714.532998-1-jolsa@kernel.org/
> [2] https://lore.kernel.org/bpf/20220219113744.1852259-1-memxor@gmail.com/
> 
> Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
> ---
>  fs/Makefile                       |   6 +-
>  fs/probe_read_d_path.c            | 150 ++++++++++++++++++++++++++++++
>  include/linux/probe_read_d_path.h |  13 +++
>  kernel/trace/bpf_trace.c          |  13 ++-
>  4 files changed, 172 insertions(+), 10 deletions(-)
>  create mode 100644 fs/probe_read_d_path.c
>  create mode 100644 include/linux/probe_read_d_path.h
> 
> diff --git a/fs/Makefile b/fs/Makefile
> index c09016257f05..945c9c84d35d 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -4,7 +4,7 @@
>  #
>  # 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
>  # Rewritten to use lists instead of if-statements.
> -# 
> +#
>  
>  
>  obj-y :=	open.o read_write.o file_table.o super.o \
> @@ -12,7 +12,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
>  		ioctl.o readdir.o select.o dcache.o inode.o \
>  		attr.o bad_inode.o file.o filesystems.o namespace.o \
>  		seq_file.o xattr.o libfs.o fs-writeback.o \
> -		pnode.o splice.o sync.o utimes.o d_path.o \
> +		pnode.o splice.o sync.o utimes.o d_path.o probe_read_d_path.o \
>  		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
>  		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
>  		kernel_read_file.o mnt_idmapping.o remap_range.o
> @@ -58,7 +58,7 @@ obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
>  obj-y				+= devpts/
>  
>  obj-$(CONFIG_DLM)		+= dlm/
> - 
> +
>  # Do not add any filesystems before this line
>  obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
>  obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
> diff --git a/fs/probe_read_d_path.c b/fs/probe_read_d_path.c
> new file mode 100644
> index 000000000000..8d0db902f836
> --- /dev/null
> +++ b/fs/probe_read_d_path.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2024 Google LLC.
> + */
> +
> +#include "asm/ptrace.h"
> +#include <linux/container_of.h>
> +#include <linux/dcache.h>
> +#include <linux/fs_struct.h>
> +#include <linux/uaccess.h>
> +#include <linux/path.h>
> +#include <linux/probe_read_d_path.h>
> +
> +#include "mount.h"
> +
> +#define PROBE_READ(src)                                              \
> +	({                                                           \
> +		typeof(src) __r;                                     \
> +		if (copy_from_kernel_nofault((void *)(&__r), (&src), \
> +					     sizeof((__r))))         \
> +			memset((void *)(&__r), 0, sizeof((__r)));    \
> +		__r;                                                 \
> +	})
> +
> +static inline bool probe_read_d_unlinked(const struct dentry *dentry)
> +{
> +	return !PROBE_READ(dentry->d_hash.pprev) &&
> +	       !(dentry == PROBE_READ(dentry->d_parent));
> +}
> +
> +static long probe_read_prepend(const char *s, int len, char *buf, int *buflen)
> +{
> +	/*
> +	 * The supplied len that is to be copied into the buffer will result in
> +	 * an overflow. The true implementation of d_path() already returns an
> +	 * error for such overflow cases, so the semantics with regards to the
> +	 * bpf_d_path() helper returning the same error value for overflow cases
> +	 * remain the same.
> +	 */
> +	if (len > *buflen)
> +		return -ENAMETOOLONG;
> +
> +	/*
> +	 * The supplied string fits completely into the remaining buffer
> +	 * space. Attempt to make the copy.
> +	 */
> +	*buflen -= len;
> +	buf += *buflen;
> +	return copy_from_kernel_nofault(buf, s, len);
> +}
> +
> +static bool use_dname(const struct path *path)
> +{
> +	const struct dentry_operations *d_op;
> +	char *(*d_dname)(struct dentry *, char *, int);
> +
> +	d_op = PROBE_READ(path->dentry->d_op);
> +	d_dname = PROBE_READ(d_op->d_dname);
> +
> +	return d_op && d_dname &&
> +	       (!(path->dentry == PROBE_READ(path->dentry->d_parent)) ||
> +		path->dentry != PROBE_READ(path->mnt->mnt_root));
> +}
> +
> +char *probe_read_d_path(const struct path *path, char *buf, int buflen)
> +{
> +	int len;
> +	long err;
> +	struct path root;
> +	struct mount *mnt;
> +	struct dentry *dentry;
> +
> +	dentry = path->dentry;
> +	mnt = container_of(path->mnt, struct mount, mnt);
> +
> +	/*
> +	 * We cannot back dentry->d_op->d_dname() with probe-read semantics, so
> +	 * just return an error to the caller when the supplied path contains a
> +	 * dentry component that makes use of a dynamic name.
> +	 */
> +	if (use_dname(path))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	err = probe_read_prepend("\0", 1, buf, &buflen);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	if (probe_read_d_unlinked(dentry)) {
> +		err = probe_read_prepend(" (deleted)", 10, buf, &buflen);
> +		if (err)
> +			return ERR_PTR(err);
> +	}
> +
> +	len = buflen;
> +	root = PROBE_READ(current->fs->root);
> +	while (dentry != root.dentry || &mnt->mnt != root.mnt) {
> +		struct dentry *parent;
> +		if (dentry == PROBE_READ(mnt->mnt.mnt_root)) {
> +			struct mount *m;
> +
> +			m = PROBE_READ(mnt->mnt_parent);
> +			if (mnt != m) {
> +				dentry = PROBE_READ(mnt->mnt_mountpoint);
> +				mnt = m;
> +				continue;
> +			}
> +
> +			/*
> +			 * If we've reached the global root, then there's
> +			 * nothing we can really do but bail.
> +			 */
> +			break;
> +		}
> +
> +		parent = PROBE_READ(dentry->d_parent);
> +		if (dentry == parent) {
> +			/*
> +			 * Escaped? We return an ECANCELED error here to signify
> +			 * that we've prematurely terminated pathname
> +			 * reconstruction. We've potentially hit a root dentry
> +			 * that isn't associated with any roots from the mounted
> +			 * filesystems that we've jumped through, so it's not
> +			 * clear where we are in the VFS exactly.
> +			 */
> +			err = -ECANCELED;
> +			break;
> +		}
> +
> +		err = probe_read_prepend(dentry->d_name.name,
> +					 PROBE_READ(dentry->d_name.len), buf,
> +					 &buflen);
> +		if (err)
> +			break;
> +
> +		err = probe_read_prepend("/", 1, buf, &buflen);
> +		if (err)
> +			break;
> +		dentry = parent;
> +	}
> +
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	if (len == buflen) {
> +		err = probe_read_prepend("/", 1, buf, &buflen);
> +		if (err)
> +			return ERR_PTR(err);
> +	}
> +	return buf + buflen;
> +}
> diff --git a/include/linux/probe_read_d_path.h b/include/linux/probe_read_d_path.h
> new file mode 100644
> index 000000000000..9b3908746657
> --- /dev/null
> +++ b/include/linux/probe_read_d_path.h
> @@ -0,0 +1,13 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2024 Google LLC.
> + */
> +
> +#ifndef _LINUX_PROBE_READ_D_PATH_H
> +#define _LINUX_PROBE_READ_D_PATH_H
> +
> +#include <linux/path.h>
> +
> +extern char *probe_read_d_path(const struct path *path, char *buf, int buflen);
> +
> +#endif /* _LINUX_PROBE_READ_D_PATH_H */
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 241ddf5e3895..12dbd9cef1fa 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -25,6 +25,7 @@
>  #include <linux/verification.h>
>  #include <linux/namei.h>
>  #include <linux/fileattr.h>
> +#include <linux/probe_read_d_path.h>
>  
>  #include <net/bpf_sk_storage.h>
>  
> @@ -923,14 +924,12 @@ BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
>  	if (len < 0)
>  		return len;
>  
> -	p = d_path(&copy, buf, sz);
> -	if (IS_ERR(p)) {
> -		len = PTR_ERR(p);
> -	} else {
> -		len = buf + sz - p;
> -		memmove(buf, p, len);
> -	}
> +	p = probe_read_d_path(&copy, buf, sz);
> +	if (IS_ERR(p))
> +		return PTR_ERR(p);
>  
> +	len = buf + sz - p;
> +	memmove(buf, p, len);
>  	return len;
>  }
>  
> -- 
> 2.44.0.rc0.258.g7320e95886-goog
> 
> /M
Matt Bobrowski Feb. 20, 2024, 1:22 p.m. UTC | #2
On Tue, Feb 20, 2024 at 10:48:10AM +0100, Christian Brauner wrote:
> On Tue, Feb 20, 2024 at 09:27:23AM +0000, Matt Bobrowski wrote:
> > There has now been several reported instances [0, 1, 2] where the
> > usage of the BPF helper bpf_d_path() has led to some form of memory
> > corruption issue.
> > 
> > The fundamental reason behind why we repeatedly see bpf_d_path() being
> > susceptible to such memory corruption issues is because it only
> > enforces ARG_PTR_TO_BTF_ID constraints onto it's struct path
> > argument. This essentially means that it only requires an in-kernel
> > pointer of type struct path to be provided to it. Depending on the
> > underlying context and where the supplied struct path was obtained
> > from and when, depends on whether the struct path is fully intact or
> > not when calling bpf_d_path(). It's certainly possible to call
> > bpf_d_path() and subsequently d_path() from contexts where the
> > supplied struct path to bpf_d_path() has already started being torn
> > down by __fput() and such. An example of this is perfectly illustrated
> > in [0].
> > 
> > Moving forward, we simply cannot enforce KF_TRUSTED_ARGS semantics
> > onto struct path of bpf_d_path(), as this approach would presumably
> > lead to some pretty wide scale and highly undesirable BPF program
> > breakage. To avoid breaking any pre-existing BPF program that is
> > dependent on bpf_d_path(), I propose that we take a different path and
> > re-implement an incredibly minimalistic and bare bone version of
> > d_path() which is entirely backed by kernel probe-read semantics. IOW,
> > a version of d_path() that is backed by
> > copy_from_kernel_nofault(). This ensures that any reads performed
> > against the supplied struct path to bpf_d_path() which may end up
> > faulting for whatever reason end up being gracefully handled and fixed
> > up.
> > 
> > The caveats with such an approach is that we can't fully uphold all of
> > d_path()'s path resolution capabilities. Resolving a path which is
> > comprised of a dentry that make use of dynamic names via isn't
> > possible as we can't enforce probe-read semantics onto indirect
> > function calls performed via d_op as they're implementation
> > dependent. For such cases, we just return -EOPNOTSUPP. This might be a
> > little surprising to some users, especially those that are interested
> > in resolving paths that involve a dentry that resides on some
> > non-mountable pseudo-filesystem, being pipefs/sockfs/nsfs, but it's
> > arguably better than enforcing KF_TRUSTED_ARGS onto bpf_d_path() and
> > causing an unnecessary shemozzle for users. Additionally, we don't
> 
> NAK. We're not going to add a semi-functional reimplementation of
> d_path() for bpf. This relied on VFS internals and guarantees that were
> never given. Restrict it to KF_TRUSTED_ARGS as it was suggested when
> this originally came up or fix it another way. But we're not adding a
> bunch of kfuncs to even more sensitive VFS machinery and then build a
> d_path() clone just so we can retroactively justify broken behavior.

OK, I agree, having a semi-functional re-implementation of d_path() is
indeed suboptimal. However, also understand that slapping the
KF_TRUSTED_ARGS constraint onto the pre-existing BPF helper
bpf_d_path() would outright break a lot of BPF programs out there, so
I can't see how taht would be an acceptable approach moving forward
here either.

Let's say that we decided to leave the pre-existing bpf_d_path()
implementation as is, accepting that it is fundamentally succeptible
to memory corruption issues, are you saying that you're also not for
adding the KF_TRUSTED_ARGS d_path() variant as I've done so here
[0]. Or, is it the other supporting reference counting based BPF
kfuncs [1, 2] that have irked you and aren't supportive of either?

[0] https://lore.kernel.org/bpf/20240220-erstochen-notwehr-755dbd0a02b3@brauner/T/#m542b86991b257cf9612406f1cc4d5692bcb75da8
[1] https://lore.kernel.org/bpf/20240220-erstochen-notwehr-755dbd0a02b3@brauner/T/#mc2aaadbe17490aeb1dde09071629b0b2a87d7436
[2] https://lore.kernel.org/bpf/20240220-erstochen-notwehr-755dbd0a02b3@brauner/T/#m07fa7a0c03af530d2ab3c4ef25c377b1d6ef17f8

/M
Christian Brauner Feb. 21, 2024, 7:55 a.m. UTC | #3
On Tue, Feb 20, 2024 at 01:22:14PM +0000, Matt Bobrowski wrote:
> On Tue, Feb 20, 2024 at 10:48:10AM +0100, Christian Brauner wrote:
> > On Tue, Feb 20, 2024 at 09:27:23AM +0000, Matt Bobrowski wrote:
> > > There has now been several reported instances [0, 1, 2] where the
> > > usage of the BPF helper bpf_d_path() has led to some form of memory
> > > corruption issue.
> > > 
> > > The fundamental reason behind why we repeatedly see bpf_d_path() being
> > > susceptible to such memory corruption issues is because it only
> > > enforces ARG_PTR_TO_BTF_ID constraints onto it's struct path
> > > argument. This essentially means that it only requires an in-kernel
> > > pointer of type struct path to be provided to it. Depending on the
> > > underlying context and where the supplied struct path was obtained
> > > from and when, depends on whether the struct path is fully intact or
> > > not when calling bpf_d_path(). It's certainly possible to call
> > > bpf_d_path() and subsequently d_path() from contexts where the
> > > supplied struct path to bpf_d_path() has already started being torn
> > > down by __fput() and such. An example of this is perfectly illustrated
> > > in [0].
> > > 
> > > Moving forward, we simply cannot enforce KF_TRUSTED_ARGS semantics
> > > onto struct path of bpf_d_path(), as this approach would presumably
> > > lead to some pretty wide scale and highly undesirable BPF program
> > > breakage. To avoid breaking any pre-existing BPF program that is
> > > dependent on bpf_d_path(), I propose that we take a different path and
> > > re-implement an incredibly minimalistic and bare bone version of
> > > d_path() which is entirely backed by kernel probe-read semantics. IOW,
> > > a version of d_path() that is backed by
> > > copy_from_kernel_nofault(). This ensures that any reads performed
> > > against the supplied struct path to bpf_d_path() which may end up
> > > faulting for whatever reason end up being gracefully handled and fixed
> > > up.
> > > 
> > > The caveats with such an approach is that we can't fully uphold all of
> > > d_path()'s path resolution capabilities. Resolving a path which is
> > > comprised of a dentry that make use of dynamic names via isn't
> > > possible as we can't enforce probe-read semantics onto indirect
> > > function calls performed via d_op as they're implementation
> > > dependent. For such cases, we just return -EOPNOTSUPP. This might be a
> > > little surprising to some users, especially those that are interested
> > > in resolving paths that involve a dentry that resides on some
> > > non-mountable pseudo-filesystem, being pipefs/sockfs/nsfs, but it's
> > > arguably better than enforcing KF_TRUSTED_ARGS onto bpf_d_path() and
> > > causing an unnecessary shemozzle for users. Additionally, we don't
> > 
> > NAK. We're not going to add a semi-functional reimplementation of
> > d_path() for bpf. This relied on VFS internals and guarantees that were
> > never given. Restrict it to KF_TRUSTED_ARGS as it was suggested when
> > this originally came up or fix it another way. But we're not adding a
> > bunch of kfuncs to even more sensitive VFS machinery and then build a
> > d_path() clone just so we can retroactively justify broken behavior.
> 
> OK, I agree, having a semi-functional re-implementation of d_path() is
> indeed suboptimal. However, also understand that slapping the

The ugliness of the duplicated code made me start my mail with NAK. It
would've been enough to just say no.

> KF_TRUSTED_ARGS constraint onto the pre-existing BPF helper
> bpf_d_path() would outright break a lot of BPF programs out there, so
> I can't see how taht would be an acceptable approach moving forward
> here either.
> 
> Let's say that we decided to leave the pre-existing bpf_d_path()
> implementation as is, accepting that it is fundamentally succeptible
> to memory corruption issues, are you saying that you're also not for
> adding the KF_TRUSTED_ARGS d_path() variant as I've done so here

No, that's fine and was the initial proposal anyway. You're already
using the existing d_path() anway in that bpf_d_path() thing. So
exposing another variant with KF_TRUSTED_ARGS restriction is fine. But
not hacking up a custom d_path() variant.

> [0]. Or, is it the other supporting reference counting based BPF
> kfuncs [1, 2] that have irked you and aren't supportive of either?

Yes, because you're exposing fs_root, fs_pwd, path_put() and fdput(),
get_task_exe_file(), get_mm_exe_file(). None of that I see being turned
into kfuncs.
Matt Bobrowski Feb. 21, 2024, 1:38 p.m. UTC | #4
Hey Christian,

On Wed, Feb 21, 2024 at 08:55:25AM +0100, Christian Brauner wrote:
> On Tue, Feb 20, 2024 at 01:22:14PM +0000, Matt Bobrowski wrote:
> > On Tue, Feb 20, 2024 at 10:48:10AM +0100, Christian Brauner wrote:
> > > On Tue, Feb 20, 2024 at 09:27:23AM +0000, Matt Bobrowski wrote:
> > > > There has now been several reported instances [0, 1, 2] where the
> > > > usage of the BPF helper bpf_d_path() has led to some form of memory
> > > > corruption issue.
> > > > 
> > > > The fundamental reason behind why we repeatedly see bpf_d_path() being
> > > > susceptible to such memory corruption issues is because it only
> > > > enforces ARG_PTR_TO_BTF_ID constraints onto it's struct path
> > > > argument. This essentially means that it only requires an in-kernel
> > > > pointer of type struct path to be provided to it. Depending on the
> > > > underlying context and where the supplied struct path was obtained
> > > > from and when, depends on whether the struct path is fully intact or
> > > > not when calling bpf_d_path(). It's certainly possible to call
> > > > bpf_d_path() and subsequently d_path() from contexts where the
> > > > supplied struct path to bpf_d_path() has already started being torn
> > > > down by __fput() and such. An example of this is perfectly illustrated
> > > > in [0].
> > > > 
> > > > Moving forward, we simply cannot enforce KF_TRUSTED_ARGS semantics
> > > > onto struct path of bpf_d_path(), as this approach would presumably
> > > > lead to some pretty wide scale and highly undesirable BPF program
> > > > breakage. To avoid breaking any pre-existing BPF program that is
> > > > dependent on bpf_d_path(), I propose that we take a different path and
> > > > re-implement an incredibly minimalistic and bare bone version of
> > > > d_path() which is entirely backed by kernel probe-read semantics. IOW,
> > > > a version of d_path() that is backed by
> > > > copy_from_kernel_nofault(). This ensures that any reads performed
> > > > against the supplied struct path to bpf_d_path() which may end up
> > > > faulting for whatever reason end up being gracefully handled and fixed
> > > > up.
> > > > 
> > > > The caveats with such an approach is that we can't fully uphold all of
> > > > d_path()'s path resolution capabilities. Resolving a path which is
> > > > comprised of a dentry that make use of dynamic names via isn't
> > > > possible as we can't enforce probe-read semantics onto indirect
> > > > function calls performed via d_op as they're implementation
> > > > dependent. For such cases, we just return -EOPNOTSUPP. This might be a
> > > > little surprising to some users, especially those that are interested
> > > > in resolving paths that involve a dentry that resides on some
> > > > non-mountable pseudo-filesystem, being pipefs/sockfs/nsfs, but it's
> > > > arguably better than enforcing KF_TRUSTED_ARGS onto bpf_d_path() and
> > > > causing an unnecessary shemozzle for users. Additionally, we don't
> > > 
> > > NAK. We're not going to add a semi-functional reimplementation of
> > > d_path() for bpf. This relied on VFS internals and guarantees that were
> > > never given. Restrict it to KF_TRUSTED_ARGS as it was suggested when
> > > this originally came up or fix it another way. But we're not adding a
> > > bunch of kfuncs to even more sensitive VFS machinery and then build a
> > > d_path() clone just so we can retroactively justify broken behavior.
> > 
> > OK, I agree, having a semi-functional re-implementation of d_path() is
> > indeed suboptimal. However, also understand that slapping the
> 
> The ugliness of the duplicated code made me start my mail with NAK. It
> would've been enough to just say no.
> 
> > KF_TRUSTED_ARGS constraint onto the pre-existing BPF helper
> > bpf_d_path() would outright break a lot of BPF programs out there, so
> > I can't see how taht would be an acceptable approach moving forward
> > here either.
> > 
> > Let's say that we decided to leave the pre-existing bpf_d_path()
> > implementation as is, accepting that it is fundamentally succeptible
> > to memory corruption issues, are you saying that you're also not for
> > adding the KF_TRUSTED_ARGS d_path() variant as I've done so here
> 
> No, that's fine and was the initial proposal anyway. You're already
> using the existing d_path() anway in that bpf_d_path() thing. So
> exposing another variant with KF_TRUSTED_ARGS restriction is fine. But
> not hacking up a custom d_path() variant.

OK, thank you for clarifying. Perhaps we should just make a remark in
the form of a comment against bpf_d_path() stating that this BPF
helper is considered unsafe and users should look to migrate to the
newly added KF_TRUSTED_ARGS variant if at all possible.

> > [0]. Or, is it the other supporting reference counting based BPF
> > kfuncs [1, 2] that have irked you and aren't supportive of either?
> 
> Yes, because you're exposing fs_root, fs_pwd, path_put() and fdput(),
> get_task_exe_file(), get_mm_exe_file(). None of that I see being turned
> into kfuncs.

Hm, OK, but do know that BPF kfuncs do not make any promises around
being a stable interface, they never have and never will. Therefore,
it's not like introducing this kind of dependency on such APIs from
BPF kfuncs would hinder you from fundamentally modifying them moving
forward?

Additionally, given that these new acquire/release based BPF kfuncs
which rely on APIs like get_fs_root() and path_put() are in fact
restricted to BPF LSM programs, usage of such BPF kfuncs from the
context of a BPF LSM program would rather be analogous to a
pre-existing LSM module calling get_fs_root() and path_put()
explicitly within one of its implemented hooks, no? IOW, once a BPF
LSM program is loaded and JITed, what's the fundamental difference
between a baked in LSM module hook implementation which calls
get_fs_root() and a BPF LSM program which calls
bpf_get_task_fs_root()?  They're both being used in a perfectly
reasonable and sane like-for-like context, so what's the issue with
exposing such APIs as BPF kfuncs if they're being used appropriately?
It really doesn't make sense to provide independent reference counting
implementations just for BPF if there's some pre-existing
infrastructure in the kernel that does it the right way.

Also note that without such new reference counting BPF kfuncs which
I've proposed within this patch series the KF_TRUSTED_ARGS variant of
bpf_d_path() that we've agreed on becomes somewhat difficult to use in
practice. It'd essentially only be usable from LSM hooks that pass in
a struct path via the context parameter. Whilst in reality, it's
considered rather typical to also pass a struct path like
&current->mm->exe_file->f_path and &current->fs->pwd to bpf_d_path()
and friends from within the the implementation of an LSM hook. Such
struct path objects nested some levels deep isn't considered as being
trusted and therefore cannot be passed to a BPF kfunc that enforces
the KF_TRUSTED_ARGS constraint. The only way to acquire trust on a
pointer after performing such a struct walk is by grabbing a reference
on it, and hence why this KF_TRUSTED_ARGS change to d_path() and these
new BPF kfuncs go hand in hand.

Apologies about all the questions and comments here, but I'm really
just trying to understand why there's so much push back with regards
adding these reference counting BPF kfuncs?

/M
diff mbox series

Patch

diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..945c9c84d35d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -4,7 +4,7 @@ 
 #
 # 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
 # Rewritten to use lists instead of if-statements.
-# 
+#
 
 
 obj-y :=	open.o read_write.o file_table.o super.o \
@@ -12,7 +12,7 @@  obj-y :=	open.o read_write.o file_table.o super.o \
 		ioctl.o readdir.o select.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
-		pnode.o splice.o sync.o utimes.o d_path.o \
+		pnode.o splice.o sync.o utimes.o d_path.o probe_read_d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o
@@ -58,7 +58,7 @@  obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_DLM)		+= dlm/
- 
+
 # Do not add any filesystems before this line
 obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
diff --git a/fs/probe_read_d_path.c b/fs/probe_read_d_path.c
new file mode 100644
index 000000000000..8d0db902f836
--- /dev/null
+++ b/fs/probe_read_d_path.c
@@ -0,0 +1,150 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Google LLC.
+ */
+
+#include "asm/ptrace.h"
+#include <linux/container_of.h>
+#include <linux/dcache.h>
+#include <linux/fs_struct.h>
+#include <linux/uaccess.h>
+#include <linux/path.h>
+#include <linux/probe_read_d_path.h>
+
+#include "mount.h"
+
+#define PROBE_READ(src)                                              \
+	({                                                           \
+		typeof(src) __r;                                     \
+		if (copy_from_kernel_nofault((void *)(&__r), (&src), \
+					     sizeof((__r))))         \
+			memset((void *)(&__r), 0, sizeof((__r)));    \
+		__r;                                                 \
+	})
+
+static inline bool probe_read_d_unlinked(const struct dentry *dentry)
+{
+	return !PROBE_READ(dentry->d_hash.pprev) &&
+	       !(dentry == PROBE_READ(dentry->d_parent));
+}
+
+static long probe_read_prepend(const char *s, int len, char *buf, int *buflen)
+{
+	/*
+	 * The supplied len that is to be copied into the buffer will result in
+	 * an overflow. The true implementation of d_path() already returns an
+	 * error for such overflow cases, so the semantics with regards to the
+	 * bpf_d_path() helper returning the same error value for overflow cases
+	 * remain the same.
+	 */
+	if (len > *buflen)
+		return -ENAMETOOLONG;
+
+	/*
+	 * The supplied string fits completely into the remaining buffer
+	 * space. Attempt to make the copy.
+	 */
+	*buflen -= len;
+	buf += *buflen;
+	return copy_from_kernel_nofault(buf, s, len);
+}
+
+static bool use_dname(const struct path *path)
+{
+	const struct dentry_operations *d_op;
+	char *(*d_dname)(struct dentry *, char *, int);
+
+	d_op = PROBE_READ(path->dentry->d_op);
+	d_dname = PROBE_READ(d_op->d_dname);
+
+	return d_op && d_dname &&
+	       (!(path->dentry == PROBE_READ(path->dentry->d_parent)) ||
+		path->dentry != PROBE_READ(path->mnt->mnt_root));
+}
+
+char *probe_read_d_path(const struct path *path, char *buf, int buflen)
+{
+	int len;
+	long err;
+	struct path root;
+	struct mount *mnt;
+	struct dentry *dentry;
+
+	dentry = path->dentry;
+	mnt = container_of(path->mnt, struct mount, mnt);
+
+	/*
+	 * We cannot back dentry->d_op->d_dname() with probe-read semantics, so
+	 * just return an error to the caller when the supplied path contains a
+	 * dentry component that makes use of a dynamic name.
+	 */
+	if (use_dname(path))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	err = probe_read_prepend("\0", 1, buf, &buflen);
+	if (err)
+		return ERR_PTR(err);
+
+	if (probe_read_d_unlinked(dentry)) {
+		err = probe_read_prepend(" (deleted)", 10, buf, &buflen);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	len = buflen;
+	root = PROBE_READ(current->fs->root);
+	while (dentry != root.dentry || &mnt->mnt != root.mnt) {
+		struct dentry *parent;
+		if (dentry == PROBE_READ(mnt->mnt.mnt_root)) {
+			struct mount *m;
+
+			m = PROBE_READ(mnt->mnt_parent);
+			if (mnt != m) {
+				dentry = PROBE_READ(mnt->mnt_mountpoint);
+				mnt = m;
+				continue;
+			}
+
+			/*
+			 * If we've reached the global root, then there's
+			 * nothing we can really do but bail.
+			 */
+			break;
+		}
+
+		parent = PROBE_READ(dentry->d_parent);
+		if (dentry == parent) {
+			/*
+			 * Escaped? We return an ECANCELED error here to signify
+			 * that we've prematurely terminated pathname
+			 * reconstruction. We've potentially hit a root dentry
+			 * that isn't associated with any roots from the mounted
+			 * filesystems that we've jumped through, so it's not
+			 * clear where we are in the VFS exactly.
+			 */
+			err = -ECANCELED;
+			break;
+		}
+
+		err = probe_read_prepend(dentry->d_name.name,
+					 PROBE_READ(dentry->d_name.len), buf,
+					 &buflen);
+		if (err)
+			break;
+
+		err = probe_read_prepend("/", 1, buf, &buflen);
+		if (err)
+			break;
+		dentry = parent;
+	}
+
+	if (err)
+		return ERR_PTR(err);
+
+	if (len == buflen) {
+		err = probe_read_prepend("/", 1, buf, &buflen);
+		if (err)
+			return ERR_PTR(err);
+	}
+	return buf + buflen;
+}
diff --git a/include/linux/probe_read_d_path.h b/include/linux/probe_read_d_path.h
new file mode 100644
index 000000000000..9b3908746657
--- /dev/null
+++ b/include/linux/probe_read_d_path.h
@@ -0,0 +1,13 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Google LLC.
+ */
+
+#ifndef _LINUX_PROBE_READ_D_PATH_H
+#define _LINUX_PROBE_READ_D_PATH_H
+
+#include <linux/path.h>
+
+extern char *probe_read_d_path(const struct path *path, char *buf, int buflen);
+
+#endif /* _LINUX_PROBE_READ_D_PATH_H */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 241ddf5e3895..12dbd9cef1fa 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -25,6 +25,7 @@ 
 #include <linux/verification.h>
 #include <linux/namei.h>
 #include <linux/fileattr.h>
+#include <linux/probe_read_d_path.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -923,14 +924,12 @@  BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
 	if (len < 0)
 		return len;
 
-	p = d_path(&copy, buf, sz);
-	if (IS_ERR(p)) {
-		len = PTR_ERR(p);
-	} else {
-		len = buf + sz - p;
-		memmove(buf, p, len);
-	}
+	p = probe_read_d_path(&copy, buf, sz);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 
+	len = buf + sz - p;
+	memmove(buf, p, len);
 	return len;
 }