@@ -48,6 +48,7 @@ Currently, these files are in /proc/sys/fs:
- suid_dumpable
- super-max
- super-nr
+- trust_policy
aio-nr & aio-max-nr
@@ -382,3 +383,52 @@ Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
on a 64bit one.
The current default value for max_user_watches is the 1/32 of the available
low memory, divided for the "watch" cost in bytes.
+
+
+trust_policy
+------------
+
+An interpreter can call :manpage:`trusted_for(2)` with a
+``TRUSTED_FOR_EXECUTION`` usage to check that opened regular files are expected
+to be executable. If the file is not identified as executable, then the
+syscall returns -EACCES. This may allow a script interpreter to check
+executable permission before reading commands from a file, or a dynamic linker
+to only load executable shared objects. One interesting use case is to enforce
+a "write xor execute" policy through interpreters.
+
+The ability to restrict code execution must be thought of as a system-wide
+policy, which first starts by restricting mount points with the ``noexec``
+option. This option is also automatically applied to special filesystems such
+as /proc. This prevents files on such mount points from being directly executed
+by the kernel or mapped as executable memory (e.g. libraries). With script
+interpreters using :manpage:`trusted_for(2)`, the executable permission can
+then be checked before reading commands from files. This makes it possible to
+enforce the ``noexec`` policy at the interpreter level, and thus propagates
+this security policy to scripts. To be fully effective, these interpreters
+also need to handle the other ways to execute code: command line parameters
+(e.g., option ``-e`` for Perl), module loading (e.g., option ``-m`` for
+Python), stdin, file sourcing, environment variables, configuration files,
+etc. Depending on the threat model, it may be acceptable to allow some script
+interpreters (e.g. Bash) to interpret commands from stdin, whether it is a TTY
+or a pipe, because it may not be enough to (directly) perform syscalls.
+
+There are two complementary security policies: enforce the ``noexec`` mount
+option, and enforce executable file permission. These policies are handled by
+the ``fs.trust_policy`` sysctl (writable only with ``CAP_SYS_ADMIN``) as a
+bitmask:
+
+1 - Mount restriction: checks that the mount options for the underlying VFS
+ mount do not prevent execution.
+
+2 - File permission restriction: checks that the file is marked as
+ executable for the current process (e.g., POSIX permissions, ACLs).
+
+Note that as long as a policy is enforced, checking any non-regular file with
+:manpage:`trusted_for(2)` returns -EACCES (e.g. TTYs, pipes), even when such a
+file is marked as executable or is on an executable mount point.
+
+Code samples can be found in
+tools/testing/selftests/interpreter/trust_policy_test.c and interpreter patches
+(for the original O_MAYEXEC) are available at
+https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC .
+See also an overview article: https://lwn.net/Articles/820000/ .
@@ -32,6 +32,8 @@
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
+#include <linux/sysctl.h>
+#include <uapi/linux/trusted-for.h>
#include "internal.h"
@@ -482,6 +484,81 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
return do_faccessat(AT_FDCWD, filename, mode, 0);
}
+#define TRUST_POLICY_EXEC_MOUNT BIT(0) /* deny files whose path is noexec */
+#define TRUST_POLICY_EXEC_FILE BIT(1) /* keep MAY_EXEC in the inode check */
+
+int sysctl_trust_policy __read_mostly; /* backs the "trust_policy" sysctl */
+
+SYSCALL_DEFINE3(trusted_for, const int, fd, const enum trusted_for_usage, usage,
+		const u32, flags)
+{
+	/*
+	 * Snapshot the sysctl once: it can be rewritten concurrently, and every
+	 * check below must be evaluated against the same policy value.
+	 */
+	const int policy = READ_ONCE(sysctl_trust_policy);
+	int mask, err = -EACCES;
+	struct fd f;
+	struct inode *inode;
+
+	if (flags)
+		return -EINVAL;
+
+	/* Only handles execution for now. */
+	if (usage != TRUSTED_FOR_EXECUTION)
+		return -EINVAL;
+	mask = MAY_EXEC;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	inode = d_backing_inode(f.file->f_path.dentry);
+
+	if ((mask & MAY_EXEC)) {
+		/*
+		 * If there is a system-wide execute policy enforced, then
+		 * forbids access to non-regular files and special superblocks.
+		 */
+		if ((policy & (TRUST_POLICY_EXEC_MOUNT |
+						TRUST_POLICY_EXEC_FILE))) {
+			if (!S_ISREG(inode->i_mode))
+				goto out_fd;
+			/*
+			 * Denies access to pseudo filesystems that will never
+			 * be mountable (e.g. sockfs, pipefs) but can still be
+			 * reachable through /proc/self/fd, or memfd-like file
+			 * descriptors, or nsfs-like files.
+			 *
+			 * According to the selftests, SB_NOEXEC seems to be
+			 * only used by proc and nsfs filesystems.
+			 */
+			if ((f.file->f_path.dentry->d_sb->s_flags &
+						(SB_NOUSER | SB_KERNMOUNT | SB_NOEXEC)))
+				goto out_fd;
+		}
+
+		if ((policy & TRUST_POLICY_EXEC_MOUNT) &&
+				path_noexec(&f.file->f_path))
+			goto out_fd;
+		/*
+		 * For compatibility reasons, if the system-wide policy doesn't
+		 * enforce file permission checks, then replaces the execute
+		 * permission request with a read permission request: for user
+		 * space, executing data (e.g. scripts) implies reading it.
+		 */
+		if (!(policy & TRUST_POLICY_EXEC_FILE))
+			mask &= ~MAY_EXEC;
+		/* To be executed *by* user space, files must be readable. */
+		mask |= MAY_READ;
+	}
+
+	err = inode_permission(inode, mask | MAY_ACCESS);
+
+out_fd:
+	fdput(f);
+	return err;
+}
+
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
struct path path;
@@ -83,6 +83,7 @@ extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
extern int sysctl_protected_fifos;
extern int sysctl_protected_regular;
+extern int sysctl_trust_policy;
typedef __kernel_rwf_t rwf_t;
@@ -429,6 +429,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
asmlinkage long sys_faccessat2(int dfd, const char __user *filename, int mode,
int flags);
+asmlinkage long sys_trusted_for(int fd, enum trusted_for_usage usage, u32 flags);
asmlinkage long sys_chdir(const char __user *filename);
asmlinkage long sys_fchdir(unsigned int fd);
asmlinkage long sys_chroot(const char __user *filename);
new file mode 100644
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_TRUSTED_FOR_H
+#define _UAPI_LINUX_TRUSTED_FOR_H
+
+/**
+ * enum trusted_for_usage - Usage for which a file descriptor is trusted
+ *
+ * Argument of trusted_for(2), which rejects any other value with -EINVAL.
+ */
+enum trusted_for_usage {
+ /**
+ * @TRUSTED_FOR_EXECUTION: Check that the data read from a file
+ * descriptor is trusted to be executed or interpreted (e.g. scripts).
+ */
+ TRUSTED_FOR_EXECUTION = 1,
+};
+
+#endif /* _UAPI_LINUX_TRUSTED_FOR_H */
@@ -113,6 +113,7 @@ static int sixty = 60;
static int __maybe_unused neg_one = -1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
static unsigned long zero_ul;
static unsigned long one_ul = 1;
@@ -887,7 +888,6 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
-#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -896,7 +896,6 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
-#endif
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
@@ -3301,6 +3300,15 @@ static struct ctl_table fs_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
+ {
+ .procname = "trust_policy",
+ .data = &sysctl_trust_policy,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &three,
+ },
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
.procname = "binfmt_misc",