diff mbox series

[v3,1/3] fs: Support setting a minimum fd for "lowest available fd" allocation

Message ID 90bf6fd43343ca862e7f61b0834baf2bdbd0e24c.1586321767.git.josh@joshtriplett.org (mailing list archive)
State New, archived
Headers show
Series Support userspace-selected fds | expand

Commit Message

Josh Triplett April 8, 2020, 6:57 a.m. UTC
Some applications want to prevent the usual "lowest available fd"
allocation from allocating certain file descriptors. For instance, they
may want to prevent allocation of a closed fd 0, 1, or 2 other than via
dup2/dup3, or reserve some low file descriptors for other purposes.

Add a prctl, PR_INCREASE_MIN_FD, that atomically adds its argument to
the minimum fd and returns the previous minimum (an argument of 0 simply
reads the current minimum).

System calls that allocate a specific file descriptor, such as
dup2/dup3, ignore this minimum.

exec resets the minimum fd, to prevent one program from interfering with
another program's expectations about fd allocation.

Test program:

    #include <err.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/prctl.h>

    /*
     * Demonstrates PR_INCREASE_MIN_FD: raise the minimum fd used for
     * "lowest available fd" allocation by 100, then open a file and print
     * the descriptor that was allocated.
     */
    int main(int argc, char *argv[])
    {
        /* Adds 100 to the minimum; on success the previous minimum is returned. */
        if (prctl(PR_INCREASE_MIN_FD, 100, 0, 0, 0) < 0)
            err(1, "prctl");
        /* open() gets the lowest free fd that is not below the minimum. */
        int fd = open("/dev/null", O_RDONLY);
        if (fd < 0)
            err(1, "open");
        printf("%d\n", fd); // prints 100 (minimum is 0 after exec; assumes fd 100 is free)
        return 0;
    }

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
---
 fs/file.c                  | 23 +++++++++++++++++------
 include/linux/fdtable.h    |  1 +
 include/linux/file.h       |  1 +
 include/uapi/linux/prctl.h |  3 +++
 kernel/sys.c               |  5 +++++
 5 files changed, 27 insertions(+), 6 deletions(-)

Comments

Aleksa Sarai April 8, 2020, noon UTC | #1
On 2020-04-07, Josh Triplett <josh@joshtriplett.org> wrote:
> Some applications want to prevent the usual "lowest available fd"
> allocation from allocating certain file descriptors. For instance, they
> may want to prevent allocation of a closed fd 0, 1, or 2 other than via
> dup2/dup3, or reserve some low file descriptors for other purposes.
> 
> Add a prctl to increase the minimum fd and return the previous minimum.
> 
> System calls that allocate a specific file descriptor, such as
> dup2/dup3, ignore this minimum.
> 
> exec resets the minimum fd, to prevent one program from interfering with
> another program's expectations about fd allocation.

Why is it implemented as an "increase the value" interface? It feels
like this is meant to avoid some kind of security trap (with a library
reducing the value) but it means that if you want to temporarily raise
the minimum fd number it's not possible (without re-exec()ing yourself,
which is hardly a fun thing to do).

Then again, this might've been discussed before and I missed it...

> Test program:
> 
>     #include <err.h>
>     #include <fcntl.h>
>     #include <stdio.h>
>     #include <sys/prctl.h>
> 
>     int main(int argc, char *argv[])
>     {
>         if (prctl(PR_INCREASE_MIN_FD, 100, 0, 0, 0) < 0)
>             err(1, "prctl");
>         int fd = open("/dev/null", O_RDONLY);
>         if (fd < 0)
>             err(1, "open");
>         printf("%d\n", fd); // prints 100
>         return 0;
>     }
> 
> Signed-off-by: Josh Triplett <josh@joshtriplett.org>
> ---
>  fs/file.c                  | 23 +++++++++++++++++------
>  include/linux/fdtable.h    |  1 +
>  include/linux/file.h       |  1 +
>  include/uapi/linux/prctl.h |  3 +++
>  kernel/sys.c               |  5 +++++
>  5 files changed, 27 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index c8a4e4c86e55..ba06140d89af 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -286,7 +286,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
>  	spin_lock_init(&newf->file_lock);
>  	newf->resize_in_progress = false;
>  	init_waitqueue_head(&newf->resize_wait);
> -	newf->next_fd = 0;
>  	new_fdt = &newf->fdtab;
>  	new_fdt->max_fds = NR_OPEN_DEFAULT;
>  	new_fdt->close_on_exec = newf->close_on_exec_init;
> @@ -295,6 +294,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
>  	new_fdt->fd = &newf->fd_array[0];
>  
>  	spin_lock(&oldf->file_lock);
> +	newf->next_fd = newf->min_fd = oldf->min_fd;
>  	old_fdt = files_fdtable(oldf);
>  	open_files = count_open_files(old_fdt);
>  
> @@ -487,9 +487,7 @@ int __alloc_fd(struct files_struct *files,
>  	spin_lock(&files->file_lock);
>  repeat:
>  	fdt = files_fdtable(files);
> -	fd = start;
> -	if (fd < files->next_fd)
> -		fd = files->next_fd;
> +	fd = max3(start, files->min_fd, files->next_fd);
>  
>  	if (fd < fdt->max_fds)
>  		fd = find_next_fd(fdt, fd);
> @@ -514,7 +512,7 @@ int __alloc_fd(struct files_struct *files,
>  		goto repeat;
>  
>  	if (start <= files->next_fd)
> -		files->next_fd = fd + 1;
> +		files->next_fd = max(fd + 1, files->min_fd);
>  
>  	__set_open_fd(fd, fdt);
>  	if (flags & O_CLOEXEC)
> @@ -555,7 +553,7 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd)
>  {
>  	struct fdtable *fdt = files_fdtable(files);
>  	__clear_open_fd(fd, fdt);
> -	if (fd < files->next_fd)
> +	if (fd < files->next_fd && fd >= files->min_fd)
>  		files->next_fd = fd;
>  }
>  
> @@ -684,6 +682,7 @@ void do_close_on_exec(struct files_struct *files)
>  
>  	/* exec unshares first */
>  	spin_lock(&files->file_lock);
> +	files->min_fd = 0;
>  	for (i = 0; ; i++) {
>  		unsigned long set;
>  		unsigned fd = i * BITS_PER_LONG;
> @@ -865,6 +864,18 @@ bool get_close_on_exec(unsigned int fd)
>  	return res;
>  }
>  
> +unsigned int increase_min_fd(unsigned int num)
> +{
> +	struct files_struct *files = current->files;
> +	unsigned int old_min_fd;
> +
> +	spin_lock(&files->file_lock);
> +	old_min_fd = files->min_fd;
> +	files->min_fd += num;
> +	spin_unlock(&files->file_lock);
> +	return old_min_fd;
> +}
> +
>  static int do_dup2(struct files_struct *files,
>  	struct file *file, unsigned fd, unsigned flags)
>  __releases(&files->file_lock)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index f07c55ea0c22..d1980443d8b3 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -60,6 +60,7 @@ struct files_struct {
>     */
>  	spinlock_t file_lock ____cacheline_aligned_in_smp;
>  	unsigned int next_fd;
> +	unsigned int min_fd; /* min for "lowest available fd" allocation */
>  	unsigned long close_on_exec_init[1];
>  	unsigned long open_fds_init[1];
>  	unsigned long full_fds_bits_init[1];
> diff --git a/include/linux/file.h b/include/linux/file.h
> index 142d102f285e..b67986f818d2 100644
> --- a/include/linux/file.h
> +++ b/include/linux/file.h
> @@ -88,6 +88,7 @@ extern bool get_close_on_exec(unsigned int fd);
>  extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
>  extern int get_unused_fd_flags(unsigned flags);
>  extern void put_unused_fd(unsigned int fd);
> +extern unsigned int increase_min_fd(unsigned int num);
>  
>  extern void fd_install(unsigned int fd, struct file *file);
>  
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 07b4f8131e36..916327272d21 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -238,4 +238,7 @@ struct prctl_mm_map {
>  #define PR_SET_IO_FLUSHER		57
>  #define PR_GET_IO_FLUSHER		58
>  
> +/* Increase minimum file descriptor for "lowest available fd" allocation */
> +#define PR_INCREASE_MIN_FD		59
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/sys.c b/kernel/sys.c
> index d325f3ab624a..daa0ce43cecc 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2514,6 +2514,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  
>  		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
>  		break;
> +	case PR_INCREASE_MIN_FD:
> +		if (arg3 || arg4 || arg5)
> +			return -EINVAL;
> +		error = increase_min_fd((unsigned int)arg2);
> +		break;
>  	default:
>  		error = -EINVAL;
>  		break;
> -- 
> 2.26.0
>
Josh Triplett April 9, 2020, 3:17 a.m. UTC | #2
On Wed, Apr 08, 2020 at 10:00:40PM +1000, Aleksa Sarai wrote:
> On 2020-04-07, Josh Triplett <josh@joshtriplett.org> wrote:
> > Some applications want to prevent the usual "lowest available fd"
> > allocation from allocating certain file descriptors. For instance, they
> > may want to prevent allocation of a closed fd 0, 1, or 2 other than via
> > dup2/dup3, or reserve some low file descriptors for other purposes.
> > 
> > Add a prctl to increase the minimum fd and return the previous minimum.
> > 
> > System calls that allocate a specific file descriptor, such as
> > dup2/dup3, ignore this minimum.
> > 
> > exec resets the minimum fd, to prevent one program from interfering with
> > another program's expectations about fd allocation.
> 
> Why is it implemented as an "increase the value" interface? It feels
> like this is meant to avoid some kind of security trap (with a library
> reducing the value) but it means that if you want to temporarily raise
> the minimum fd number it's not possible (without re-exec()ing yourself,
> which is hardly a fun thing to do).
> 
> Then again, this might've been discussed before and I missed it...

It was: the previous version was a "get" and "set" interface. That
interface didn't allow for the possibility that something else in the
process had already set a minimum. This new atomic increase interface
(which also serves as a "get" interface if you pass 0) makes it possible
for a userspace library to reserve a range. (You have no guarantee about
previously allocated descriptors in that range, but you know that no
*new* automatically allocated descriptors will appear in that range,
which suffices; userspace can do the rest.)

- Josh Triplett
diff mbox series

Patch

diff --git a/fs/file.c b/fs/file.c
index c8a4e4c86e55..ba06140d89af 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -286,7 +286,6 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	spin_lock_init(&newf->file_lock);
 	newf->resize_in_progress = false;
 	init_waitqueue_head(&newf->resize_wait);
-	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
 	new_fdt->close_on_exec = newf->close_on_exec_init;
@@ -295,6 +294,7 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	new_fdt->fd = &newf->fd_array[0];
 
 	spin_lock(&oldf->file_lock);
+	newf->next_fd = newf->min_fd = oldf->min_fd;
 	old_fdt = files_fdtable(oldf);
 	open_files = count_open_files(old_fdt);
 
@@ -487,9 +487,7 @@  int __alloc_fd(struct files_struct *files,
 	spin_lock(&files->file_lock);
 repeat:
 	fdt = files_fdtable(files);
-	fd = start;
-	if (fd < files->next_fd)
-		fd = files->next_fd;
+	fd = max3(start, files->min_fd, files->next_fd);
 
 	if (fd < fdt->max_fds)
 		fd = find_next_fd(fdt, fd);
@@ -514,7 +512,7 @@  int __alloc_fd(struct files_struct *files,
 		goto repeat;
 
 	if (start <= files->next_fd)
-		files->next_fd = fd + 1;
+		files->next_fd = max(fd + 1, files->min_fd);
 
 	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
@@ -555,7 +553,7 @@  static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = files_fdtable(files);
 	__clear_open_fd(fd, fdt);
-	if (fd < files->next_fd)
+	if (fd < files->next_fd && fd >= files->min_fd)
 		files->next_fd = fd;
 }
 
@@ -684,6 +682,7 @@  void do_close_on_exec(struct files_struct *files)
 
 	/* exec unshares first */
 	spin_lock(&files->file_lock);
+	files->min_fd = 0;
 	for (i = 0; ; i++) {
 		unsigned long set;
 		unsigned fd = i * BITS_PER_LONG;
@@ -865,6 +864,18 @@  bool get_close_on_exec(unsigned int fd)
 	return res;
 }
 
+unsigned int increase_min_fd(unsigned int num)
+{
+	struct files_struct *files = current->files;
+	unsigned int old_min_fd;
+
+	spin_lock(&files->file_lock);
+	old_min_fd = files->min_fd;
+	files->min_fd += num;
+	spin_unlock(&files->file_lock);
+	return old_min_fd;
+}
+
 static int do_dup2(struct files_struct *files,
 	struct file *file, unsigned fd, unsigned flags)
 __releases(&files->file_lock)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f07c55ea0c22..d1980443d8b3 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -60,6 +60,7 @@  struct files_struct {
    */
 	spinlock_t file_lock ____cacheline_aligned_in_smp;
 	unsigned int next_fd;
+	unsigned int min_fd; /* min for "lowest available fd" allocation */
 	unsigned long close_on_exec_init[1];
 	unsigned long open_fds_init[1];
 	unsigned long full_fds_bits_init[1];
diff --git a/include/linux/file.h b/include/linux/file.h
index 142d102f285e..b67986f818d2 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -88,6 +88,7 @@  extern bool get_close_on_exec(unsigned int fd);
 extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
 extern int get_unused_fd_flags(unsigned flags);
 extern void put_unused_fd(unsigned int fd);
+extern unsigned int increase_min_fd(unsigned int num);
 
 extern void fd_install(unsigned int fd, struct file *file);
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 07b4f8131e36..916327272d21 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -238,4 +238,7 @@  struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER		57
 #define PR_GET_IO_FLUSHER		58
 
+/* Increase minimum file descriptor for "lowest available fd" allocation */
+#define PR_INCREASE_MIN_FD		59
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index d325f3ab624a..daa0ce43cecc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2514,6 +2514,11 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
 		break;
+	case PR_INCREASE_MIN_FD:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = increase_min_fd((unsigned int)arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;