diff mbox series

[RFC] more close_range() fun

Message ID 20240816030341.GW13701@ZenIV (mailing list archive)
State New
Headers show
Series [RFC] more close_range() fun | expand

Commit Message

Al Viro Aug. 16, 2024, 3:03 a.m. UTC
Thread 1:
	if (dup2(0, 1023) >= 0)
		dup2(0, 10);

Thread 2:
	if (close_range(64, 127, CLOSE_RANGE_UNSHARE) == 0 &&
	    fcntl(10, F_GETFD) >= 0 &&
	    fcntl(1023, F_GETFD) == -1)
		printf("broken");


Note that close_range() call in the second thread does not
affect any of the descriptors we work with in the first thread
and at no point does thread 1 have descriptor 10 opened without
descriptor 1023 also being opened.

It *can* actually happen - all it takes is close_range(2) decision
to trim the copied descriptor table made before the first dup2()
and actual copying done after both dup2() are done.

I would not expect that printf to trigger - not without having
looked through the close_range(2) implementation.  Note that
manpage doesn't even hint at anything of that sort.

IMO it's a QoI issue at the very least, and arguably an outright
bug.

Note that there is a case where everything works fine, and I suspect
that most of the callers where we want trimming are of that sort -
if the second argument of close_range() is above the INT_MAX.

If that's the only case where we want trimming to happen, the
fix is trivial; if not... also doable.  We just need to pass the
range to be punched out all the way down to sane_fdtable_size()
(e.g. as a pointer, NULL meaning "no holes to punch").  I wouldn't
bother with unshare_fd() - it's not hard for __close_range() to
use dup_fd() instead.

Something like this (completely untested), perhaps?

Comments

Al Viro Aug. 16, 2024, 3:07 a.m. UTC | #1
On Fri, Aug 16, 2024 at 04:03:41AM +0100, Al Viro wrote:
> Thread 1:
> 	if (dup2(0, 1023) >= 0)
> 		dup2(0, 10);
> 
> Thread 2:
> 	if (close_range(64, 127, CLOSE_RANGE_UNSHARE) == 0 &&
> 	    fcntl(10, F_GETFD) >= 0 &&
> 	    fcntl(1023, F_GETFD) == -1)
> 		printf("broken");
> 
> 
> Note that close_range() call in the second thread does not
> affect any of the descriptors we work with in the first thread
> and at no point does thread 1 have descriptor 10 opened without
> descriptor 1023 also being opened.
> 
> It *can* actually happen - all it takes is close_range(2) decision
> to trim the copied descriptor table made before the first dup2()
> and actual copying done after both dup2() are done.
> 
> I would not expect that printf to trigger - not without having
> looked through the close_range(2) implementation.  Note that
> manpage doesn't even hint at anything of that sort.
> 
> IMO it's a QoI issue at the very least, and arguably an outright
> bug.
> 
> Note that there is a case where everything works fine, and I suspect
> that most of the callers where we want trimming are of that sort -
> if the second argument of close_range() is above the INT_MAX.
> 
> If that's the only case where we want trimming to happen, the
> fix is trivial; if not... also doable.  We just need to pass the
> range to be punched out all the way down to sane_fdtable_size()
> (e.g. as a pointer, NULL meaning "no holes to punch").  I wouldn't
> bother with unshare_fd() - it's not hard for __close_range() to
> use dup_fd() instead.
> 
> Something like this (completely untested), perhaps?

or, with a braino fixed, this:

diff --git a/fs/file.c b/fs/file.c
index 655338effe9c..312574fda320 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -272,20 +272,6 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
 	return test_bit(fd, fdt->open_fds);
 }
 
-static unsigned int count_open_files(struct fdtable *fdt)
-{
-	unsigned int size = fdt->max_fds;
-	unsigned int i;
-
-	/* Find the last open fd */
-	for (i = size / BITS_PER_LONG; i > 0; ) {
-		if (fdt->open_fds[--i])
-			break;
-	}
-	i = (i + 1) * BITS_PER_LONG;
-	return i;
-}
-
 /*
  * Note that a sane fdtable size always has to be a multiple of
  * BITS_PER_LONG, since we have bitmaps that are sized by this.
@@ -299,14 +285,18 @@ static unsigned int count_open_files(struct fdtable *fdt)
  * just make that BITS_PER_LONG alignment be part of a sane
  * fdtable size. Becuase that's really what it is.
  */
-static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int *punch_hole)
 {
-	unsigned int count;
-
-	count = count_open_files(fdt);
-	if (max_fds < NR_OPEN_DEFAULT)
-		max_fds = NR_OPEN_DEFAULT;
-	return ALIGN(min(count, max_fds), BITS_PER_LONG);
+	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
+
+	if (last == fdt->max_fds)	// empty
+		return NR_OPEN_DEFAULT;
+	if (punch_hole && punch_hole[1] >= last && punch_hole[0] <= last) {
+		last = find_last_bit(fdt->open_fds, punch_hole[0]);
+		if (last == punch_hole[0])
+			return NR_OPEN_DEFAULT;
+	}
+	return ALIGN(last + 1, BITS_PER_LONG);
 }
 
 /*
@@ -314,7 +304,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
  * passed in files structure.
  * errorp will be valid only when the returned files_struct is NULL.
  */
-struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int *punch_hole, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
@@ -341,7 +331,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
-	open_files = sane_fdtable_size(old_fdt, max_fds);
+	open_files = sane_fdtable_size(old_fdt, punch_hole);
 
 	/*
 	 * Check whether we need to allocate a larger fd array and fd set.
@@ -372,7 +362,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 		 */
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
-		open_files = sane_fdtable_size(old_fdt, max_fds);
+		open_files = sane_fdtable_size(old_fdt, punch_hole);
 	}
 
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
@@ -748,37 +738,26 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	if (fd > max_fd)
 		return -EINVAL;
 
-	if (flags & CLOSE_RANGE_UNSHARE) {
+	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
 		int ret;
-		unsigned int max_unshare_fds = NR_OPEN_MAX;
+		unsigned int range[2] = {fd, max_fd}, *punch_hole = range;
 
 		/*
 		 * If the caller requested all fds to be made cloexec we always
 		 * copy all of the file descriptors since they still want to
 		 * use them.
 		 */
-		if (!(flags & CLOSE_RANGE_CLOEXEC)) {
-			/*
-			 * If the requested range is greater than the current
-			 * maximum, we're closing everything so only copy all
-			 * file descriptors beneath the lowest file descriptor.
-			 */
-			rcu_read_lock();
-			if (max_fd >= last_fd(files_fdtable(cur_fds)))
-				max_unshare_fds = fd;
-			rcu_read_unlock();
-		}
+		if (flags & CLOSE_RANGE_CLOEXEC)
+			punch_hole = NULL;
 
-		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
-		if (ret)
+		fds = dup_fd(cur_fds, punch_hole, &ret);
+		if (!fds)
 			return ret;
-
 		/*
 		 * We used to share our file descriptor table, and have now
 		 * created a private one, make sure we're using it below.
 		 */
-		if (fds)
-			swap(cur_fds, fds);
+		swap(cur_fds, fds);
 	}
 
 	if (flags & CLOSE_RANGE_CLOEXEC)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 2944d4aa413b..6bc4353f919e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -22,7 +22,6 @@
  * as this is the granularity returned by copy_fdset().
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
-#define NR_OPEN_MAX ~0U
 
 struct fdtable {
 	unsigned int max_fds;
@@ -106,7 +105,7 @@ struct task_struct;
 
 void put_files_struct(struct files_struct *fs);
 int unshare_files(void);
-struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
+struct files_struct *dup_fd(struct files_struct *, unsigned *, int *) __latent_entropy;
 void do_close_on_exec(struct files_struct *);
 int iterate_fd(struct files_struct *, unsigned,
 		int (*)(const void *, struct file *, unsigned),
@@ -115,8 +114,7 @@ int iterate_fd(struct files_struct *, unsigned,
 extern int close_fd(unsigned int fd);
 extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
 extern struct file *file_close_fd(unsigned int fd);
-extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
-		      struct files_struct **new_fdp);
+extern int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp);
 
 extern struct kmem_cache *files_cachep;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..a7c905f06048 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1773,7 +1773,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
 		goto out;
 	}
 
-	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
+	newf = dup_fd(oldf, NULL, &error);
 	if (!newf)
 		goto out;
 
@@ -3232,15 +3232,14 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 /*
  * Unshare file descriptor table if it is being shared
  */
-int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
-	       struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 {
 	struct files_struct *fd = current->files;
 	int error = 0;
 
 	if ((unshare_flags & CLONE_FILES) &&
 	    (fd && atomic_read(&fd->count) > 1)) {
-		*new_fdp = dup_fd(fd, max_fds, &error);
+		*new_fdp = dup_fd(fd, NULL, &error);
 		if (!*new_fdp)
 			return error;
 	}
@@ -3300,7 +3299,7 @@ int ksys_unshare(unsigned long unshare_flags)
 	err = unshare_fs(unshare_flags, &new_fs);
 	if (err)
 		goto bad_unshare_out;
-	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
+	err = unshare_fd(unshare_flags, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
 	err = unshare_userns(unshare_flags, &new_cred);
@@ -3392,7 +3391,7 @@ int unshare_files(void)
 	struct files_struct *old, *copy = NULL;
 	int error;
 
-	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+	error = unshare_fd(CLONE_FILES, &copy);
 	if (error || !copy)
 		return error;
Christian Brauner Aug. 16, 2024, 8:25 a.m. UTC | #2
On Fri, Aug 16, 2024 at 04:03:41AM GMT, Al Viro wrote:
> Thread 1:
> 	if (dup2(0, 1023) >= 0)
> 		dup2(0, 10);
> 
> Thread 2:
> 	if (close_range(64, 127, CLOSE_RANGE_UNSHARE) == 0 &&
> 	    fcntl(10, F_GETFD) >= 0 &&
> 	    fcntl(1023, F_GETFD) == -1)
> 		printf("broken");
> 
> 
> Note that close_range() call in the second thread does not
> affect any of the descriptors we work with in the first thread
> and at no point does thread 1 have descriptor 10 opened without
> descriptor 1023 also being opened.
> 
> It *can* actually happen - all it takes is close_range(2) decision
> to trim the copied descriptor table made before the first dup2()
> and actual copying done after both dup2() are done.
> 
> I would not expect that printf to trigger - not without having
> looked through the close_range(2) implementation.  Note that
> manpage doesn't even hint at anything of that sort.
> 
> IMO it's a QoI issue at the very least, and arguably an outright
> bug.

I don't think so. It is clear that the file descriptor table is unshared
and that fds are closed afterwards and that this can race with file
descriptors being inserted into the currently shared fdtable. Imho,
there's nothing to fix here.

I also question whether any userspace out there has any such ordering
expectations between the two dup2()s and the close_range() call and
specifically whether we should even bother giving any such guarantees.

If you feel strongly about this then by all means change it but I really
think this isn't necessary.

> 
> Note that there is a case where everything works fine, and I suspect
> that most of the callers where we want trimming are of that sort -
> if the second argument of close_range() is above the INT_MAX.
> 
> If that's the only case where we want trimming to happen, the
> fix is trivial; if not... also doable.  We just need to pass the
> range to be punched out all the way down to sane_fdtable_size()
> (e.g. as a pointer, NULL meaning "no holes to punch").  I wouldn't
> bother with unshare_fd() - it's not hard for __close_range() to
> use dup_fd() instead.
> 
> Something like this (completely untested), perhaps?
> 
> diff --git a/fs/file.c b/fs/file.c
> index 655338effe9c..870c0c65530b 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -272,20 +272,6 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
>  	return test_bit(fd, fdt->open_fds);
>  }
>  
> -static unsigned int count_open_files(struct fdtable *fdt)
> -{
> -	unsigned int size = fdt->max_fds;
> -	unsigned int i;
> -
> -	/* Find the last open fd */
> -	for (i = size / BITS_PER_LONG; i > 0; ) {
> -		if (fdt->open_fds[--i])
> -			break;
> -	}
> -	i = (i + 1) * BITS_PER_LONG;
> -	return i;
> -}
> -
>  /*
>   * Note that a sane fdtable size always has to be a multiple of
>   * BITS_PER_LONG, since we have bitmaps that are sized by this.
> @@ -299,14 +285,18 @@ static unsigned int count_open_files(struct fdtable *fdt)
>   * just make that BITS_PER_LONG alignment be part of a sane
>   * fdtable size. Becuase that's really what it is.
>   */
> -static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
> +static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int *punch_hole)
>  {
> -	unsigned int count;
> -
> -	count = count_open_files(fdt);
> -	if (max_fds < NR_OPEN_DEFAULT)
> -		max_fds = NR_OPEN_DEFAULT;
> -	return ALIGN(min(count, max_fds), BITS_PER_LONG);
> +	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
> +
> +	if (last == fdt->max_fds)	// empty
> +		return NR_OPEN_DEFAULT;
> +	if (punch_hole && punch_hole[1] >= last) {
> +		last = find_last_bit(fdt->open_fds, punch_hole[0]);
> +		if (last == punch_hole[0])
> +			return NR_OPEN_DEFAULT;
> +	}
> +	return ALIGN(last + 1, BITS_PER_LONG);
>  }
>  
>  /*
> @@ -314,7 +304,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
>   * passed in files structure.
>   * errorp will be valid only when the returned files_struct is NULL.
>   */
> -struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
> +struct files_struct *dup_fd(struct files_struct *oldf, unsigned int *punch_hole, int *errorp)
>  {
>  	struct files_struct *newf;
>  	struct file **old_fds, **new_fds;
> @@ -341,7 +331,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
>  
>  	spin_lock(&oldf->file_lock);
>  	old_fdt = files_fdtable(oldf);
> -	open_files = sane_fdtable_size(old_fdt, max_fds);
> +	open_files = sane_fdtable_size(old_fdt, punch_hole);
>  
>  	/*
>  	 * Check whether we need to allocate a larger fd array and fd set.
> @@ -372,7 +362,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
>  		 */
>  		spin_lock(&oldf->file_lock);
>  		old_fdt = files_fdtable(oldf);
> -		open_files = sane_fdtable_size(old_fdt, max_fds);
> +		open_files = sane_fdtable_size(old_fdt, punch_hole);
>  	}
>  
>  	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
> @@ -748,37 +738,26 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>  	if (fd > max_fd)
>  		return -EINVAL;
>  
> -	if (flags & CLOSE_RANGE_UNSHARE) {
> +	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
>  		int ret;
> -		unsigned int max_unshare_fds = NR_OPEN_MAX;
> +		unsigned int range[2] = {fd, max_fd}, *punch_hole = range;
>  
>  		/*
>  		 * If the caller requested all fds to be made cloexec we always
>  		 * copy all of the file descriptors since they still want to
>  		 * use them.
>  		 */
> -		if (!(flags & CLOSE_RANGE_CLOEXEC)) {
> -			/*
> -			 * If the requested range is greater than the current
> -			 * maximum, we're closing everything so only copy all
> -			 * file descriptors beneath the lowest file descriptor.
> -			 */
> -			rcu_read_lock();
> -			if (max_fd >= last_fd(files_fdtable(cur_fds)))
> -				max_unshare_fds = fd;
> -			rcu_read_unlock();
> -		}
> +		if (flags & CLOSE_RANGE_CLOEXEC)
> +			punch_hole = NULL;
>  
> -		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
> -		if (ret)
> +		fds = dup_fd(cur_fds, punch_hole, &ret);
> +		if (!fds)
>  			return ret;
> -
>  		/*
>  		 * We used to share our file descriptor table, and have now
>  		 * created a private one, make sure we're using it below.
>  		 */
> -		if (fds)
> -			swap(cur_fds, fds);
> +		swap(cur_fds, fds);
>  	}
>  
>  	if (flags & CLOSE_RANGE_CLOEXEC)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 2944d4aa413b..6bc4353f919e 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -22,7 +22,6 @@
>   * as this is the granularity returned by copy_fdset().
>   */
>  #define NR_OPEN_DEFAULT BITS_PER_LONG
> -#define NR_OPEN_MAX ~0U
>  
>  struct fdtable {
>  	unsigned int max_fds;
> @@ -106,7 +105,7 @@ struct task_struct;
>  
>  void put_files_struct(struct files_struct *fs);
>  int unshare_files(void);
> -struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
> +struct files_struct *dup_fd(struct files_struct *, unsigned *, int *) __latent_entropy;
>  void do_close_on_exec(struct files_struct *);
>  int iterate_fd(struct files_struct *, unsigned,
>  		int (*)(const void *, struct file *, unsigned),
> @@ -115,8 +114,7 @@ int iterate_fd(struct files_struct *, unsigned,
>  extern int close_fd(unsigned int fd);
>  extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
>  extern struct file *file_close_fd(unsigned int fd);
> -extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
> -		      struct files_struct **new_fdp);
> +extern int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp);
>  
>  extern struct kmem_cache *files_cachep;
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..a7c905f06048 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1773,7 +1773,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
>  		goto out;
>  	}
>  
> -	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
> +	newf = dup_fd(oldf, NULL, &error);
>  	if (!newf)
>  		goto out;
>  
> @@ -3232,15 +3232,14 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
>  /*
>   * Unshare file descriptor table if it is being shared
>   */
> -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
> -	       struct files_struct **new_fdp)
> +int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
>  {
>  	struct files_struct *fd = current->files;
>  	int error = 0;
>  
>  	if ((unshare_flags & CLONE_FILES) &&
>  	    (fd && atomic_read(&fd->count) > 1)) {
> -		*new_fdp = dup_fd(fd, max_fds, &error);
> +		*new_fdp = dup_fd(fd, NULL, &error);
>  		if (!*new_fdp)
>  			return error;
>  	}
> @@ -3300,7 +3299,7 @@ int ksys_unshare(unsigned long unshare_flags)
>  	err = unshare_fs(unshare_flags, &new_fs);
>  	if (err)
>  		goto bad_unshare_out;
> -	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
> +	err = unshare_fd(unshare_flags, &new_fd);
>  	if (err)
>  		goto bad_unshare_cleanup_fs;
>  	err = unshare_userns(unshare_flags, &new_cred);
> @@ -3392,7 +3391,7 @@ int unshare_files(void)
>  	struct files_struct *old, *copy = NULL;
>  	int error;
>  
> -	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
> +	error = unshare_fd(CLONE_FILES, &copy);
>  	if (error || !copy)
>  		return error;
>
Al Viro Aug. 16, 2024, 11:15 a.m. UTC | #3
On Fri, Aug 16, 2024 at 10:25:52AM +0200, Christian Brauner wrote:

> I don't think so. It is clear that the file descriptor table is unshared
> and that fds are closed afterwards and that this can race with file
> descriptors being inserted into the currently shared fdtable. Imho,
> there's nothing to fix here.
> 
> I also question whether any userspace out there has any such ordering
> expectations between the two dup2()s and the close_range() call and
> specifically whether we should even bother giving any such guarantees.

Huh?

It's not those dup2() vs unsharing; it's relative order of those dup2().

Hell, make that

	dup2(0, 1023);
	dup2(1023, 10);

Do you agree that asynchronous code observing 10 already open, but 1023
still not open would be unexpected?
Al Viro Aug. 16, 2024, 11:49 a.m. UTC | #4
On Fri, Aug 16, 2024 at 12:15:12PM +0100, Al Viro wrote:
> On Fri, Aug 16, 2024 at 10:25:52AM +0200, Christian Brauner wrote:
> 
> > I don't think so. It is clear that the file descriptor table is unshared
> > and that fds are closed afterwards and that this can race with file
> > descriptors being inserted into the currently shared fdtable. Imho,
> > there's nothing to fix here.
> > 
> > I also question whether any userspace out there has any such ordering
> > expectations between the two dup2()s and the close_range() call and
> > specifically whether we should even bother giving any such guarantees.
> 
> Huh?
> 
> It's not those dup2() vs unsharing; it's relative order of those dup2().
> 
> Hell, make that
> 
> 	dup2(0, 1023);
> 	dup2(1023, 10);
> 
> Do you agree that asynchronous code observing 10 already open, but 1023
> still not open would be unexpected?

FWIW, for descriptor table unsharing we do (except for that odd case) have
the following:
	* the effect of operations not ordered wrt unshare (i.e. done
by another thread with no userland serialization) may or may not be
visible in the unshared copy; however, if two operations are ordered
wrt to each other, we won't see the effect of the later one without the
effect of the earlier.

Here neither of those dup2() is ordered wrt unsharing close_range();
we might see the effect of both or none or only the first one, but
seeing the effect of the second _without_ the effect of the first is
very odd, especially since the effect of the second does depend upon
just the state change we do *NOT* see.

Actual closing done by unsharing close_range() is not an issue - none
of the affected descriptors are getting closed.  It's the unshare
part that is deeply odd here.  And yes, unshare(2) (or clone(2) without
CLONE_FILES) would have the ordering guarantees I'm talking about.
Linus Torvalds Aug. 16, 2024, 4:26 p.m. UTC | #5
On Thu, 15 Aug 2024 at 20:03, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> It *can* actually happen - all it takes is close_range(2) decision
> to trim the copied descriptor table made before the first dup2()
> and actual copying done after both dup2() are done.

I think this is fine. It's one of those "if user threads have no
serialization, they get what they get" situations.

IOW, I can't actually imagine that anybody would be affected by this
in any sane real situation. If you unshare your file descriptors while
another thread is modifying it, and you don't have any serialization
in user space, you are doing odd things.

Now, I'm not opposed to improving on this, but I do think this is a
"stupid is as stupid does" kind of thing that we shouldn't care about.

          Linus
Al Viro Aug. 16, 2024, 5:19 p.m. UTC | #6
On Fri, Aug 16, 2024 at 09:26:45AM -0700, Linus Torvalds wrote:
> On Thu, 15 Aug 2024 at 20:03, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > It *can* actually happen - all it takes is close_range(2) decision
> > to trim the copied descriptor table made before the first dup2()
> > and actual copying done after both dup2() are done.
> 
> I think this is fine. It's one of those "if user threads have no
> serialization, they get what they get" situations.

As it is, unshare(CLOSE_FILES) gives you a state that might be possible
if you e.g. attached a debugger to the process and poked around in
descriptor table.  CLOSE_RANGE_UNSHARE is supposed to be a shortcut
for unshare + plain close_range(), so having it end up with weird
states looks wrong.

For descriptor tables we have something very close to TSO (and possibly
the full TSO - I'll need to get some coffee and go through the barriers
we've got on the lockless side of fd_install()); this, OTOH, is not
quite Alpha-level weirdness, but it's not far from that.  And unlike
Alpha we don't have excuses along the lines of "it's cheaper that way" -
it really isn't any cheaper.

The variant I'm testing right now seems to be doing fine (LTP and about
halfway through the xfstests, with no regressions and no slowdowns)
and it's at
 fs/file.c               | 63 +++++++++++++++++--------------------------------
 include/linux/fdtable.h |  6 ++---
 kernel/fork.c           | 11 ++++-----
 3 files changed, 28 insertions(+), 52 deletions(-)

Basically,
	* switch CLOSE_UNSHARE_RANGE from unshare_fd() to dup_fd()
	* instead of "trim down to that much" pass dup_fd() an
optional "we'll be punching a hole from <this> to <that>", which
gets passed to sane_fdtable_size() (NULL == no hole to be punched).
	* in sane_fdtable_size()
		find last occupied bit in ->open_fds[]
		if asked to punch a hole and if that last bit is within
the hole, find last occupied bit below the hole
		round up last occupied plus 1 to BITS_PER_LONG.
All it takes, and IMO it's simpler that way.
Al Viro Aug. 16, 2024, 5:22 p.m. UTC | #7
On Fri, Aug 16, 2024 at 06:19:25PM +0100, Al Viro wrote:
> On Fri, Aug 16, 2024 at 09:26:45AM -0700, Linus Torvalds wrote:
> > On Thu, 15 Aug 2024 at 20:03, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > It *can* actually happen - all it takes is close_range(2) decision
> > > to trim the copied descriptor table made before the first dup2()
> > > and actual copying done after both dup2() are done.
> > 
> > I think this is fine. It's one of those "if user threads have no
> > serialization, they get what they get" situations.
> 
> As it is, unshare(CLOSE_FILES) gives you a state that might be possible

CLONE_FILES, that is.
Linus Torvalds Aug. 16, 2024, 5:55 p.m. UTC | #8
On Fri, 16 Aug 2024 at 10:19, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> All it takes, and IMO it's simpler that way.

Hey, if it' simpler and gives more natural semantics, I obviously
won't argue against it.

That said, I do hate your "punch_hole" argument. At least make it a
'struct' with start/end, not a random int pointer, ok?

Oh, and can we please make 'dup_fd()' return an error pointer instead
of having that other int pointer argument for the error code?

I wonder why it was done that way - it goes back to 2006 and commit
a016f3389c06 ("unshare system call -v5: unshare files"), it's not like
it's some ancient interface that predates that model.

                Linus
Linus Torvalds Aug. 16, 2024, 5:58 p.m. UTC | #9
On Fri, 16 Aug 2024 at 10:55, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Oh, and can we please make 'dup_fd()' return an error pointer instead
> of having that other int pointer argument for the error code?

Something like this, IOW.. Entirely untested, but looks obvious enough.

             Linus
Al Viro Aug. 16, 2024, 6:15 p.m. UTC | #10
On Fri, Aug 16, 2024 at 10:55:30AM -0700, Linus Torvalds wrote:
> On Fri, 16 Aug 2024 at 10:19, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > All it takes, and IMO it's simpler that way.
> 
> Hey, if it' simpler and gives more natural semantics, I obviously
> won't argue against it.
> 
> That said, I do hate your "punch_hole" argument. At least make it a
> 'struct' with start/end, not a random int pointer, ok?
> 
> Oh, and can we please make 'dup_fd()' return an error pointer instead
> of having that other int pointer argument for the error code?

As in https://lore.kernel.org/all/20240812064427.240190-11-viro@zeniv.linux.org.uk/?

Sure, I can separate it from dependency on alloc_fd() calling conventions
(previous patch in that series) and fold it into this one...
Linus Torvalds Aug. 16, 2024, 6:26 p.m. UTC | #11
On Fri, 16 Aug 2024 at 11:15, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> As in https://lore.kernel.org/all/20240812064427.240190-11-viro@zeniv.linux.org.uk/?

Heh. Ack.

           Linus
Al Viro Aug. 16, 2024, 8:26 p.m. UTC | #12
On Fri, Aug 16, 2024 at 11:26:10AM -0700, Linus Torvalds wrote:
> On Fri, 16 Aug 2024 at 11:15, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > As in https://lore.kernel.org/all/20240812064427.240190-11-viro@zeniv.linux.org.uk/?
> 
> Heh. Ack.

Variant being tested right now:

commit e52e593a8494feb745489cdf5e826ca06d32f495
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Fri Aug 16 15:17:00 2024 -0400

    close_range(): fix the logics in descriptor table trimming
    
    Cloning a descriptor table picks the size that would cover all currently
    opened files.  That's fine for clone() and unshare(), but for close_range()
    there's an additional twist - we clone before we close, and it would be
    a shame to have
            close_range(3, ~0U, CLOSE_RANGE_UNSHARE)
    leave us with a huge descriptor table when we are not going to keep
    anything past stderr, just because some large file descriptor used to
    be open before our call has taken it out.
    
    Unfortunately, it had been dealt with in an inherently racy way -
    sane_fdtable_size() gets a "don't copy anything past that" argument
    (passed via unshare_fd() and dup_fd()), close_range() decides how much
    should be trimmed and passes that to unshare_fd().
    
    The problem is, a range that used to extend to the end of descriptor
    table back when close_range() had looked at it might very well have stuff
    grown after it by the time dup_fd() has allocated a new files_struct
    and started to figure out the capacity of fdtable to be attached to that.
    
    That leads to interesting pathological cases; at the very least it's a
    QoI issue, since unshare(CLONE_FILES) is atomic in a sense that it takes
    a snapshot of descriptor table one might have observed at some point.
    Since CLOSE_RANGE_UNSHARE close_range() is supposed to be a combination
    of unshare(CLONE_FILES) with plain close_range(), ending up with a
    weird state that would never occur with unshare(2) is confusing, to put
    it mildly.
    
    It's not hard to get rid of - all it takes is passing both ends of the
    range down to sane_fdtable_size().  There we are under ->files_lock,
    so the race is trivially avoided.
    
    So we do the following:
            * switch close_files() from calling unshare_fd() to calling
    dup_fd().
            * undo the calling convention change done to unshare_fd() in
    60997c3d45d9 "close_range: add CLOSE_RANGE_UNSHARE"
            * introduce struct fd_range, pass a pointer to that to dup_fd()
    and sane_fdtable_size() instead of "trim everything past that point"
    they are currently getting.  NULL means "we are not going to be punching
    any holes"; NR_OPEN_MAX is gone.
            * make sane_fdtable_size() use find_last_bit() instead of
    open-coding it; it's easier to follow that way.
            * while we are at it, have dup_fd() report errors by returning
    ERR_PTR(), no need to use a separate int *errorp argument.
    
    Fixes: 60997c3d45d9 "close_range: add CLOSE_RANGE_UNSHARE"
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file.c b/fs/file.c
index 655338effe9c..c2403cde40e4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -272,59 +272,45 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
 	return test_bit(fd, fdt->open_fds);
 }
 
-static unsigned int count_open_files(struct fdtable *fdt)
-{
-	unsigned int size = fdt->max_fds;
-	unsigned int i;
-
-	/* Find the last open fd */
-	for (i = size / BITS_PER_LONG; i > 0; ) {
-		if (fdt->open_fds[--i])
-			break;
-	}
-	i = (i + 1) * BITS_PER_LONG;
-	return i;
-}
-
 /*
  * Note that a sane fdtable size always has to be a multiple of
  * BITS_PER_LONG, since we have bitmaps that are sized by this.
  *
- * 'max_fds' will normally already be properly aligned, but it
- * turns out that in the close_range() -> __close_range() ->
- * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
- * up having a 'max_fds' value that isn't already aligned.
- *
- * Rather than make close_range() have to worry about this,
- * just make that BITS_PER_LONG alignment be part of a sane
- * fdtable size. Becuase that's really what it is.
+ * punch_hole is optional - when close_range() is asked to unshare
+ * and close, we don't need to copy descriptors in that range, so
+ * a smaller cloned descriptor table might suffice if the last
+ * currently opened descriptor falls into that range.
  */
-static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
 {
-	unsigned int count;
-
-	count = count_open_files(fdt);
-	if (max_fds < NR_OPEN_DEFAULT)
-		max_fds = NR_OPEN_DEFAULT;
-	return ALIGN(min(count, max_fds), BITS_PER_LONG);
+	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
+
+	if (last == fdt->max_fds)
+		return NR_OPEN_DEFAULT;
+	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
+		last = find_last_bit(fdt->open_fds, punch_hole->from);
+		if (last == punch_hole->from)
+			return NR_OPEN_DEFAULT;
+	}
+	return ALIGN(last + 1, BITS_PER_LONG);
 }
 
 /*
- * Allocate a new files structure and copy contents from the
- * passed in files structure.
- * errorp will be valid only when the returned files_struct is NULL.
+ * Allocate a new descriptor table and copy contents from the passed in
+ * instance.  Returns a pointer to cloned table on success, ERR_PTR()
+ * on failure.  For 'punch_hole' see sane_fdtable_size().
  */
-struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
 	unsigned int open_files, i;
 	struct fdtable *old_fdt, *new_fdt;
+	int error;
 
-	*errorp = -ENOMEM;
 	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 	if (!newf)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
 	atomic_set(&newf->count, 1);
 
@@ -341,7 +327,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
-	open_files = sane_fdtable_size(old_fdt, max_fds);
+	open_files = sane_fdtable_size(old_fdt, punch_hole);
 
 	/*
 	 * Check whether we need to allocate a larger fd array and fd set.
@@ -354,14 +340,14 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 
 		new_fdt = alloc_fdtable(open_files - 1);
 		if (!new_fdt) {
-			*errorp = -ENOMEM;
+			error = -ENOMEM;
 			goto out_release;
 		}
 
 		/* beyond sysctl_nr_open; nothing to do */
 		if (unlikely(new_fdt->max_fds < open_files)) {
 			__free_fdtable(new_fdt);
-			*errorp = -EMFILE;
+			error = -EMFILE;
 			goto out_release;
 		}
 
@@ -372,7 +358,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 		 */
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
-		open_files = sane_fdtable_size(old_fdt, max_fds);
+		open_files = sane_fdtable_size(old_fdt, punch_hole);
 	}
 
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
@@ -406,8 +392,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 
 out_release:
 	kmem_cache_free(files_cachep, newf);
-out:
-	return NULL;
+	return ERR_PTR(error);
 }
 
 static struct fdtable *close_files(struct files_struct * files)
@@ -748,37 +733,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	if (fd > max_fd)
 		return -EINVAL;
 
-	if (flags & CLOSE_RANGE_UNSHARE) {
-		int ret;
-		unsigned int max_unshare_fds = NR_OPEN_MAX;
+	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
+		struct fd_range range = {fd, max_fd}, *punch_hole = &range;
 
 		/*
 		 * If the caller requested all fds to be made cloexec we always
 		 * copy all of the file descriptors since they still want to
 		 * use them.
 		 */
-		if (!(flags & CLOSE_RANGE_CLOEXEC)) {
-			/*
-			 * If the requested range is greater than the current
-			 * maximum, we're closing everything so only copy all
-			 * file descriptors beneath the lowest file descriptor.
-			 */
-			rcu_read_lock();
-			if (max_fd >= last_fd(files_fdtable(cur_fds)))
-				max_unshare_fds = fd;
-			rcu_read_unlock();
-		}
-
-		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
-		if (ret)
-			return ret;
+		if (flags & CLOSE_RANGE_CLOEXEC)
+			punch_hole = NULL;
 
+		fds = dup_fd(cur_fds, punch_hole);
+		if (IS_ERR(fds))
+			return PTR_ERR(fds);
 		/*
 		 * We used to share our file descriptor table, and have now
 		 * created a private one, make sure we're using it below.
 		 */
-		if (fds)
-			swap(cur_fds, fds);
+		swap(cur_fds, fds);
 	}
 
 	if (flags & CLOSE_RANGE_CLOEXEC)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 2944d4aa413b..b1c5722f2b3c 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -22,7 +22,6 @@
  * as this is the granularity returned by copy_fdset().
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
-#define NR_OPEN_MAX ~0U
 
 struct fdtable {
 	unsigned int max_fds;
@@ -106,7 +105,10 @@ struct task_struct;
 
 void put_files_struct(struct files_struct *fs);
 int unshare_files(void);
-struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
+struct fd_range {
+	unsigned int from, to;
+};
+struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy;
 void do_close_on_exec(struct files_struct *);
 int iterate_fd(struct files_struct *, unsigned,
 		int (*)(const void *, struct file *, unsigned),
@@ -115,8 +117,6 @@ int iterate_fd(struct files_struct *, unsigned,
 extern int close_fd(unsigned int fd);
 extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
 extern struct file *file_close_fd(unsigned int fd);
-extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
-		      struct files_struct **new_fdp);
 
 extern struct kmem_cache *files_cachep;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..6b97fb2ac4af 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1754,33 +1754,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
 		      int no_files)
 {
 	struct files_struct *oldf, *newf;
-	int error = 0;
 
 	/*
 	 * A background process may not have any files ...
 	 */
 	oldf = current->files;
 	if (!oldf)
-		goto out;
+		return 0;
 
 	if (no_files) {
 		tsk->files = NULL;
-		goto out;
+		return 0;
 	}
 
 	if (clone_flags & CLONE_FILES) {
 		atomic_inc(&oldf->count);
-		goto out;
+		return 0;
 	}
 
-	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
-	if (!newf)
-		goto out;
+	newf = dup_fd(oldf, NULL);
+	if (IS_ERR(newf))
+		return PTR_ERR(newf);
 
 	tsk->files = newf;
-	error = 0;
-out:
-	return error;
+	return 0;
 }
 
 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
@@ -3232,17 +3229,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 /*
  * Unshare file descriptor table if it is being shared
  */
-int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
-	       struct files_struct **new_fdp)
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 {
 	struct files_struct *fd = current->files;
-	int error = 0;
 
 	if ((unshare_flags & CLONE_FILES) &&
 	    (fd && atomic_read(&fd->count) > 1)) {
-		*new_fdp = dup_fd(fd, max_fds, &error);
-		if (!*new_fdp)
-			return error;
+		fd = dup_fd(fd, NULL);
+		if (IS_ERR(fd))
+			return PTR_ERR(fd);
+		*new_fdp = fd;
 	}
 
 	return 0;
@@ -3300,7 +3296,7 @@ int ksys_unshare(unsigned long unshare_flags)
 	err = unshare_fs(unshare_flags, &new_fs);
 	if (err)
 		goto bad_unshare_out;
-	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
+	err = unshare_fd(unshare_flags, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
 	err = unshare_userns(unshare_flags, &new_cred);
@@ -3392,7 +3388,7 @@ int unshare_files(void)
 	struct files_struct *old, *copy = NULL;
 	int error;
 
-	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+	error = unshare_fd(CLONE_FILES, &copy);
 	if (error || !copy)
 		return error;
Al Viro Aug. 16, 2024, 11:35 p.m. UTC | #13
On Fri, Aug 16, 2024 at 09:26:57PM +0100, Al Viro wrote:
> On Fri, Aug 16, 2024 at 11:26:10AM -0700, Linus Torvalds wrote:
> > On Fri, 16 Aug 2024 at 11:15, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > As in https://lore.kernel.org/all/20240812064427.240190-11-viro@zeniv.linux.org.uk/?
> > 
> > Heh. Ack.
> 
> Variant being tested right now:

	No regressions, AFAICT.  Pushed to
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #proposed-fix
(head at 8c86f1a4574d).  Unless somebody objects, I'm going to take that
into #fixes...
Al Viro Aug. 22, 2024, midnight UTC | #14
On Sat, Aug 17, 2024 at 12:35:21AM +0100, Al Viro wrote:
> On Fri, Aug 16, 2024 at 09:26:57PM +0100, Al Viro wrote:
> > On Fri, Aug 16, 2024 at 11:26:10AM -0700, Linus Torvalds wrote:
> > > On Fri, 16 Aug 2024 at 11:15, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > > >
> > > > As in https://lore.kernel.org/all/20240812064427.240190-11-viro@zeniv.linux.org.uk/?
> > > 
> > > Heh. Ack.
> > 
> > Variant being tested right now:
> 
> 	No regressions, AFAICT.  Pushed to
> git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #proposed-fix
> (head at 8c86f1a4574d).  Unless somebody objects, I'm going to take that
> into #fixes...

Well, since nobody has objected, in #fixes it went...
diff mbox series

Patch

diff --git a/fs/file.c b/fs/file.c
index 655338effe9c..870c0c65530b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -272,20 +272,6 @@  static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
 	return test_bit(fd, fdt->open_fds);
 }
 
-static unsigned int count_open_files(struct fdtable *fdt)
-{
-	unsigned int size = fdt->max_fds;
-	unsigned int i;
-
-	/* Find the last open fd */
-	for (i = size / BITS_PER_LONG; i > 0; ) {
-		if (fdt->open_fds[--i])
-			break;
-	}
-	i = (i + 1) * BITS_PER_LONG;
-	return i;
-}
-
 /*
  * Note that a sane fdtable size always has to be a multiple of
  * BITS_PER_LONG, since we have bitmaps that are sized by this.
@@ -299,14 +285,18 @@  static unsigned int count_open_files(struct fdtable *fdt)
  * just make that BITS_PER_LONG alignment be part of a sane
  * fdtable size. Becuase that's really what it is.
  */
-static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int *punch_hole)
 {
-	unsigned int count;
-
-	count = count_open_files(fdt);
-	if (max_fds < NR_OPEN_DEFAULT)
-		max_fds = NR_OPEN_DEFAULT;
-	return ALIGN(min(count, max_fds), BITS_PER_LONG);
+	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
+
+	if (last == fdt->max_fds)	// empty
+		return NR_OPEN_DEFAULT;
+	if (punch_hole && punch_hole[1] >= last) {
+		last = find_last_bit(fdt->open_fds, punch_hole[0]);
+		if (last == punch_hole[0])
+			return NR_OPEN_DEFAULT;
+	}
+	return ALIGN(last + 1, BITS_PER_LONG);
 }
 
 /*
@@ -314,7 +304,7 @@  static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
  * passed in files structure.
  * errorp will be valid only when the returned files_struct is NULL.
  */
-struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int *punch_hole, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
@@ -341,7 +331,7 @@  struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
-	open_files = sane_fdtable_size(old_fdt, max_fds);
+	open_files = sane_fdtable_size(old_fdt, punch_hole);
 
 	/*
 	 * Check whether we need to allocate a larger fd array and fd set.
@@ -372,7 +362,7 @@  struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 		 */
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
-		open_files = sane_fdtable_size(old_fdt, max_fds);
+		open_files = sane_fdtable_size(old_fdt, punch_hole);
 	}
 
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
@@ -748,37 +738,26 @@  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	if (fd > max_fd)
 		return -EINVAL;
 
-	if (flags & CLOSE_RANGE_UNSHARE) {
+	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
 		int ret;
-		unsigned int max_unshare_fds = NR_OPEN_MAX;
+		unsigned int range[2] = {fd, max_fd}, *punch_hole = range;
 
 		/*
 		 * If the caller requested all fds to be made cloexec we always
 		 * copy all of the file descriptors since they still want to
 		 * use them.
 		 */
-		if (!(flags & CLOSE_RANGE_CLOEXEC)) {
-			/*
-			 * If the requested range is greater than the current
-			 * maximum, we're closing everything so only copy all
-			 * file descriptors beneath the lowest file descriptor.
-			 */
-			rcu_read_lock();
-			if (max_fd >= last_fd(files_fdtable(cur_fds)))
-				max_unshare_fds = fd;
-			rcu_read_unlock();
-		}
+		if (flags & CLOSE_RANGE_CLOEXEC)
+			punch_hole = NULL;
 
-		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
-		if (ret)
+		fds = dup_fd(cur_fds, punch_hole, &ret);
+		if (!fds)
 			return ret;
-
 		/*
 		 * We used to share our file descriptor table, and have now
 		 * created a private one, make sure we're using it below.
 		 */
-		if (fds)
-			swap(cur_fds, fds);
+		swap(cur_fds, fds);
 	}
 
 	if (flags & CLOSE_RANGE_CLOEXEC)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 2944d4aa413b..6bc4353f919e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -22,7 +22,6 @@ 
  * as this is the granularity returned by copy_fdset().
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
-#define NR_OPEN_MAX ~0U
 
 struct fdtable {
 	unsigned int max_fds;
@@ -106,7 +105,7 @@  struct task_struct;
 
 void put_files_struct(struct files_struct *fs);
 int unshare_files(void);
-struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
+struct files_struct *dup_fd(struct files_struct *, unsigned *, int *) __latent_entropy;
 void do_close_on_exec(struct files_struct *);
 int iterate_fd(struct files_struct *, unsigned,
 		int (*)(const void *, struct file *, unsigned),
@@ -115,8 +114,7 @@  int iterate_fd(struct files_struct *, unsigned,
 extern int close_fd(unsigned int fd);
 extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
 extern struct file *file_close_fd(unsigned int fd);
-extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
-		      struct files_struct **new_fdp);
+extern int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp);
 
 extern struct kmem_cache *files_cachep;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..a7c905f06048 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1773,7 +1773,7 @@  static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
 		goto out;
 	}
 
-	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
+	newf = dup_fd(oldf, NULL, &error);
 	if (!newf)
 		goto out;
 
@@ -3232,15 +3232,14 @@  static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 /*
  * Unshare file descriptor table if it is being shared
  */
-int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
-	       struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 {
 	struct files_struct *fd = current->files;
 	int error = 0;
 
 	if ((unshare_flags & CLONE_FILES) &&
 	    (fd && atomic_read(&fd->count) > 1)) {
-		*new_fdp = dup_fd(fd, max_fds, &error);
+		*new_fdp = dup_fd(fd, NULL, &error);
 		if (!*new_fdp)
 			return error;
 	}
@@ -3300,7 +3299,7 @@  int ksys_unshare(unsigned long unshare_flags)
 	err = unshare_fs(unshare_flags, &new_fs);
 	if (err)
 		goto bad_unshare_out;
-	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
+	err = unshare_fd(unshare_flags, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
 	err = unshare_userns(unshare_flags, &new_cred);
@@ -3392,7 +3391,7 @@  int unshare_files(void)
 	struct files_struct *old, *copy = NULL;
 	int error;
 
-	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+	error = unshare_fd(CLONE_FILES, &copy);
 	if (error || !copy)
 		return error;