diff mbox

[05/13] vfs: Replace array of file pointers with an IDR

Message ID 3f984d2111ae4856bf5183173b0f9fcaaefed8c0.1493315290.git.bankarsandhya512@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Sandhya Bankar April 27, 2017, 7:08 p.m. UTC
Instead of storing all the file pointers in a single array, use an
IDR.  It is RCU-safe, and does not need to be reallocated when the
fd array grows.  It also handles allocation of new file descriptors.

Signed-off-by: Sandhya Bankar <bankarsandhya512@gmail.com>
[mawilcox@microsoft.com: fixes]
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 fs/file.c               | 180 ++++++++++++++++++++----------------------------
 include/linux/fdtable.h |  10 +--
 2 files changed, 79 insertions(+), 111 deletions(-)

Comments

Mateusz Guzik Oct. 4, 2017, 3:45 p.m. UTC | #1
On Tue, Jul 11, 2017 at 06:50:52PM +0530, Sandhya Bankar wrote:
> Instead of storing all the file pointers in a single array, use an
> IDR.  It is RCU-safe, and does not need to be reallocated when the
> fd array grows.  It also handles allocation of new file descriptors.
> 
> ---
>  
[snip]
> @@ -604,22 +576,9 @@ void put_unused_fd(unsigned int fd)
>  void __fd_install(struct files_struct *files, unsigned int fd,
>  		struct file *file)
>  {
> -	struct fdtable *fdt;
> -
> -	might_sleep();
> -	rcu_read_lock_sched();
> -
> -	while (unlikely(files->resize_in_progress)) {
> -		rcu_read_unlock_sched();
> -		wait_event(files->resize_wait, !files->resize_in_progress);
> -		rcu_read_lock_sched();
> -	}
> -	/* coupled with smp_wmb() in expand_fdtable() */
> -	smp_rmb();
> -	fdt = rcu_dereference_sched(files->fdt);
> -	BUG_ON(fdt->fd[fd] != NULL);
> -	rcu_assign_pointer(fdt->fd[fd], file);
> -	rcu_read_unlock_sched();
> +	rcu_read_lock();
> +	BUG_ON(idr_replace(&files->fd_idr, file, fd));
> +	rcu_read_unlock();
>  }
>  
>  void fd_install(unsigned int fd, struct file *file)
> @@ -641,10 +600,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
>  	fdt = files_fdtable(files);
>  	if (fd >= fdt->max_fds)
>  		goto out_unlock;
> -	file = fdt->fd[fd];
> +	file = idr_remove(&files->fd_idr, fd);
>  	if (!file)
>  		goto out_unlock;
> -	rcu_assign_pointer(fdt->fd[fd], NULL);
>  	__clear_close_on_exec(fd, fdt);
>  	__put_unused_fd(files, fd);
>  	spin_unlock(&files->file_lock);

I have no opinions about the switch, however these 2 places make me
worried. I did not check all the changes so perhaps I missed something.

In the current code we are safe when it comes to concurrent install and
close, in particular here:

CPU0		CPU1
alloc_fd
		__close_fd
fd_install

__close_fd will either see a NULL pointer and return -EBADF or will see
an installed pointer and proceed with the close.

Your proposed patch seems to be buggy in this regard.

You call idr_remove, which from what I understand will free up the slot
no matter what. You only detect an error based on whether there was a
non-NULL pointer there or not. If so, fd_install can proceed to play
with a deallocated entry.
diff mbox

Patch

diff --git a/fs/file.c b/fs/file.c
index ad6f094..1c000d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -47,7 +47,6 @@  static void *alloc_fdmem(size_t size)
 
 static void __free_fdtable(struct fdtable *fdt)
 {
-	kvfree(fdt->fd);
 	kvfree(fdt->open_fds);
 	kfree(fdt);
 }
@@ -89,15 +88,7 @@  static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
  */
 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 {
-	unsigned int cpy, set;
-
 	BUG_ON(nfdt->max_fds < ofdt->max_fds);
-
-	cpy = ofdt->max_fds * sizeof(struct file *);
-	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
-	memcpy(nfdt->fd, ofdt->fd, cpy);
-	memset((char *)nfdt->fd + cpy, 0, set);
-
 	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
 }
 
@@ -131,15 +122,11 @@  static struct fdtable * alloc_fdtable(unsigned int nr)
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
-	data = alloc_fdmem(nr * sizeof(struct file *));
-	if (!data)
-		goto out_fdt;
-	fdt->fd = data;
 
 	data = alloc_fdmem(max_t(size_t,
 				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
 	if (!data)
-		goto out_arr;
+		goto out_fdt;
 	fdt->open_fds = data;
 	data += nr / BITS_PER_BYTE;
 	fdt->close_on_exec = data;
@@ -148,8 +135,6 @@  static struct fdtable * alloc_fdtable(unsigned int nr)
 
 	return fdt;
 
-out_arr:
-	kvfree(fdt->fd);
 out_fdt:
 	kfree(fdt);
 out:
@@ -170,6 +155,7 @@  static int expand_fdtable(struct files_struct *files, unsigned int nr)
 	struct fdtable *new_fdt, *cur_fdt;
 
 	spin_unlock(&files->file_lock);
+	idr_preload_end();
 	new_fdt = alloc_fdtable(nr);
 
 	/* make sure all __fd_install() have seen resize_in_progress
@@ -178,6 +164,7 @@  static int expand_fdtable(struct files_struct *files, unsigned int nr)
 	if (atomic_read(&files->count) > 1)
 		synchronize_sched();
 
+	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
@@ -228,8 +215,10 @@  static int expand_files(struct files_struct *files, unsigned int nr)
 
 	if (unlikely(files->resize_in_progress)) {
 		spin_unlock(&files->file_lock);
+		idr_preload_end();
 		expanded = 1;
 		wait_event(files->resize_wait, !files->resize_in_progress);
+		idr_preload(GFP_KERNEL);
 		spin_lock(&files->file_lock);
 		goto repeat;
 	}
@@ -290,8 +279,8 @@  static unsigned int count_open_files(struct fdtable *fdt)
 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
 	struct files_struct *newf;
-	struct file **old_fds, **new_fds;
 	unsigned int open_files, i;
+	struct file *f;
 	struct fdtable *old_fdt, *new_fdt;
 
 	*errorp = -ENOMEM;
@@ -302,6 +291,7 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	idr_init(&newf->fd_idr);
 	newf->resize_in_progress = false;
 	init_waitqueue_head(&newf->resize_wait);
 	newf->next_fd = 0;
@@ -310,8 +300,9 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	new_fdt->close_on_exec = newf->close_on_exec_init;
 	new_fdt->open_fds = newf->open_fds_init;
 	new_fdt->full_fds_bits = newf->full_fds_bits_init;
-	new_fdt->fd = &newf->fd_array[0];
 
+restart:
+	idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
 	open_files = count_open_files(old_fdt);
@@ -321,6 +312,7 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	 */
 	while (unlikely(open_files > new_fdt->max_fds)) {
 		spin_unlock(&oldf->file_lock);
+		idr_preload_end();
 
 		if (new_fdt != &newf->fdtab)
 			__free_fdtable(new_fdt);
@@ -343,41 +335,50 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		 * who knows it may have a new bigger fd table. We need
 		 * the latest pointer.
 		 */
+		idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
 		open_files = count_open_files(old_fdt);
 	}
 
+	if (!idr_check_preload(&oldf->fd_idr)) {
+		spin_unlock(&oldf->file_lock);
+		idr_preload_end();
+		goto restart;
+	}
+
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
 
-	old_fds = old_fdt->fd;
-	new_fds = new_fdt->fd;
-
-	for (i = open_files; i != 0; i--) {
-		struct file *f = *old_fds++;
-		if (f) {
-			get_file(f);
-		} else {
-			/*
-			 * The fd may be claimed in the fd bitmap but not yet
-			 * instantiated in the files array if a sibling thread
-			 * is partway through open().  So make sure that this
-			 * fd is available to the new process.
-			 */
-			__clear_open_fd(open_files - i, new_fdt);
+	idr_for_each_entry(&oldf->fd_idr, f, i) {
+		int err;
+
+		get_file(f);
+		err = idr_alloc(&newf->fd_idr, f, i, i + 1, GFP_NOWAIT);
+		if (WARN(err != i, "Could not allocate %d: %d", i, err)) {
+			spin_unlock(&oldf->file_lock);
+			goto out;
 		}
-		rcu_assign_pointer(*new_fds++, f);
 	}
+
 	spin_unlock(&oldf->file_lock);
+	idr_preload_end();
 
-	/* clear the remainder */
-	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
+	/*
+	 * The fd may be claimed in the fd bitmap but not yet
+	 * instantiated in the files array if a sibling thread
+	 * is partway through open().
+	 */
+	for_each_set_bit(i, new_fdt->open_fds, new_fdt->max_fds) {
+		if (!idr_find(&newf->fd_idr, i))
+			__clear_bit(i, new_fdt->open_fds);
+	}
 
 	rcu_assign_pointer(newf->fdt, new_fdt);
 
 	return newf;
 
 out_release:
+	idr_destroy(&newf->fd_idr);
 	kmem_cache_free(files_cachep, newf);
 out:
 	return NULL;
@@ -401,7 +402,8 @@  static struct fdtable *close_files(struct files_struct * files)
 		set = fdt->open_fds[j++];
 		while (set) {
 			if (set & 1) {
-				struct file * file = xchg(&fdt->fd[i], NULL);
+				struct file *file;
+				file = idr_remove(&files->fd_idr, i);
 				if (file) {
 					filp_close(file, files);
 					cond_resched_rcu_qs();
@@ -469,28 +471,14 @@  struct files_struct init_files = {
 	.fdt		= &init_files.fdtab,
 	.fdtab		= {
 		.max_fds	= NR_OPEN_DEFAULT,
-		.fd		= &init_files.fd_array[0],
 		.close_on_exec	= init_files.close_on_exec_init,
 		.open_fds	= init_files.open_fds_init,
 		.full_fds_bits	= init_files.full_fds_bits_init,
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+	.fd_idr		= IDR_INIT,
 };
 
-static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
-{
-	unsigned int maxfd = fdt->max_fds;
-	unsigned int maxbit = maxfd / BITS_PER_LONG;
-	unsigned int bitbit = start / BITS_PER_LONG;
-
-	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
-	if (bitbit > maxfd)
-		return maxfd;
-	if (bitbit > start)
-		start = bitbit;
-	return find_next_zero_bit(fdt->open_fds, maxfd, start);
-}
-
 /*
  * allocate a file descriptor, mark it busy.
  */
@@ -501,54 +489,37 @@  int __alloc_fd(struct files_struct *files,
 	int error;
 	struct fdtable *fdt;
 
+	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
-repeat:
-	fdt = files_fdtable(files);
-	fd = start;
-	if (fd < files->next_fd)
-		fd = files->next_fd;
 
-	if (fd < fdt->max_fds)
-		fd = find_next_fd(fdt, fd);
-
-	/*
-	 * N.B. For clone tasks sharing a files structure, this test
-	 * will limit the total number of files that can be opened.
-	 */
-	error = -EMFILE;
-	if (fd >= end)
+	error = idr_alloc(&files->fd_idr, NULL, start, end, GFP_NOWAIT);
+	if (error == -ENOSPC) {
+		error = -EMFILE;
 		goto out;
+	}
+	BUG_ON(error < 0);
+	fd = error;
 
 	error = expand_files(files, fd);
-	if (error < 0)
+	if (error < 0) {
+		idr_remove(&files->fd_idr, fd);
 		goto out;
-
-	/*
-	 * If we needed to expand the fs array we
-	 * might have blocked - try again.
-	 */
-	if (error)
-		goto repeat;
+	}
 
 	if (start <= files->next_fd)
 		files->next_fd = fd + 1;
 
+	fdt = files_fdtable(files);
 	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
 		__set_close_on_exec(fd, fdt);
 	else
 		__clear_close_on_exec(fd, fdt);
 	error = fd;
-#if 1
-	/* Sanity check */
-	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
-		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
-		rcu_assign_pointer(fdt->fd[fd], NULL);
-	}
-#endif
 
 out:
 	spin_unlock(&files->file_lock);
+	idr_preload_end();
 	return error;
 }
 
@@ -575,6 +546,7 @@  void put_unused_fd(unsigned int fd)
 {
 	struct files_struct *files = current->files;
 	spin_lock(&files->file_lock);
+	BUG_ON(idr_remove(&files->fd_idr, fd));
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
 }
@@ -604,22 +576,9 @@  void put_unused_fd(unsigned int fd)
 void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
-	struct fdtable *fdt;
-
-	might_sleep();
-	rcu_read_lock_sched();
-
-	while (unlikely(files->resize_in_progress)) {
-		rcu_read_unlock_sched();
-		wait_event(files->resize_wait, !files->resize_in_progress);
-		rcu_read_lock_sched();
-	}
-	/* coupled with smp_wmb() in expand_fdtable() */
-	smp_rmb();
-	fdt = rcu_dereference_sched(files->fdt);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	rcu_read_unlock_sched();
+	rcu_read_lock();
+	BUG_ON(idr_replace(&files->fd_idr, file, fd));
+	rcu_read_unlock();
 }
 
 void fd_install(unsigned int fd, struct file *file)
@@ -641,10 +600,9 @@  int __close_fd(struct files_struct *files, unsigned fd)
 	fdt = files_fdtable(files);
 	if (fd >= fdt->max_fds)
 		goto out_unlock;
-	file = fdt->fd[fd];
+	file = idr_remove(&files->fd_idr, fd);
 	if (!file)
 		goto out_unlock;
-	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__clear_close_on_exec(fd, fdt);
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
@@ -676,10 +634,9 @@  void do_close_on_exec(struct files_struct *files)
 			struct file *file;
 			if (!(set & 1))
 				continue;
-			file = fdt->fd[fd];
+			file = idr_remove(&files->fd_idr, fd);
 			if (!file)
 				continue;
-			rcu_assign_pointer(fdt->fd[fd], NULL);
 			__put_unused_fd(files, fd);
 			spin_unlock(&files->file_lock);
 			filp_close(file, files);
@@ -842,17 +799,27 @@  static int do_dup2(struct files_struct *files,
 	 * tables and this condition does not arise without those.
 	 */
 	fdt = files_fdtable(files);
-	tofree = fdt->fd[fd];
+	tofree = idr_find(&files->fd_idr, fd);
 	if (!tofree && fd_is_open(fd, fdt))
 		goto Ebusy;
 	get_file(file);
-	rcu_assign_pointer(fdt->fd[fd], file);
+	if (tofree) {
+		idr_replace(&files->fd_idr, file, fd);
+	} else {
+		int err = idr_alloc(&files->fd_idr, file, fd, fd + 1,
+								GFP_NOWAIT);
+		if (err != fd) {
+			WARN(1, "do_dup2 got %d for fd %d\n", err, fd);
+			goto Ebusy;
+		}
+	}
 	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
 		__set_close_on_exec(fd, fdt);
 	else
 		__clear_close_on_exec(fd, fdt);
 	spin_unlock(&files->file_lock);
+	idr_preload_end();
 
 	if (tofree)
 		filp_close(tofree, files);
@@ -861,6 +828,7 @@  static int do_dup2(struct files_struct *files,
 
 Ebusy:
 	spin_unlock(&files->file_lock);
+	idr_preload_end();
 	return -EBUSY;
 }
 
@@ -875,6 +843,7 @@  int replace_fd(unsigned fd, struct file *file, unsigned flags)
 	if (fd >= rlimit(RLIMIT_NOFILE))
 		return -EBADF;
 
+	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
 	err = expand_files(files, fd);
 	if (unlikely(err < 0))
@@ -883,6 +852,7 @@  int replace_fd(unsigned fd, struct file *file, unsigned flags)
 
 out_unlock:
 	spin_unlock(&files->file_lock);
+	idr_preload_end();
 	return err;
 }
 
@@ -901,6 +871,7 @@  int replace_fd(unsigned fd, struct file *file, unsigned flags)
 	if (newfd >= rlimit(RLIMIT_NOFILE))
 		return -EBADF;
 
+	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
 	err = expand_files(files, newfd);
 	file = fcheck(oldfd);
@@ -917,6 +888,7 @@  int replace_fd(unsigned fd, struct file *file, unsigned flags)
 	err = -EBADF;
 out_unlock:
 	spin_unlock(&files->file_lock);
+	idr_preload_end();
 	return err;
 }
 
@@ -974,7 +946,7 @@  int iterate_fd(struct files_struct *files, unsigned n,
 	spin_lock(&files->file_lock);
 	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
 		struct file *file;
-		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+		file = __fcheck_files(files, n);
 		if (!file)
 			continue;
 		res = f(p, file, n);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 6e84b2cae..4072f24 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@ 
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/idr.h>
 
 #include <linux/atomic.h>
 
@@ -23,7 +24,6 @@ 
 
 struct fdtable {
 	unsigned int max_fds;
-	struct file __rcu **fd;      /* current fd array */
 	unsigned long *close_on_exec;
 	unsigned long *open_fds;
 	unsigned long *full_fds_bits;
@@ -51,6 +51,7 @@  struct files_struct {
 	bool resize_in_progress;
 	wait_queue_head_t resize_wait;
 
+	struct idr fd_idr;
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
   /*
@@ -61,7 +62,6 @@  struct files_struct {
 	unsigned long close_on_exec_init[1];
 	unsigned long open_fds_init[1];
 	unsigned long full_fds_bits_init[1];
-	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
 };
 
 struct file_operations;
@@ -79,11 +79,7 @@  struct files_struct {
  */
 static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd)
 {
-	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
-
-	if (fd < fdt->max_fds)
-		return rcu_dereference_raw(fdt->fd[fd]);
-	return NULL;
+	return idr_find(&files->fd_idr, fd);
 }
 
 static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)