
[4/4] mm: swapoff: shmem_unuse() stop eviction without igrab()

Message ID: alpine.LSU.2.11.1904081259400.1523@eggly.anvils
State: New, archived
Series: mm: swapoff: fixes for 5.1-rc

Commit Message

Hugh Dickins April 8, 2019, 8:01 p.m. UTC
The igrab() in shmem_unuse() looks good, but we forgot that it gives no
protection against concurrent unmounting: a point made by Konstantin
Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78
("tmpfs: fix race between umount and swapoff"). The current 5.1-rc
swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs.
Self-destruct in 5 seconds.  Have a nice day..." followed by GPF.

Once again, give up on using igrab(); but don't go back to making such
heavy-handed use of shmem_swaplist_mutex as last time: that would spoil
the new design, and I expect could deadlock inside shmem_swapin_page().

Instead, shmem_unuse() just raises a "stop_eviction" count in the shmem-
specific inode, and shmem_evict_inode() waits for that to go down to 0.
Call it "stop_eviction" rather than "swapoff_busy" because it can be
put to use for others later (huge tmpfs patches expect to use it).
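
In sketch form, the pattern pairs that atomic count with the kernel's
wait_var_event()/wake_up_var() primitives (condensed from the patch below;
the swaplist locking and the recheck under the mutex are elided):

	/* holder side, in shmem_unuse(): pin the inode against eviction */
	atomic_inc(&info->stop_eviction);
	/* ... search the inode for swap, shmem_swaplist_mutex dropped ... */
	if (atomic_dec_and_test(&info->stop_eviction))
		wake_up_var(&info->stop_eviction);	/* last holder wakes evictor */

	/* evictor side, in shmem_evict_inode(): sleep until no holders remain */
	wait_var_event(&info->stop_eviction,
		       !atomic_read(&info->stop_eviction));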

That simplifies shmem_unuse(), protecting it from both unlink and unmount;
and in practice lets it locate all the swap in its first try.  But do not
rely on that: there's still a theoretical case, when shmem_writepage()
might have been preempted after its get_swap_page(), before making the
swap entry visible to swapoff.
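
Concretely, the window is in shmem_writepage(); heavily condensed here
(swaplist handling, accounting and error paths elided from the 5.1-era code):

	swap = get_swap_page(page);	/* slot allocated: si->inuse_pages raised */
	/*
	 * <-- if preempted here, no pte and no shmem page-cache entry yet
	 * refers to the slot: shmem_unuse() finds nothing to do, while
	 * try_to_unuse() still sees si->inuse_pages != 0, so it retries.
	 */
	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		/* ...accounting elided... then the swap entry replaces the page: */
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
		/* from here on the entry is visible to shmem_unuse_inode() */
	}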

Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity")
Signed-off-by: Hugh Dickins <hughd@google.com>
---

 include/linux/shmem_fs.h |    1 +
 mm/shmem.c               |   39 ++++++++++++++++++---------------------
 mm/swapfile.c            |   11 +++++------
 3 files changed, 24 insertions(+), 27 deletions(-)

Comments

Konstantin Khlebnikov April 9, 2019, 7:50 a.m. UTC | #1
On 08.04.2019 23:01, Hugh Dickins wrote:
> The igrab() in shmem_unuse() looks good, but we forgot that it gives no
> protection against concurrent unmounting: a point made by Konstantin
> Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78
> ("tmpfs: fix race between umount and swapoff"). The current 5.1-rc
> swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs.
> Self-destruct in 5 seconds.  Have a nice day..." followed by GPF.
> 
> Once again, give up on using igrab(); but don't go back to making such
> heavy-handed use of shmem_swaplist_mutex as last time: that would spoil
> the new design, and I expect could deadlock inside shmem_swapin_page().
> 
> Instead, shmem_unuse() just raises a "stop_eviction" count in the shmem-
> specific inode, and shmem_evict_inode() waits for that to go down to 0.
> Call it "stop_eviction" rather than "swapoff_busy" because it can be
> put to use for others later (huge tmpfs patches expect to use it).
> 
> That simplifies shmem_unuse(), protecting it from both unlink and unmount;
> and in practice lets it locate all the swap in its first try.  But do not
> rely on that: there's still a theoretical case, when shmem_writepage()
> might have been preempted after its get_swap_page(), before making the
> swap entry visible to swapoff.
> 
> Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---
> 
>   include/linux/shmem_fs.h |    1 +
>   mm/shmem.c               |   39 ++++++++++++++++++---------------------
>   mm/swapfile.c            |   11 +++++------
>   3 files changed, 24 insertions(+), 27 deletions(-)
> 
> --- 5.1-rc4/include/linux/shmem_fs.h	2019-03-17 16:18:15.181820820 -0700
> +++ linux/include/linux/shmem_fs.h	2019-04-07 19:18:43.248639711 -0700
> @@ -21,6 +21,7 @@ struct shmem_inode_info {
>   	struct list_head	swaplist;	/* chain of maybes on swap */
>   	struct shared_policy	policy;		/* NUMA memory alloc policy */
>   	struct simple_xattrs	xattrs;		/* list of xattrs */
> +	atomic_t		stop_eviction;	/* hold when working on inode */
>   	struct inode		vfs_inode;
>   };
>   
> --- 5.1-rc4/mm/shmem.c	2019-04-07 19:12:23.603858531 -0700
> +++ linux/mm/shmem.c	2019-04-07 19:18:43.248639711 -0700
> @@ -1081,9 +1081,15 @@ static void shmem_evict_inode(struct ino
>   			}
>   			spin_unlock(&sbinfo->shrinklist_lock);
>   		}
> -		if (!list_empty(&info->swaplist)) {
> +		while (!list_empty(&info->swaplist)) {
> +			/* Wait while shmem_unuse() is scanning this inode... */
> +			wait_var_event(&info->stop_eviction,
> +				       !atomic_read(&info->stop_eviction));
>   			mutex_lock(&shmem_swaplist_mutex);

>   			list_del_init(&info->swaplist);

Obviously, the line above should be deleted.

> +			/* ...but beware of the race if we peeked too early */
> +			if (!atomic_read(&info->stop_eviction))
> +				list_del_init(&info->swaplist);
>   			mutex_unlock(&shmem_swaplist_mutex);
>   		}
>   	}
> @@ -1227,36 +1233,27 @@ int shmem_unuse(unsigned int type, bool
>   		unsigned long *fs_pages_to_unuse)
>   {
>   	struct shmem_inode_info *info, *next;
> -	struct inode *inode;
> -	struct inode *prev_inode = NULL;
>   	int error = 0;
>   
>   	if (list_empty(&shmem_swaplist))
>   		return 0;
>   
>   	mutex_lock(&shmem_swaplist_mutex);
> -
> -	/*
> -	 * The extra refcount on the inode is necessary to safely dereference
> -	 * p->next after re-acquiring the lock. New shmem inodes with swap
> -	 * get added to the end of the list and we will scan them all.
> -	 */
>   	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
>   		if (!info->swapped) {
>   			list_del_init(&info->swaplist);
>   			continue;
>   		}
> -
> -		inode = igrab(&info->vfs_inode);
> -		if (!inode)
> -			continue;
> -
> +		/*
> +		 * Drop the swaplist mutex while searching the inode for swap;
> +		 * but before doing so, make sure shmem_evict_inode() will not
> +		 * remove placeholder inode from swaplist, nor let it be freed
> +		 * (igrab() would protect from unlink, but not from unmount).
> +		 */
> +		atomic_inc(&info->stop_eviction);
>   		mutex_unlock(&shmem_swaplist_mutex);
> -		if (prev_inode)
> -			iput(prev_inode);
> -		prev_inode = inode;
>   
> -		error = shmem_unuse_inode(inode, type, frontswap,
> +		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
>   					  fs_pages_to_unuse);
>   		cond_resched();
>   
> @@ -1264,14 +1261,13 @@ int shmem_unuse(unsigned int type, bool
>   		next = list_next_entry(info, swaplist);
>   		if (!info->swapped)
>   			list_del_init(&info->swaplist);
> +		if (atomic_dec_and_test(&info->stop_eviction))
> +			wake_up_var(&info->stop_eviction);
>   		if (error)
>   			break;
>   	}
>   	mutex_unlock(&shmem_swaplist_mutex);
>   
> -	if (prev_inode)
> -		iput(prev_inode);
> -
>   	return error;
>   }
>   
> @@ -2238,6 +2234,7 @@ static struct inode *shmem_get_inode(str
>   		info = SHMEM_I(inode);
>   		memset(info, 0, (char *)inode - (char *)info);
>   		spin_lock_init(&info->lock);
> +		atomic_set(&info->stop_eviction, 0);
>   		info->seals = F_SEAL_SEAL;
>   		info->flags = flags & VM_NORESERVE;
>   		INIT_LIST_HEAD(&info->shrinklist);
> --- 5.1-rc4/mm/swapfile.c	2019-04-07 19:17:13.291957539 -0700
> +++ linux/mm/swapfile.c	2019-04-07 19:18:43.248639711 -0700
> @@ -2116,12 +2116,11 @@ retry:
>   	 * Under global memory pressure, swap entries can be reinserted back
>   	 * into process space after the mmlist loop above passes over them.
>   	 *
> -	 * Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
> -	 * a shmem inode using swap is being evicted; and when mmget_not_zero()
> -	 * above fails, that mm is likely to be freeing swap from exit_mmap().
> -	 * Both proceed at their own independent pace: we could move them to
> -	 * separate lists, and wait for those lists to be emptied; but it's
> -	 * easier and more robust (though cpu-intensive) just to keep retrying.
> +	 * Limit the number of retries? No: when mmget_not_zero() above fails,
> +	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
> +	 * at its own independent pace; and even shmem_writepage() could have
> +	 * been preempted after get_swap_page(), temporarily hiding that swap.
> +	 * It's easy and robust (though cpu-intensive) just to keep retrying.
>   	 */
>   	if (si->inuse_pages) {
>   		if (!signal_pending(current))
>
Hugh Dickins April 9, 2019, 6:43 p.m. UTC | #2
On Tue, 9 Apr 2019, Konstantin Khlebnikov wrote:
> On 08.04.2019 23:01, Hugh Dickins wrote:
> > -		if (!list_empty(&info->swaplist)) {
> > +		while (!list_empty(&info->swaplist)) {
> > +			/* Wait while shmem_unuse() is scanning this inode... */
> > +			wait_var_event(&info->stop_eviction,
> > +				       !atomic_read(&info->stop_eviction));
> >   			mutex_lock(&shmem_swaplist_mutex);
> >   			list_del_init(&info->swaplist);
> 
> Obviously, the line above should be deleted.

Definitely. Worryingly stupid. I guess I left it behind while translating
from an earlier tree.  Many thanks for catching that in time, Konstantin.
I've rechecked the rest of this patch, and the others, and didn't find
anything else as stupid.

Andrew, please add this fixup for folding in - thanks:

[PATCH] mm: swapoff: shmem_unuse() stop eviction without igrab() fix

Fix my stupidity, thankfully caught by Konstantin.

Signed-off-by: Hugh Dickins <hughd@google.com>
---
Fix to fold into mm-swapoff-shmem_unuse-stop-eviction-without-igrab.patch

 mm/shmem.c |    1 -
 1 file changed, 1 deletion(-)

--- patch4/mm/shmem.c	2019-04-07 19:18:43.248639711 -0700
+++ patch5/mm/shmem.c	2019-04-09 11:24:32.745337734 -0700
@@ -1086,7 +1086,6 @@ static void shmem_evict_inode(struct ino
 			wait_var_event(&info->stop_eviction,
 				       !atomic_read(&info->stop_eviction));
 			mutex_lock(&shmem_swaplist_mutex);
-			list_del_init(&info->swaplist);
 			/* ...but beware of the race if we peeked too early */
 			if (!atomic_read(&info->stop_eviction))
 				list_del_init(&info->swaplist);

Patch

--- 5.1-rc4/include/linux/shmem_fs.h	2019-03-17 16:18:15.181820820 -0700
+++ linux/include/linux/shmem_fs.h	2019-04-07 19:18:43.248639711 -0700
@@ -21,6 +21,7 @@  struct shmem_inode_info {
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
+	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct inode		vfs_inode;
 };
 
--- 5.1-rc4/mm/shmem.c	2019-04-07 19:12:23.603858531 -0700
+++ linux/mm/shmem.c	2019-04-07 19:18:43.248639711 -0700
@@ -1081,9 +1081,15 @@  static void shmem_evict_inode(struct ino
 			}
 			spin_unlock(&sbinfo->shrinklist_lock);
 		}
-		if (!list_empty(&info->swaplist)) {
+		while (!list_empty(&info->swaplist)) {
+			/* Wait while shmem_unuse() is scanning this inode... */
+			wait_var_event(&info->stop_eviction,
+				       !atomic_read(&info->stop_eviction));
 			mutex_lock(&shmem_swaplist_mutex);
 			list_del_init(&info->swaplist);
+			/* ...but beware of the race if we peeked too early */
+			if (!atomic_read(&info->stop_eviction))
+				list_del_init(&info->swaplist);
 			mutex_unlock(&shmem_swaplist_mutex);
 		}
 	}
@@ -1227,36 +1233,27 @@  int shmem_unuse(unsigned int type, bool
 		unsigned long *fs_pages_to_unuse)
 {
 	struct shmem_inode_info *info, *next;
-	struct inode *inode;
-	struct inode *prev_inode = NULL;
 	int error = 0;
 
 	if (list_empty(&shmem_swaplist))
 		return 0;
 
 	mutex_lock(&shmem_swaplist_mutex);
-
-	/*
-	 * The extra refcount on the inode is necessary to safely dereference
-	 * p->next after re-acquiring the lock. New shmem inodes with swap
-	 * get added to the end of the list and we will scan them all.
-	 */
 	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
 		if (!info->swapped) {
 			list_del_init(&info->swaplist);
 			continue;
 		}
-
-		inode = igrab(&info->vfs_inode);
-		if (!inode)
-			continue;
-
+		/*
+		 * Drop the swaplist mutex while searching the inode for swap;
+		 * but before doing so, make sure shmem_evict_inode() will not
+		 * remove placeholder inode from swaplist, nor let it be freed
+		 * (igrab() would protect from unlink, but not from unmount).
+		 */
+		atomic_inc(&info->stop_eviction);
 		mutex_unlock(&shmem_swaplist_mutex);
-		if (prev_inode)
-			iput(prev_inode);
-		prev_inode = inode;
 
-		error = shmem_unuse_inode(inode, type, frontswap,
+		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
 					  fs_pages_to_unuse);
 		cond_resched();
 
@@ -1264,14 +1261,13 @@  int shmem_unuse(unsigned int type, bool
 		next = list_next_entry(info, swaplist);
 		if (!info->swapped)
 			list_del_init(&info->swaplist);
+		if (atomic_dec_and_test(&info->stop_eviction))
+			wake_up_var(&info->stop_eviction);
 		if (error)
 			break;
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (prev_inode)
-		iput(prev_inode);
-
 	return error;
 }
 
@@ -2238,6 +2234,7 @@  static struct inode *shmem_get_inode(str
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
+		atomic_set(&info->stop_eviction, 0);
 		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->shrinklist);
--- 5.1-rc4/mm/swapfile.c	2019-04-07 19:17:13.291957539 -0700
+++ linux/mm/swapfile.c	2019-04-07 19:18:43.248639711 -0700
@@ -2116,12 +2116,11 @@  retry:
 	 * Under global memory pressure, swap entries can be reinserted back
 	 * into process space after the mmlist loop above passes over them.
 	 *
-	 * Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
-	 * a shmem inode using swap is being evicted; and when mmget_not_zero()
-	 * above fails, that mm is likely to be freeing swap from exit_mmap().
-	 * Both proceed at their own independent pace: we could move them to
-	 * separate lists, and wait for those lists to be emptied; but it's
-	 * easier and more robust (though cpu-intensive) just to keep retrying.
+	 * Limit the number of retries? No: when mmget_not_zero() above fails,
+	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+	 * at its own independent pace; and even shmem_writepage() could have
+	 * been preempted after get_swap_page(), temporarily hiding that swap.
+	 * It's easy and robust (though cpu-intensive) just to keep retrying.
 	 */
 	if (si->inuse_pages) {
 		if (!signal_pending(current))