diff mbox

fsnotify: Avoid spurious EMFILE errors from inotify_init()

Message ID 1461743762-3520-1-git-send-email-jack@suse.cz (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Kara April 27, 2016, 7:56 a.m. UTC
Inotify instance is destroyed when all references to it are dropped.
That not only means that the corresponding file descriptor needs to be
closed but also that all corresponding instance marks are freed (as each
mark holds a reference to the inotify instance). However marks are freed
only after SRCU period ends which can take some time and thus if
user rapidly creates and frees inotify instances, number of existing
inotify instances can exceed max_user_instances limit although from user
point of view there is always at most one existing instance. Thus
inotify_init() returns EMFILE error which is hard to justify from user
point of view. This problem is exposed by LTP inotify06 testcase on some
machines.

We fix the problem by making sure all group marks are properly freed
while destroying inotify instance. We wait for SRCU period to end in
that path anyway since we have to make sure there is no event being
added to the instance while we are tearing down the instance. So it
takes only some plumbing to allow for marks to be destroyed in that path
as well and not from a dedicated work item.

Reported-and-tested-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fsnotify.h             |  5 +++
 fs/notify/group.c                | 17 ++++++---
 fs/notify/mark.c                 | 78 +++++++++++++++++++++++++++++++---------
 include/linux/fsnotify_backend.h |  2 --
 4 files changed, 79 insertions(+), 23 deletions(-)

Andrew, can you please merge this patch? Thanks!

Comments

Jeff Layton April 27, 2016, 12:30 p.m. UTC | #1
On Wed, 2016-04-27 at 09:56 +0200, Jan Kara wrote:
> Inotify instance is destroyed when all references to it are dropped.
> That not only means that the corresponding file descriptor needs to be
> closed but also that all corresponding instance marks are freed (as each
> mark holds a reference to the inotify instance). However marks are freed
> only after SRCU period ends which can take some time and thus if
> user rapidly creates and frees inotify instances, number of existing
> inotify instances can exceed max_user_instances limit although from user
> point of view there is always at most one existing instance. Thus
> inotify_init() returns EMFILE error which is hard to justify from user
> point of view. This problem is exposed by LTP inotify06 testcase on some
> machines.
> 
> We fix the problem by making sure all group marks are properly freed
> while destroying inotify instance. We wait for SRCU period to end in
> that path anyway since we have to make sure there is no event being
> added to the instance while we are tearing down the instance. So it
> takes only some plumbing to allow for marks to be destroyed in that path
> as well and not from a dedicated work item.
> 
> Reported-and-tested-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
> Signed-off-by: Jan Kara <jack@suse.cz>

Acked-by: Jeff Layton <jlayton@poochiereds.net>

> ---
>  fs/notify/fsnotify.h             |  5 +++
>  fs/notify/group.c                | 17 ++++++---
>  fs/notify/mark.c                 | 78 +++++++++++++++++++++++++++++++---------
>  include/linux/fsnotify_backend.h |  2 --
>  4 files changed, 79 insertions(+), 23 deletions(-)
> 
> Andrew, can you please merge this patch? Thanks!
> 
> diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
> index b44c68a857e7..72fb4f33982f 100644
> --- a/fs/notify/fsnotify.h
> +++ b/fs/notify/fsnotify.h
> @@ -56,6 +56,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
>  	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
>  			       &mnt->mnt_root->d_lock);
>  }
> +/* prepare for freeing all marks associated with given group */
> +extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
> +/* wait for fsnotify_mark_srcu period to end and free all marks in destroy_list */
> +extern void fsnotify_mark_destroy_list(void);
> +
>  /*
>   * update the dentry->d_flags of all of inode's children to indicate if inode cares
>   * about events that happen to its children.
> diff --git a/fs/notify/group.c b/fs/notify/group.c
> index d16b62cb2854..3e2dd85be5dd 100644
> --- a/fs/notify/group.c
> +++ b/fs/notify/group.c
> @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
>   */
>  void fsnotify_destroy_group(struct fsnotify_group *group)
>  {
> -	/* clear all inode marks for this group */
> -	fsnotify_clear_marks_by_group(group);
> +	/* clear all inode marks for this group, attach them to destroy_list */
> +	fsnotify_detach_group_marks(group);
>  
> -	synchronize_srcu(&fsnotify_mark_srcu);
> +	/*
> +	 * Wait for fsnotify_mark_srcu period to end and free all marks in
> +	 * destroy_list
> +	 */
> +	fsnotify_mark_destroy_list();
>  
> -	/* clear the notification queue of all events */
> +	/*
> +	 * Since we have waited for fsnotify_mark_srcu in
> +	 * fsnotify_mark_destroy_list() there can be no outstanding event
> +	 * notification against this group. So clearing the notification queue
> +	 * of all events is reliable now.
> +	 */
>  	fsnotify_flush_notify(group);
>  
>  	/*
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 7115c5d7d373..d3fea0bd89e2 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
>  static DEFINE_SPINLOCK(destroy_lock);
>  static LIST_HEAD(destroy_list);
>  
> -static void fsnotify_mark_destroy(struct work_struct *work);
> -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
> +static void fsnotify_mark_destroy_workfn(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
>  
>  void fsnotify_get_mark(struct fsnotify_mark *mark)
>  {
> @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>  }
>  
>  /*
> - * Free fsnotify mark. The freeing is actually happening from a kthread which
> - * first waits for srcu period end. Caller must have a reference to the mark
> - * or be protected by fsnotify_mark_srcu.
> + * Prepare mark for freeing and add it to the list of marks prepared for
> + * freeing. The actual freeing must happen after SRCU period ends and the
> + * caller is responsible for this.
> + *
> + * The function returns true if the mark was added to the list of marks for
> + * freeing. The function returns false if someone else has already called
> + * __fsnotify_free_mark() for the mark.
>   */
> -void fsnotify_free_mark(struct fsnotify_mark *mark)
> +static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
>  {
>  	struct fsnotify_group *group = mark->group;
>  
> @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
>  	/* something else already called this function on this mark */
>  	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
>  		spin_unlock(&mark->lock);
> -		return;
> +		return false;
>  	}
>  	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
>  	spin_unlock(&mark->lock);
>  
> -	spin_lock(&destroy_lock);
> -	list_add(&mark->g_list, &destroy_list);
> -	spin_unlock(&destroy_lock);
> -	queue_delayed_work(system_unbound_wq, &reaper_work,
> -				FSNOTIFY_REAPER_DELAY);
> -
>  	/*
>  	 * Some groups like to know that marks are being freed.  This is a
>  	 * callback to the group function to let it know that this mark
> @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
>  	 */
>  	if (group->ops->freeing_mark)
>  		group->ops->freeing_mark(mark, group);
> +
> +	spin_lock(&destroy_lock);
> +	list_add(&mark->g_list, &destroy_list);
> +	spin_unlock(&destroy_lock);
> +
> +	return true;
> +}
> +
> +/*
> + * Free fsnotify mark. The freeing is actually happening from a workqueue which
> + * first waits for srcu period end. Caller must have a reference to the mark
> + * or be protected by fsnotify_mark_srcu.
> + */
> +void fsnotify_free_mark(struct fsnotify_mark *mark)
> +{
> +	if (__fsnotify_free_mark(mark)) {
> +		queue_delayed_work(system_unbound_wq, &reaper_work,
> +				   FSNOTIFY_REAPER_DELAY);
> +	}
>  }
>  
>  void fsnotify_destroy_mark(struct fsnotify_mark *mark,
> @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
>  }
>  
>  /*
> - * Given a group, destroy all of the marks associated with that group.
> + * Given a group, prepare for freeing all the marks associated with that group.
> + * The marks are attached to the list of marks prepared for destruction, the
> + * caller is responsible for freeing marks in that list after SRCU period has
> + * ended.
>   */
> -void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
> +void fsnotify_detach_group_marks(struct fsnotify_group *group)
>  {
> -	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
> +	struct fsnotify_mark *mark;
> +
> +	while (1) {
> +		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
> +		if (list_empty(&group->marks_list)) {
> +			mutex_unlock(&group->mark_mutex);
> +			break;
> +		}
> +		mark = list_first_entry(&group->marks_list,
> +					struct fsnotify_mark, g_list);
> +		fsnotify_get_mark(mark);
> +		fsnotify_detach_mark(mark);
> +		mutex_unlock(&group->mark_mutex);
> +		__fsnotify_free_mark(mark);
> +		fsnotify_put_mark(mark);
> +	}
>  }
>  
>  void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
> @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
>  	mark->free_mark = free_mark;
>  }
>  
> -static void fsnotify_mark_destroy(struct work_struct *work)
> +/*
> + * Destroy all marks in destroy_list, waits for SRCU period to finish before
> + * actually freeing marks.
> + */
> +void fsnotify_mark_destroy_list(void)
>  {
>  	struct fsnotify_mark *mark, *next;
>  	struct list_head private_destroy_list;
> @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
>  		fsnotify_put_mark(mark);
>  	}
>  }
> +
> +static void fsnotify_mark_destroy_workfn(struct work_struct *work)
> +{
> +	fsnotify_mark_destroy_list();
> +}
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 1259e53d9296..29f917517299 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
>  extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
>  /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
>  extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
> -/* run all the marks in a group, and flag them to be freed */
> -extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
>  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_unmount_inodes(struct super_block *sb);
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a857e7..72fb4f33982f 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,11 @@  static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
 	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
 			       &mnt->mnt_root->d_lock);
 }
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/* wait for fsnotify_mark_srcu period to end and free all marks in destroy_list */
+extern void fsnotify_mark_destroy_list(void);
+
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62cb2854..3e2dd85be5dd 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@  static void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_destroy_group(struct fsnotify_group *group)
 {
-	/* clear all inode marks for this group */
-	fsnotify_clear_marks_by_group(group);
+	/* clear all inode marks for this group, attach them to destroy_list */
+	fsnotify_detach_group_marks(group);
 
-	synchronize_srcu(&fsnotify_mark_srcu);
+	/*
+	 * Wait for fsnotify_mark_srcu period to end and free all marks in
+	 * destroy_list
+	 */
+	fsnotify_mark_destroy_list();
 
-	/* clear the notification queue of all events */
+	/*
+	 * Since we have waited for fsnotify_mark_srcu in
+	 * fsnotify_mark_destroy_list() there can be no outstanding event
+	 * notification against this group. So clearing the notification queue
+	 * of all events is reliable now.
+	 */
 	fsnotify_flush_notify(group);
 
 	/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d7d373..d3fea0bd89e2 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@  struct srcu_struct fsnotify_mark_srcu;
 static DEFINE_SPINLOCK(destroy_lock);
 static LIST_HEAD(destroy_list);
 
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
 
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
@@ -173,11 +173,15 @@  void fsnotify_detach_mark(struct fsnotify_mark *mark)
 }
 
 /*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
  */
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
 {
 	struct fsnotify_group *group = mark->group;
 
@@ -185,17 +189,11 @@  void fsnotify_free_mark(struct fsnotify_mark *mark)
 	/* something else already called this function on this mark */
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
 		spin_unlock(&mark->lock);
-		return;
+		return false;
 	}
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 	spin_unlock(&mark->lock);
 
-	spin_lock(&destroy_lock);
-	list_add(&mark->g_list, &destroy_list);
-	spin_unlock(&destroy_lock);
-	queue_delayed_work(system_unbound_wq, &reaper_work,
-				FSNOTIFY_REAPER_DELAY);
-
 	/*
 	 * Some groups like to know that marks are being freed.  This is a
 	 * callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@  void fsnotify_free_mark(struct fsnotify_mark *mark)
 	 */
 	if (group->ops->freeing_mark)
 		group->ops->freeing_mark(mark, group);
+
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+
+	return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+	if (__fsnotify_free_mark(mark)) {
+		queue_delayed_work(system_unbound_wq, &reaper_work,
+				   FSNOTIFY_REAPER_DELAY);
+	}
 }
 
 void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@  void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
 }
 
 /*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
  */
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
 {
-	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+	struct fsnotify_mark *mark;
+
+	while (1) {
+		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+		if (list_empty(&group->marks_list)) {
+			mutex_unlock(&group->mark_mutex);
+			break;
+		}
+		mark = list_first_entry(&group->marks_list,
+					struct fsnotify_mark, g_list);
+		fsnotify_get_mark(mark);
+		fsnotify_detach_mark(mark);
+		mutex_unlock(&group->mark_mutex);
+		__fsnotify_free_mark(mark);
+		fsnotify_put_mark(mark);
+	}
 }
 
 void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@  void fsnotify_init_mark(struct fsnotify_mark *mark,
 	mark->free_mark = free_mark;
 }
 
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
 {
 	struct fsnotify_mark *mark, *next;
 	struct list_head private_destroy_list;
@@ -516,3 +555,8 @@  static void fsnotify_mark_destroy(struct work_struct *work)
 		fsnotify_put_mark(mark);
 	}
 }
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+	fsnotify_mark_destroy_list();
+}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1259e53d9296..29f917517299 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@  extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
 /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
 extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_unmount_inodes(struct super_block *sb);