diff mbox

[RFC,2/2] fsnotify: handle permission events without holding fsnotify_mark_srcu[0]

Message ID 1479124107-8477-3-git-send-email-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Amir Goldstein Nov. 14, 2016, 11:48 a.m. UTC
Handling fanotify events does not entail dereferencing fsnotify_mark
beyond the point of fanotify_should_send_event().

For the case of permission events, which may block indefinitely,
return -EAGAIN and then fsnotify() will call handle_event() again
without a reference to the mark.

Without a reference to the mark, there is no need to call
handle_event() under fsnotify_mark_srcu[0] read side lock,
so we drop fsnotify_mark_srcu[0] while handling the event
and grab it back before continuing to the next mark.

After this change, a blocking permission event will no longer
block closing of any file descriptors of 0 priority groups,
i.e: inotify and fanotify groups of class FAN_CLASS_NOTIF.

Reported-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify.c | 15 +++++++++++----
 fs/notify/fsnotify.c          | 23 +++++++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

Comments

Jan Kara Nov. 14, 2016, 1:20 p.m. UTC | #1
On Mon 14-11-16 13:48:27, Amir Goldstein wrote:
> Handling fanotify events does not entail dereferencing fsnotify_mark
> beyond the point of fanotify_should_send_event().
> 
> For the case of permission events, which may block indefinitely,
> return -EAGAIN and then fsnotify() will call handle_event() again
> without a reference to the mark.
> 
> Without a reference to the mark, there is no need to call
> handle_event() under fsnotify_mark_srcu[0] read side lock,
> so we drop fsnotify_mark_srcu[0] while handling the event
> and grab it back before continuing to the next mark.
> 
> After this change, a blocking permission event will no longer
> block closing of any file descriptors of 0 priority groups,
> i.e: inotify and fanotify groups of class FAN_CLASS_NOTIF.
> 
> Reported-by: Miklos Szeredi <miklos@szeredi.hu>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

Well, this has a similar problem as my attempt to fix the issue. The
current mark can get removed from the mark list while waiting for userspace
response. ->next pointer is still valid at that moment but ->next->pprev
already points to mark preceding us (that's how rcu lists work). When
->next mark then gets removed from the list and destroyed (it may be
protected by the second srcu), our ->next pointer points to freed memory.

Furthermore I don't like the scheme of ->handle_event returning -EAGAIN and
then dropping the srcu lock - I'd rather have some helpers provided by the
generic fsnotify code to drop srcu lock. That needs some propagation of
information inside the ->handle_event and then the helper but that's IMO
cleaner. Anyway, that is just a technical detail.

I have some ideas how to fix up issues with my refcounting approach -
refcount would pin marks not only in memory but also in lists but I have
yet to see whether that works out sensibly (it would mean that dropping
mark reference would then need to take group->mark_mutex and that may cause
lock ordering issues).

								Honza


> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index e0e5f7c..c7689ad 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -176,7 +176,7 @@ init: __maybe_unused
>  static int fanotify_handle_event(struct fsnotify_group *group,
>  				 struct inode *inode,
>  				 struct fsnotify_mark *inode_mark,
> -				 struct fsnotify_mark *fanotify_mark,
> +				 struct fsnotify_mark *vfsmnt_mark,
>  				 u32 mask, void *data, int data_type,
>  				 const unsigned char *file_name, u32 cookie)
>  {
> @@ -195,9 +195,16 @@ static int fanotify_handle_event(struct fsnotify_group *group,
>  	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
>  	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
>  
> -	if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
> -					data_type))
> -		return 0;
> +	if (inode_mark || vfsmnt_mark) {
> +		if (!fanotify_should_send_event(inode_mark, vfsmnt_mark, mask,
> +						data, data_type))
> +			return 0;
> +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
> +		/* Ask to be called again without a reference to mark */
> +		if (mask & FAN_ALL_PERM_EVENTS)
> +			return -EAGAIN;
> +#endif
> +	}
>  
>  	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
>  		 mask);
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index af5c523a..5b9a248 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -291,6 +291,29 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
>  		ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
>  				    data, data_is, cookie, file_name);
>  
> +		/*
> +		 * If handle_event() is going to block, we call it again
> +		 * witout holding fsnotify_mark_srcu[0], which is protecting
> +		 * the low priority mark lists.
> +		 * We are still holding fsnotify_mark_srcu[1], which
> +		 * is protecting the high priority marks in the first half
> +		 * of the mark list, which is where we are at.
> +		 */
> +		if (group->priority > 0 && ret == -EAGAIN) {
> +			srcu_read_unlock(&fsnotify_mark_srcu[0], idx);
> +
> +			ret = group->ops->handle_event(group, to_tell,
> +						       NULL, NULL,
> +						       mask, data, data_is,
> +						       file_name, cookie);
> +
> +			/*
> +			 * We need to hold fsnotify_mark_srcu[0], because
> +			 * next mark may be low priority.
> +			 */
> +			idx = srcu_read_lock(&fsnotify_mark_srcu[0]);
> +		}
> +
>  		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
>  			goto out;
>  
> -- 
> 2.7.4
>
Amir Goldstein Nov. 14, 2016, 3:09 p.m. UTC | #2
On Mon, Nov 14, 2016 at 3:20 PM, Jan Kara <jack@suse.cz> wrote:
> On Mon 14-11-16 13:48:27, Amir Goldstein wrote:
>> Handling fanotify events does not entail dereferencing fsnotify_mark
>> beyond the point of fanotify_should_send_event().
>>
>> For the case of permission events, which may block indefinitely,
>> return -EAGAIN and then fsnotify() will call handle_event() again
>> without a reference to the mark.
>>
>> Without a reference to the mark, there is no need to call
>> handle_event() under fsnotify_mark_srcu[0] read side lock,
>> so we drop fsnotify_mark_srcu[0] while handling the event
>> and grab it back before continuing to the next mark.
>>
>> After this change, a blocking permission event will no longer
>> block closing of any file descriptors of 0 priority groups,
>> i.e: inotify and fanotify groups of class FAN_CLASS_NOTIF.
>>
>> Reported-by: Miklos Szeredi <miklos@szeredi.hu>
>> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>
> Well, this has a similar problem as my attempt to fix the issue. The
> current mark can get removed from the mark list while waiting for userspace
> response. ->next pointer is still valid at that moment but ->next->pprev
> already points to mark preceding us (that's how rcu lists work). When
> ->next mark then gets removed from the list and destroyed (it may be
> protected by the second srcu), our ->next pointer points to freed memory.

Oh! I missed the fact that the SRCU does not protect mark->obj_list.
Can resurrecting mark->free_list buy us anything?
Perhaps keep the marks on obj_list without FLAG_ATTACHED
and then remove marks from both free_list and obj_list post srcu_synchronize()?
I think that removing mark->obj_list was optimization of good faith
and not because it really hurts the system's memory usage that much?

>
> Furthermore I don't like the scheme of ->handle_event returning -EAGAIN and
> then dropping the srcu lock - I'd rather have some helpers provided by the
> generic fsnotify code to drop srcu lock. That needs some propagation of
> information inside the ->handle_event and then the helper but that's IMO
> cleaner. Anyway, that is just a technical detail.
>

Yeh, that's just the minimal LOC implementation. I figured the implementation
may be rejected for more profound flaws. Although personally,
I do like the so called O_NONBLOCKING semantics here.
I debated with myself if I should use EAGAIN or EWOULDBLOCK
for sake of readability.

> I have some ideas how to fix up issues with my refcounting approach -
> refcount would pin marks not only in memory but also in lists but I have
> yet to see whether that works out sensibly (it would mean that dropping
> mark reference would then need to take group->mark_mutex and that may cause
> lock ordering issues).
>

It sounds like a chicken and egg problem, but I suppose you don't mean
the actual mark refcount, but a secondary "pinned refcount"?

Anyway, if you have something ready, my test setup is already in place..

Cheers,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Nov. 16, 2016, 9:35 a.m. UTC | #3
On Mon 14-11-16 17:09:47, Amir Goldstein wrote:
> On Mon, Nov 14, 2016 at 3:20 PM, Jan Kara <jack@suse.cz> wrote:
> > On Mon 14-11-16 13:48:27, Amir Goldstein wrote:
> >> Handling fanotify events does not entail dereferencing fsnotify_mark
> >> beyond the point of fanotify_should_send_event().
> >>
> >> For the case of permission events, which may block indefinitely,
> >> return -EAGAIN and then fsnotify() will call handle_event() again
> >> without a reference to the mark.
> >>
> >> Without a reference to the mark, there is no need to call
> >> handle_event() under fsnotify_mark_srcu[0] read side lock,
> >> so we drop fsnotify_mark_srcu[0] while handling the event
> >> and grab it back before continuing to the next mark.
> >>
> >> After this change, a blocking permission event will no longer
> >> block closing of any file descriptors of 0 priority groups,
> >> i.e: inotify and fanotify groups of class FAN_CLASS_NOTIF.
> >>
> >> Reported-by: Miklos Szeredi <miklos@szeredi.hu>
> >> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> >
> > Well, this has a similar problem as my attempt to fix the issue. The
> > current mark can get removed from the mark list while waiting for userspace
> > response. ->next pointer is still valid at that moment but ->next->pprev
> > already points to mark preceding us (that's how rcu lists work). When
> > ->next mark then gets removed from the list and destroyed (it may be
> > protected by the second srcu), our ->next pointer points to freed memory.
> 
> Oh! I missed the fact that the SRCU does not protect mark->obj_list.
> Can resurrecting mark->free_list buy us anything?
> Perhaps keep the marks on obj_list without FLAG_ATTACHED
> and then remove marks from both free_list and obj_list post srcu_synchronize()?
> I think that removing mark->obj_list was optimization of good faith
> and not because it really hurts the system's memory usage that much?

You have to be really careful here. Minimally you'd need then another
srcu_synchronize() call after removing mark from the list and before
freeing the mark so that you are sure no process iterating the list can
have a reference to a mark being freed but even then it needs careful
checking whether it would work. The joy of lockless algorithms...

> > I have some ideas how to fix up issues with my refcounting approach -
> > refcount would pin marks not only in memory but also in lists but I have
> > yet to see whether that works out sensibly (it would mean that dropping
> > mark reference would then need to take group->mark_mutex and that may cause
> > lock ordering issues).
> 
> It sounds like a chicken and egg problem, but I suppose you don't mean
> the actual mark refcount, but a secondary "pinned refcount"?

No, I mean the original refcount. I have patches that already postpone part
of the mark cleanup to happen only once the refcount drops to 0.

> Anyway, if you have something ready, my test setup is already in place..

Thanks for an offer. I don't have anything yet, hopefully later today or
tomorrow.

								Honza
diff mbox

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index e0e5f7c..c7689ad 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -176,7 +176,7 @@  init: __maybe_unused
 static int fanotify_handle_event(struct fsnotify_group *group,
 				 struct inode *inode,
 				 struct fsnotify_mark *inode_mark,
-				 struct fsnotify_mark *fanotify_mark,
+				 struct fsnotify_mark *vfsmnt_mark,
 				 u32 mask, void *data, int data_type,
 				 const unsigned char *file_name, u32 cookie)
 {
@@ -195,9 +195,16 @@  static int fanotify_handle_event(struct fsnotify_group *group,
 	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
 	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
 
-	if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
-					data_type))
-		return 0;
+	if (inode_mark || vfsmnt_mark) {
+		if (!fanotify_should_send_event(inode_mark, vfsmnt_mark, mask,
+						data, data_type))
+			return 0;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+		/* Ask to be called again without a reference to mark */
+		if (mask & FAN_ALL_PERM_EVENTS)
+			return -EAGAIN;
+#endif
+	}
 
 	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
 		 mask);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index af5c523a..5b9a248 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -291,6 +291,29 @@  int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
 				    data, data_is, cookie, file_name);
 
+		/*
+		 * If handle_event() is going to block, we call it again
+		 * witout holding fsnotify_mark_srcu[0], which is protecting
+		 * the low priority mark lists.
+		 * We are still holding fsnotify_mark_srcu[1], which
+		 * is protecting the high priority marks in the first half
+		 * of the mark list, which is where we are at.
+		 */
+		if (group->priority > 0 && ret == -EAGAIN) {
+			srcu_read_unlock(&fsnotify_mark_srcu[0], idx);
+
+			ret = group->ops->handle_event(group, to_tell,
+						       NULL, NULL,
+						       mask, data, data_is,
+						       file_name, cookie);
+
+			/*
+			 * We need to hold fsnotify_mark_srcu[0], because
+			 * next mark may be low priority.
+			 */
+			idx = srcu_read_lock(&fsnotify_mark_srcu[0]);
+		}
+
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
 			goto out;