diff mbox

[3/3] fanotify: Fix list corruption in fanotify_get_response()

Message ID 1473344745-20634-4-git-send-email-jack@suse.cz (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Kara Sept. 8, 2016, 2:25 p.m. UTC
fanotify_get_response() calls fsnotify_remove_event() when it finds that
group is being released from fanotify_release() (bypass_perm is set).
However the event it removes need not be only in the group's notification
queue but it can have already moved to access_list (userspace read the
event before closing the fanotify instance fd) which is protected by a
different lock. Thus when fsnotify_remove_event() races with
fanotify_release() operating on access_list, the list can get corrupted.

Fix the problem by moving all the logic removing permission events from
the lists to one place - fanotify_release().

Reported-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      | 13 +------------
 fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++------------
 include/linux/fsnotify_backend.h   |  1 -
 3 files changed, 25 insertions(+), 25 deletions(-)

Comments

Miklos Szeredi Sept. 9, 2016, 1:39 p.m. UTC | #1
On Thu, Sep 8, 2016 at 4:25 PM, Jan Kara <jack@suse.cz> wrote:
> fanotify_get_response() calls fsnotify_remove_event() when it finds that
> group is being released from fanotify_release() (bypass_perm is set).
> However the event it removes need not be only in the group's notification
> queue but it can have already moved to access_list (userspace read the
> event before closing the fanotify instance fd) which is protected by a
> different lock. Thus when fsnotify_remove_event() races with
> fanotify_release() operating on access_list, the list can get corrupted.
>
> Fix the problem by moving all the logic removing permission events from
> the lists to one place - fanotify_release().

Reviewed-by: Miklos Szeredi <mszeredi@redhat.com>

This should fix the reported issue, and it's cleaner than my solution.
I'll backport and ask QA to test.  I haven't tried reproducing it
myself.

However the theoretical memory access ordering issue (which might
possibly be impossible to trigger) is still there AFAICS:

CPU1:
list_del_init(&event->fae.fse.list);
event->response = FAN_ALLOW;

CPU2:
wait_event(group->fanotify_data.access_waitq, event->response);
...
WARN_ON(!list_empty(&event->list));

So I think we still need a separate patch adding smp_wmb() before
setting event->response and smp_rmb() after the wait_event().

Thanks,
Miklos

>
> Reported-by: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/notify/fanotify/fanotify.c      | 13 +------------
>  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++------------
>  include/linux/fsnotify_backend.h   |  1 -
>  3 files changed, 25 insertions(+), 25 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index 3c6c81e0c2c8..363540df7db9 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
>
>         pr_debug("%s: group=%p event=%p\n", __func__, group, event);
>
> -       wait_event(group->fanotify_data.access_waitq, event->response ||
> -                               atomic_read(&group->fanotify_data.bypass_perm));
> -
> -       if (!event->response) { /* bypass_perm set */
> -               /*
> -                * Event was canceled because group is being destroyed. Remove
> -                * it from group's event list because we are responsible for
> -                * freeing the permission event.
> -                */
> -               fsnotify_remove_event(group, &event->fae.fse);
> -               return 0;
> -       }
> +       wait_event(group->fanotify_data.access_waitq, event->response);
>
>         /* userspace responded, convert to something usable */
>         switch (event->response) {
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 8e8e6bcd1d43..a64313868d3a 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>
>  #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
>         struct fanotify_perm_event_info *event, *next;
> +       struct fsnotify_event *fsn_event;
>
>         /*
> -        * There may be still new events arriving in the notification queue
> -        * but since userspace cannot use fanotify fd anymore, no event can
> -        * enter or leave access_list by now.
> +        * Stop new events from arriving in the notification queue. since
> +        * userspace cannot use fanotify fd anymore, no event can enter or
> +        * leave access_list by now either.
>          */
> -       spin_lock(&group->fanotify_data.access_lock);
> -
> -       atomic_inc(&group->fanotify_data.bypass_perm);
> +       fsnotify_group_stop_queueing(group);
>
> +       /*
> +        * Process all permission events on access_list and notification queue
> +        * and simulate reply from userspace.
> +        */
> +       spin_lock(&group->fanotify_data.access_lock);
>         list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
>                                  fae.fse.list) {
>                 pr_debug("%s: found group=%p event=%p\n", __func__, group,
> @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>         spin_unlock(&group->fanotify_data.access_lock);
>
>         /*
> -        * Since bypass_perm is set, newly queued events will not wait for
> -        * access response. Wake up the already sleeping ones now.
> -        * synchronize_srcu() in fsnotify_destroy_group() will wait for all
> -        * processes sleeping in fanotify_handle_event() waiting for access
> -        * response and thus also for all permission events to be freed.
> +        * Destroy all non-permission events. For permission events just
> +        * dequeue them and set the response. They will be freed once the
> +        * response is consumed and fanotify_get_response() returns.
>          */
> +       mutex_lock(&group->notification_mutex);
> +       while (!fsnotify_notify_queue_is_empty(group)) {
> +               fsn_event = fsnotify_remove_first_event(group);
> +               if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
> +                       fsnotify_destroy_event(group, fsn_event);
> +               else
> +                       FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
> +       }
> +       mutex_unlock(&group->notification_mutex);
> +
> +       /* Response for all permission events it set, wakeup waiters */
>         wake_up(&group->fanotify_data.access_waitq);
>  #endif
>
> @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>         spin_lock_init(&group->fanotify_data.access_lock);
>         init_waitqueue_head(&group->fanotify_data.access_waitq);
>         INIT_LIST_HEAD(&group->fanotify_data.access_list);
> -       atomic_set(&group->fanotify_data.bypass_perm, 0);
>  #endif
>         switch (flags & FAN_ALL_CLASS_BITS) {
>         case FAN_CLASS_NOTIF:
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index bcba826a99fc..7ca04e082002 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -180,7 +180,6 @@ struct fsnotify_group {
>                         spinlock_t access_lock;
>                         struct list_head access_list;
>                         wait_queue_head_t access_waitq;
> -                       atomic_t bypass_perm;
>  #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
>                         int f_flags;
>                         unsigned int max_marks;
> --
> 2.6.6
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Sept. 9, 2016, 3:17 p.m. UTC | #2
On Fri 09-09-16 15:39:44, Miklos Szeredi wrote:
> On Thu, Sep 8, 2016 at 4:25 PM, Jan Kara <jack@suse.cz> wrote:
> > fanotify_get_response() calls fsnotify_remove_event() when it finds that
> > group is being released from fanotify_release() (bypass_perm is set).
> > However the event it removes need not be only in the group's notification
> > queue but it can have already moved to access_list (userspace read the
> > event before closing the fanotify instance fd) which is protected by a
> > different lock. Thus when fsnotify_remove_event() races with
> > fanotify_release() operating on access_list, the list can get corrupted.
> >
> > Fix the problem by moving all the logic removing permission events from
> > the lists to one place - fanotify_release().
> 
> Reviewed-by: Miklos Szeredi <mszeredi@redhat.com>
> 
> This should fix the reported issue, and it's cleaner than my solution.
> I'll backport and ask QA to test.  I haven't tried reproducing it
> myself.

OK, thanks!

> However the theoretical memory access ordering issue (which might
> possibly be impossible to trigger) is still there AFAICS:
> 
> CPU1:
> list_del_init(&event->fae.fse.list);
> event->response = FAN_ALLOW;
> 
> CPU2:
> wait_event(group->fanotify_data.access_waitq, event->response);
> ...
> WARN_ON(!list_empty(&event->list));
> 
> So I think we still need a separate patch adding smp_wmb() before
> setting event->response and smp_rmb() after the wait_event().

Ugh, this is really theoretical ;) But I agree nothing really prevents it.
The core reason for this seems to be the lockless check of event->list. I
dislike adding barriers just to accommodate that WARN_ON() (although that
is a useful one so I wouldn't like to lose it either). I'll think about it.
Thanks for spotting this.

								Honza
Lino Sanfilippo Sept. 10, 2016, 12:33 a.m. UTC | #3
Hi,

On 08.09.2016 16:25, Jan Kara wrote:
> fanotify_get_response() calls fsnotify_remove_event() when it finds that
> group is being released from fanotify_release() (bypass_perm is set).
> However the event it removes need not be only in the group's notification
> queue but it can have already moved to access_list (userspace read the
> event before closing the fanotify instance fd) which is protected by a
> different lock. Thus when fsnotify_remove_event() races with
> fanotify_release() operating on access_list, the list can get corrupted.
> 
> Fix the problem by moving all the logic removing permission events from
> the lists to one place - fanotify_release().
> 
> Reported-by: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Jan Kara <jack@suse.cz>

I think all in all this patchset implements a sane solution. Nevertheless 
some comments/questions concerning the new code in fanotify_release below:

> @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>  	spin_unlock(&group->fanotify_data.access_lock);
>  
>  	/*
> -	 * Since bypass_perm is set, newly queued events will not wait for
> -	 * access response. Wake up the already sleeping ones now.
> -	 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
> -	 * processes sleeping in fanotify_handle_event() waiting for access
> -	 * response and thus also for all permission events to be freed.
> +	 * Destroy all non-permission events. For permission events just
> +	 * dequeue them and set the response. They will be freed once the
> +	 * response is consumed and fanotify_get_response() returns.
>  	 */
> +	mutex_lock(&group->notification_mutex);
> +	while (!fsnotify_notify_queue_is_empty(group)) {
> +		fsn_event = fsnotify_remove_first_event(group);
> +		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
> +			fsnotify_destroy_event(group, fsn_event);
> +		else
> +			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;

Why do we have to handle (i.e destroy) non-permission events here? Is this 
meant as some kind of optimization?

Also, we now set the response during both iterations. I assume the assignment during
the iteration of the access list is only a leftover and can be skipped.

Why dont we set the shutdown flag directly in release() as soon as we grep 
the notification mutex. I dont see any reason to do this seperated from
the removal of events from the notification list (or do I miss something?).
Similar situation in destroy_group(): We can set the flag in flush_notify instead
of destroy_group (we then do not even need the dedicated function stop_queueing() then).

fsnotify_remove_event() is not used any more now, so we should remove this function.

Regards,
Lino
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Sept. 12, 2016, 8:32 a.m. UTC | #4
On Sat 10-09-16 02:33:34, Lino Sanfilippo wrote:
> > @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
> >  	spin_unlock(&group->fanotify_data.access_lock);
> >  
> >  	/*
> > -	 * Since bypass_perm is set, newly queued events will not wait for
> > -	 * access response. Wake up the already sleeping ones now.
> > -	 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
> > -	 * processes sleeping in fanotify_handle_event() waiting for access
> > -	 * response and thus also for all permission events to be freed.
> > +	 * Destroy all non-permission events. For permission events just
> > +	 * dequeue them and set the response. They will be freed once the
> > +	 * response is consumed and fanotify_get_response() returns.
> >  	 */
> > +	mutex_lock(&group->notification_mutex);
> > +	while (!fsnotify_notify_queue_is_empty(group)) {
> > +		fsn_event = fsnotify_remove_first_event(group);
> > +		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
> > +			fsnotify_destroy_event(group, fsn_event);
> > +		else
> > +			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
> 
> Why do we have to handle (i.e destroy) non-permission events here? Is this 
> meant as some kind of optimization?

Where else would they be destroyed? Normal events are destroyed when
userspace reads them but nobody is going to read these anymore. For
permission events things are more complex - they get destroyed by the
process creating these events since that process waits for the response.

> Also, we now set the response during both iterations. I assume the
> assignment during the iteration of the access list is only a leftover and
> can be skipped.

Both are currently needed because I've changed fanotify_get_response() to
check only the response value.

> Why dont we set the shutdown flag directly in release() as soon as we
> grep the notification mutex. I dont see any reason to do this seperated
> from the removal of events from the notification list (or do I miss
> something?).  Similar situation in destroy_group(): We can set the flag
> in flush_notify instead of destroy_group (we then do not even need the
> dedicated function stop_queueing() then).

We could do it like that but I didn't want fanotify to hook into generic
fsnotify internals and rather wanted to encapsulate the functionality in
a function. Since this is not really performance critical, extra round trip
on notification_mutex should be fine.

> fsnotify_remove_event() is not used any more now, so we should remove
> this function.

Yeah, will fix that.

								Honza
Miklos Szeredi Sept. 12, 2016, 8:36 a.m. UTC | #5
On Mon, Sep 12, 2016 at 10:32 AM, Jan Kara <jack@suse.cz> wrote:
> On Sat 10-09-16 02:33:34, Lino Sanfilippo wrote:
>> > @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>> >     spin_unlock(&group->fanotify_data.access_lock);
>> >
>> >     /*
>> > -    * Since bypass_perm is set, newly queued events will not wait for
>> > -    * access response. Wake up the already sleeping ones now.
>> > -    * synchronize_srcu() in fsnotify_destroy_group() will wait for all
>> > -    * processes sleeping in fanotify_handle_event() waiting for access
>> > -    * response and thus also for all permission events to be freed.
>> > +    * Destroy all non-permission events. For permission events just
>> > +    * dequeue them and set the response. They will be freed once the
>> > +    * response is consumed and fanotify_get_response() returns.
>> >      */
>> > +   mutex_lock(&group->notification_mutex);
>> > +   while (!fsnotify_notify_queue_is_empty(group)) {
>> > +           fsn_event = fsnotify_remove_first_event(group);
>> > +           if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
>> > +                   fsnotify_destroy_event(group, fsn_event);
>> > +           else
>> > +                   FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
>>
>> Why do we have to handle (i.e destroy) non-permission events here? Is this
>> meant as some kind of optimization?
>
> Where else would they be destroyed? Normal events are destroyed when
> userspace reads them but nobody is going to read these anymore. For
> permission events things are more complex - they get destroyed by the
> process creating these events since that process waits for the response.

I think he meant that normal events get destroyed anyway in
fsnotify_destroy_group().  Which is true, but then we'd need a proper
iterator for picking just permission events and it would be a lot more
complicated in the end.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lino Sanfilippo Sept. 12, 2016, 9:11 p.m. UTC | #6
On 12.09.2016 10:32, Jan Kara wrote:

> 
>> Why dont we set the shutdown flag directly in release() as soon as we
>> grep the notification mutex. I dont see any reason to do this seperated
>> from the removal of events from the notification list (or do I miss
>> something?).  Similar situation in destroy_group(): We can set the flag
>> in flush_notify instead of destroy_group (we then do not even need the
>> dedicated function stop_queueing() then).
> 
> We could do it like that but I didn't want fanotify to hook into generic
> fsnotify internals and rather wanted to encapsulate the functionality in
> a function. Since this is not really performance critical, extra round trip
> on notification_mutex should be fine.

I understood that flag as an extension for generic fsnotify groups, not only
for fanotify. Sure, there is nothing else that uses it right now, but
"being destroyed" is a state that is valid for all types of fsnotify groups, and it could
 be useful in future to know when a group is in this state. But maybe it depends
on the point of view if this should be treated as a fanotify or fsnotify extension. And
its up to you, of course.

Regards,
Lino



 

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miklos Szeredi Sept. 13, 2016, 7:58 a.m. UTC | #7
The backported patches test out OK.

I asked permission to publish the reproducer.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miklos Szeredi Sept. 13, 2016, 9:25 a.m. UTC | #8
On Tue, Sep 13, 2016 at 9:58 AM, Miklos Szeredi <mszeredi@redhat.com> wrote:
> The backported patches test out OK.
>
> I asked permission to publish the reproducer.

Fujitsu provided the attached reproducer.  Entry point is "./run.sh"

Thanks,
Miklos
diff mbox

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 3c6c81e0c2c8..363540df7db9 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -67,18 +67,7 @@  static int fanotify_get_response(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	wait_event(group->fanotify_data.access_waitq, event->response ||
-				atomic_read(&group->fanotify_data.bypass_perm));
-
-	if (!event->response) {	/* bypass_perm set */
-		/*
-		 * Event was canceled because group is being destroyed. Remove
-		 * it from group's event list because we are responsible for
-		 * freeing the permission event.
-		 */
-		fsnotify_remove_event(group, &event->fae.fse);
-		return 0;
-	}
+	wait_event(group->fanotify_data.access_waitq, event->response);
 
 	/* userspace responded, convert to something usable */
 	switch (event->response) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e8e6bcd1d43..a64313868d3a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -358,16 +358,20 @@  static int fanotify_release(struct inode *ignored, struct file *file)
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	struct fanotify_perm_event_info *event, *next;
+	struct fsnotify_event *fsn_event;
 
 	/*
-	 * There may be still new events arriving in the notification queue
-	 * but since userspace cannot use fanotify fd anymore, no event can
-	 * enter or leave access_list by now.
+	 * Stop new events from arriving in the notification queue. since
+	 * userspace cannot use fanotify fd anymore, no event can enter or
+	 * leave access_list by now either.
 	 */
-	spin_lock(&group->fanotify_data.access_lock);
-
-	atomic_inc(&group->fanotify_data.bypass_perm);
+	fsnotify_group_stop_queueing(group);
 
+	/*
+	 * Process all permission events on access_list and notification queue
+	 * and simulate reply from userspace.
+	 */
+	spin_lock(&group->fanotify_data.access_lock);
 	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
 				 fae.fse.list) {
 		pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -379,12 +383,21 @@  static int fanotify_release(struct inode *ignored, struct file *file)
 	spin_unlock(&group->fanotify_data.access_lock);
 
 	/*
-	 * Since bypass_perm is set, newly queued events will not wait for
-	 * access response. Wake up the already sleeping ones now.
-	 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
-	 * processes sleeping in fanotify_handle_event() waiting for access
-	 * response and thus also for all permission events to be freed.
+	 * Destroy all non-permission events. For permission events just
+	 * dequeue them and set the response. They will be freed once the
+	 * response is consumed and fanotify_get_response() returns.
 	 */
+	mutex_lock(&group->notification_mutex);
+	while (!fsnotify_notify_queue_is_empty(group)) {
+		fsn_event = fsnotify_remove_first_event(group);
+		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+			fsnotify_destroy_event(group, fsn_event);
+		else
+			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+	}
+	mutex_unlock(&group->notification_mutex);
+
+	/* Response for all permission events it set, wakeup waiters */
 	wake_up(&group->fanotify_data.access_waitq);
 #endif
 
@@ -755,7 +768,6 @@  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	spin_lock_init(&group->fanotify_data.access_lock);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
-	atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
 	switch (flags & FAN_ALL_CLASS_BITS) {
 	case FAN_CLASS_NOTIF:
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index bcba826a99fc..7ca04e082002 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -180,7 +180,6 @@  struct fsnotify_group {
 			spinlock_t access_lock;
 			struct list_head access_list;
 			wait_queue_head_t access_waitq;
-			atomic_t bypass_perm;
 #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
 			int f_flags;
 			unsigned int max_marks;