
[v6,15/21] fanotify: Preallocate per superblock mark error event

Message ID: 20210812214010.3197279-16-krisman@collabora.com
State: New, archived
Series: File system wide monitoring

Commit Message

Gabriel Krisman Bertazi Aug. 12, 2021, 9:40 p.m. UTC
Error reporting needs to be done in an atomic context.  This patch
introduces a single error slot for superblock marks that report the
FAN_FS_ERROR event, to be used during event submission.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>

---
Changes v5:
  - Restore mark references. (jan)
  - Tie fee slot to the mark lifetime. (jan)
  - Don't reallocate event. (jan)
---
 fs/notify/fanotify/fanotify.c      | 12 ++++++++++++
 fs/notify/fanotify/fanotify.h      | 13 +++++++++++++
 fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++++++++++++++++--
 3 files changed, 54 insertions(+), 2 deletions(-)

Comments

Amir Goldstein Aug. 13, 2021, 8:40 a.m. UTC | #1
On Fri, Aug 13, 2021 at 12:41 AM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> Error reporting needs to be done in an atomic context.  This patch
> introduces a single error slot for superblock marks that report the
> FAN_FS_ERROR event, to be used during event submission.
>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>
> ---
> Changes v5:
>   - Restore mark references. (jan)
>   - Tie fee slot to the mark lifetime.(jan)
>   - Don't reallocate event(jan)
> ---
>  fs/notify/fanotify/fanotify.c      | 12 ++++++++++++
>  fs/notify/fanotify/fanotify.h      | 13 +++++++++++++
>  fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++++++++++++++++--
>  3 files changed, 54 insertions(+), 2 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index ebb6c557cea1..3bf6fd85c634 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -855,6 +855,14 @@ static void fanotify_free_name_event(struct fanotify_event *event)
>         kfree(FANOTIFY_NE(event));
>  }
>
> +static void fanotify_free_error_event(struct fanotify_event *event)
> +{
> +       /*
> +        * The actual event is tied to a mark, and is released on mark
> +        * removal
> +        */
> +}
> +
>  static void fanotify_free_event(struct fsnotify_event *fsn_event)
>  {
>         struct fanotify_event *event;
> @@ -877,6 +885,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
>         case FANOTIFY_EVENT_TYPE_OVERFLOW:
>                 kfree(event);
>                 break;
> +       case FANOTIFY_EVENT_TYPE_FS_ERROR:
> +               fanotify_free_error_event(event);
> +               break;
>         default:
>                 WARN_ON_ONCE(1);
>         }
> @@ -894,6 +905,7 @@ static void fanotify_free_mark(struct fsnotify_mark *mark)
>         if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
>                 struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
>
> +               kfree(fa_mark->fee_slot);
>                 kmem_cache_free(fanotify_sb_mark_cache, fa_mark);
>         } else {
>                 kmem_cache_free(fanotify_mark_cache, mark);
> diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
> index b3ab620822c2..3f03333df32f 100644
> --- a/fs/notify/fanotify/fanotify.h
> +++ b/fs/notify/fanotify/fanotify.h
> @@ -139,6 +139,7 @@ enum fanotify_mark_bits {
>
>  struct fanotify_sb_mark {
>         struct fsnotify_mark fsn_mark;
> +       struct fanotify_error_event *fee_slot;
>  };
>
>  static inline
> @@ -161,6 +162,7 @@ enum fanotify_event_type {
>         FANOTIFY_EVENT_TYPE_PATH,
>         FANOTIFY_EVENT_TYPE_PATH_PERM,
>         FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
> +       FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
>         __FANOTIFY_EVENT_TYPE_NUM
>  };
>
> @@ -216,6 +218,17 @@ FANOTIFY_NE(struct fanotify_event *event)
>         return container_of(event, struct fanotify_name_event, fae);
>  }
>
> +struct fanotify_error_event {
> +       struct fanotify_event fae;
> +       struct fanotify_sb_mark *sb_mark; /* Back reference to the mark. */
> +};
> +
> +static inline struct fanotify_error_event *
> +FANOTIFY_EE(struct fanotify_event *event)
> +{
> +       return container_of(event, struct fanotify_error_event, fae);
> +}
> +
>  static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
>  {
>         if (event->type == FANOTIFY_EVENT_TYPE_FID)
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 54107f1533d5..b77030386d7f 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -947,8 +947,10 @@ static struct fsnotify_mark *fanotify_alloc_mark(struct fsnotify_group *group,
>
>         fsnotify_init_mark(mark, group);
>
> -       if (type == FSNOTIFY_OBJ_TYPE_SB)
> +       if (type == FSNOTIFY_OBJ_TYPE_SB) {
>                 mark->flags |= FANOTIFY_MARK_FLAG_SB_MARK;
> +               sb_mark->fee_slot = NULL;
> +       }
>
>         return mark;
>  }
> @@ -999,6 +1001,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
>  {
>         struct fsnotify_mark *fsn_mark;
>         __u32 added;
> +       int ret = 0;
>
>         mutex_lock(&group->mark_mutex);
>         fsn_mark = fsnotify_find_mark(connp, group);
> @@ -1009,13 +1012,37 @@ static int fanotify_add_mark(struct fsnotify_group *group,
>                         return PTR_ERR(fsn_mark);
>                 }
>         }
> +
> +       /*
> +        * Error events are allocated per super-block mark only if
> +        * strictly needed (i.e. FAN_FS_ERROR was requested).
> +        */
> +       if (type == FSNOTIFY_OBJ_TYPE_SB && !(flags & FAN_MARK_IGNORED_MASK) &&
> +           (mask & FAN_FS_ERROR)) {
> +               struct fanotify_sb_mark *sb_mark = FANOTIFY_SB_MARK(fsn_mark);
> +
> +               if (!sb_mark->fee_slot) {
> +                       struct fanotify_error_event *fee =
> +                               kzalloc(sizeof(*fee), GFP_KERNEL_ACCOUNT);
> +                       if (!fee) {
> +                               ret = -ENOMEM;
> +                               goto out;
> +                       }
> +                       fanotify_init_event(&fee->fae, 0, FS_ERROR);
> +                       fee->sb_mark = sb_mark;

I think Jan wanted to avoid zalloc()?
Please use kmalloc() and init the rest of the fee-> members.
We do not need to fill the entire fh buf with zeroes.
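
A minimal sketch of that shape, reusing identifiers from the patch above
(which members need explicit initialization here is an assumption; later
patches in the series add more fields):

	struct fanotify_error_event *fee;

	fee = kmalloc(sizeof(*fee), GFP_KERNEL_ACCOUNT);
	if (!fee) {
		ret = -ENOMEM;
		goto out;
	}
	/*
	 * Initialize only what is actually used; the fh buffer is
	 * written at event-generation time, so no zeroing is needed.
	 */
	fanotify_init_event(&fee->fae, 0, FS_ERROR);
	fee->sb_mark = sb_mark;
	sb_mark->fee_slot = fee;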

Thanks,
Amir.
Jan Kara Aug. 16, 2021, 3:57 p.m. UTC | #2
On Thu 12-08-21 17:40:04, Gabriel Krisman Bertazi wrote:
> Error reporting needs to be done in an atomic context.  This patch
> introduces a single error slot for superblock marks that report the
> FAN_FS_ERROR event, to be used during event submission.
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> 
> ---
> Changes v5:
>   - Restore mark references. (jan)
>   - Tie fee slot to the mark lifetime.(jan)
>   - Don't reallocate event(jan)
> ---
>  fs/notify/fanotify/fanotify.c      | 12 ++++++++++++
>  fs/notify/fanotify/fanotify.h      | 13 +++++++++++++
>  fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++++++++++++++++--
>  3 files changed, 54 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index ebb6c557cea1..3bf6fd85c634 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -855,6 +855,14 @@ static void fanotify_free_name_event(struct fanotify_event *event)
>  	kfree(FANOTIFY_NE(event));
>  }
>  
> +static void fanotify_free_error_event(struct fanotify_event *event)
> +{
> +	/*
> +	 * The actual event is tied to a mark, and is released on mark
> +	 * removal
> +	 */
> +}
> +

I was pondering the lifetime rules some more. This is also related to
patch 16/21 but I'll comment here. When we hold a mark ref from a queued
event, we introduce a subtle race into the group destruction logic. There
we first evict all marks, wait for them to be destroyed by the worker
thread after the SRCU period expires, and then we remove queued events.
When an event holds a mark reference we break this: the mark will exist
until the event is dequeued, so the group can get freed before we actually
free the mark, and mark freeing can then hit use-after-free issues.

So we'll have to do this a bit differently. I have two options:

1) Instead of preallocating events explicitly like this, we could set up a
mempool to allocate error events from for each notification group. We would
resize the mempool when adding an error mark so that it has as many reserved
events as error marks. The upside is that error events become much less
special - no special lifetime rules. We'd just need to set up & resize the
mempool. We would also have to provide a proper merge function for error
events (to merge events from the same sb). There will also be a limit on the
number of error marks per group because mempools use kmalloc() for the array
tracking reserved events. But we could certainly manage 512, likely 1024
error marks per notification group.
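
A rough sketch of the setup, with hypothetical names ('error_events_pool'
as a mempool_t * member of the group, and a made-up helper resizing it to
the number of error marks):

	static int fanotify_reserve_error_events(struct fsnotify_group *group,
						 int nr_error_marks)
	{
		if (!group->error_events_pool) {
			group->error_events_pool = mempool_create_kmalloc_pool(
				nr_error_marks,
				sizeof(struct fanotify_error_event));
			return group->error_events_pool ? 0 : -ENOMEM;
		}
		return mempool_resize(group->error_events_pool,
				      nr_error_marks);
	}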

2) We would keep attaching the event to the mark as currently. As far as I
have checked, the event doesn't actually need a back-ref to sb_mark. It is
really only used for mark reference taking (and then to get to the sb from
fanotify_handle_error_event(), but we can certainly get to the sb by easier
means there). So I would just remove that. What we still need to know in
fanotify_free_error_event(), though, is whether the sb_mark is still alive
or not. If it is alive, we leave the event alone; otherwise we need to free
it. So we need a mark_alive flag in the error event and then do in the
->freeing_mark callback something like:

	if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
		struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);

###		/* Maybe we could use mark->lock for this? */
		spin_lock(&group->notification_lock);
		if (fa_mark->fee_slot) {
			if (list_empty(&fa_mark->fee_slot->fae.fse.list)) {
				kfree(fa_mark->fee_slot);
				fa_mark->fee_slot = NULL;
			} else {
				fa_mark->fee_slot->mark_alive = 0;
			}
		}
		spin_unlock(&group->notification_lock);
	}

And then when queueing and dequeueing an event we would have to carefully
check the mark & event state under the appropriate lock (because
->handle_event() callbacks can see marks on the way to being destroyed, as
they are protected just by SRCU).
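
For illustration, the dequeue-side counterpart could look roughly like
this (a sketch only; which lock makes 'mark_alive' stable here is exactly
the open question above):

	static void fanotify_free_error_event(struct fanotify_event *event)
	{
		struct fanotify_error_event *fee = FANOTIFY_EE(event);

		/*
		 * Assumes the same lock as in ->freeing_mark above is
		 * held, so 'mark_alive' cannot change under us.
		 */
		if (!fee->mark_alive)
			kfree(fee);
		/* else the mark still owns the slot and frees it later */
	}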


> @@ -1009,13 +1012,37 @@ static int fanotify_add_mark(struct fsnotify_group *group,
>  			return PTR_ERR(fsn_mark);
>  		}
>  	}
> +
> +	/*
> +	 * Error events are allocated per super-block mark only if
> +	 * strictly needed (i.e. FAN_FS_ERROR was requested).
> +	 */
> +	if (type == FSNOTIFY_OBJ_TYPE_SB && !(flags & FAN_MARK_IGNORED_MASK) &&
> +	    (mask & FAN_FS_ERROR)) {
> +		struct fanotify_sb_mark *sb_mark = FANOTIFY_SB_MARK(fsn_mark);
> +
> +		if (!sb_mark->fee_slot) {
> +			struct fanotify_error_event *fee =
> +				kzalloc(sizeof(*fee), GFP_KERNEL_ACCOUNT);

As Amir mentioned, no need for kzalloc() here.

> +			if (!fee) {
> +				ret = -ENOMEM;
> +				goto out;
> +			}
> +			fanotify_init_event(&fee->fae, 0, FS_ERROR);
> +			fee->sb_mark = sb_mark;
> +			sb_mark->fee_slot = fee;

Careful here. The 'sb_mark' can already be attached to the sb and events
can walk it. So we should make sure these readers don't see a
half-initialized 'fee' due to the CPU reordering stores. So this needs to
be protected by the same lock that we use when generating the error event.
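
Roughly, with publication done under that lock (a sketch; the choice of
notification_lock as the publication lock is an assumption):

	fanotify_init_event(&fee->fae, 0, FS_ERROR);
	fee->sb_mark = sb_mark;

	/* Publish only after 'fee' is fully initialized. */
	spin_lock(&group->notification_lock);
	sb_mark->fee_slot = fee;
	spin_unlock(&group->notification_lock);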

								Honza
Gabriel Krisman Bertazi Aug. 27, 2021, 6:18 p.m. UTC | #3
Jan Kara <jack@suse.cz> writes:

> On Thu 12-08-21 17:40:04, Gabriel Krisman Bertazi wrote:
>> Error reporting needs to be done in an atomic context.  This patch
>> introduces a single error slot for superblock marks that report the
>> FAN_FS_ERROR event, to be used during event submission.
>> 
>> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>> 
>> ---
>> Changes v5:
>>   - Restore mark references. (jan)
>>   - Tie fee slot to the mark lifetime.(jan)
>>   - Don't reallocate event(jan)
>> ---
>>  fs/notify/fanotify/fanotify.c      | 12 ++++++++++++
>>  fs/notify/fanotify/fanotify.h      | 13 +++++++++++++
>>  fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++++++++++++++++--
>>  3 files changed, 54 insertions(+), 2 deletions(-)
>> 
>> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
>> index ebb6c557cea1..3bf6fd85c634 100644
>> --- a/fs/notify/fanotify/fanotify.c
>> +++ b/fs/notify/fanotify/fanotify.c
>> @@ -855,6 +855,14 @@ static void fanotify_free_name_event(struct fanotify_event *event)
>>  	kfree(FANOTIFY_NE(event));
>>  }
>>  
>> +static void fanotify_free_error_event(struct fanotify_event *event)
>> +{
>> +	/*
>> +	 * The actual event is tied to a mark, and is released on mark
>> +	 * removal
>> +	 */
>> +}
>> +
>
> I was pondering about the lifetime rules some more. This is also related to
> patch 16/21 but I'll comment here. When we hold mark ref from queued event,
> we introduce a subtle race into group destruction logic. There we first
> evict all marks, wait for them to be destroyed by worker thread after SRCU
> period expires, and then we remove queued events. When we hold mark
> reference from an event we break this as mark will exist until the event is
> dequeued and then group can get freed before we actually free the mark and
> so mark freeing can hit use-after-free issues.
>
> So we'll have to do this a bit differently. I have two options:
>
> 1) Instead of preallocating events explicitely like this, we could setup a
> mempool to allocate error events from for each notification group. We would
> resize the mempool when adding error mark so that it has as many reserved
> events as error marks. Upside is error events will be much less special -
> no special lifetime rules. We'd just need to setup & resize the mempool. We
> would also have to provide proper merge function for error events (to merge
> events from the same sb). Also there will be limitation of number of error
> marks per group because mempools use kmalloc() for an array tracking
> reserved events. But we could certainly manage 512, likely 1024 error marks
> per notification group.
>
> 2) We would keep attaching event to mark as currently. As far as I have
> checked the event doesn't actually need a back-ref to sb_mark. It is
> really only used for mark reference taking (and then to get to sb from
> fanotify_handle_error_event() but we can certainly get to sb by easier
> means there). So I would just remove that. What we still need to know in
> fanotify_free_error_event() though is whether the sb_mark is still alive or
> not. If it is alive, we leave the event alone, otherwise we need to free it.
> So we need a mark_alive flag in the error event and then do in ->freeing_mark
> callback something like:
>
> 	if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
> 		struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
>
> ###		/* Maybe we could use mark->lock for this? */
> 		spin_lock(&group->notification_lock);
> 		if (fa_mark->fee_slot) {
> 			if (list_empty(&fa_mark->fee_slot->fae.fse.list)) {
> 				kfree(fa_mark->fee_slot);
> 				fa_mark->fee_slot = NULL;
> 			} else {
> 				fa_mark->fee_slot->mark_alive = 0;
> 			}
> 		}
> 		spin_unlock(&group->notification_lock);
> 	}
>
> And then when queueing and dequeueing event we would have to carefully
> check what is the mark & event state under appropriate lock (because
> ->handle_event() callbacks can see marks on the way to be destroyed as they
> are protected just by SRCU).

Thanks for the review.  That is indeed a subtle race that I hadn't
noticed.

Option 2 is much more straightforward.  And considering the uABI won't
change if we decide to switch to option 1 later, I gave that a try and
should be able to prepare a new version that leaves the error event with a
weak association to the mark, without the back reference, and allows it to
be deleted by the later of dequeue and ->freeing_mark, as you suggested.
Gabriel Krisman Bertazi Sept. 2, 2021, 9:24 p.m. UTC | #4
Gabriel Krisman Bertazi <krisman@collabora.com> writes:

> Jan Kara <jack@suse.cz> writes:
>
>> On Thu 12-08-21 17:40:04, Gabriel Krisman Bertazi wrote:
>>> Error reporting needs to be done in an atomic context.  This patch
>>> introduces a single error slot for superblock marks that report the
>>> FAN_FS_ERROR event, to be used during event submission.
>>> 
>>> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>>> 
>>> ---
>>> Changes v5:
>>>   - Restore mark references. (jan)
>>>   - Tie fee slot to the mark lifetime.(jan)
>>>   - Don't reallocate event(jan)
>>> ---
>>>  fs/notify/fanotify/fanotify.c      | 12 ++++++++++++
>>>  fs/notify/fanotify/fanotify.h      | 13 +++++++++++++
>>>  fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++++++++++++++++--
>>>  3 files changed, 54 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
>>> index ebb6c557cea1..3bf6fd85c634 100644
>>> --- a/fs/notify/fanotify/fanotify.c
>>> +++ b/fs/notify/fanotify/fanotify.c
>>> @@ -855,6 +855,14 @@ static void fanotify_free_name_event(struct fanotify_event *event)
>>>  	kfree(FANOTIFY_NE(event));
>>>  }
>>>  
>>> +static void fanotify_free_error_event(struct fanotify_event *event)
>>> +{
>>> +	/*
>>> +	 * The actual event is tied to a mark, and is released on mark
>>> +	 * removal
>>> +	 */
>>> +}
>>> +
>>
>> I was pondering about the lifetime rules some more. This is also related to
>> patch 16/21 but I'll comment here. When we hold mark ref from queued event,
>> we introduce a subtle race into group destruction logic. There we first
>> evict all marks, wait for them to be destroyed by worker thread after SRCU
>> period expires, and then we remove queued events. When we hold mark
>> reference from an event we break this as mark will exist until the event is
>> dequeued and then group can get freed before we actually free the mark and
>> so mark freeing can hit use-after-free issues.
>>
>> So we'll have to do this a bit differently. I have two options:
>>
>> 1) Instead of preallocating events explicitely like this, we could setup a
>> mempool to allocate error events from for each notification group. We would
>> resize the mempool when adding error mark so that it has as many reserved
>> events as error marks. Upside is error events will be much less special -
>> no special lifetime rules. We'd just need to setup & resize the mempool. We
>> would also have to provide proper merge function for error events (to merge
>> events from the same sb). Also there will be limitation of number of error
>> marks per group because mempools use kmalloc() for an array tracking
>> reserved events. But we could certainly manage 512, likely 1024 error marks
>> per notification group.
>>
>> 2) We would keep attaching event to mark as currently. As far as I have
>> checked the event doesn't actually need a back-ref to sb_mark. It is
>> really only used for mark reference taking (and then to get to sb from
>> fanotify_handle_error_event() but we can certainly get to sb by easier
>> means there). So I would just remove that. What we still need to know in
>> fanotify_free_error_event() though is whether the sb_mark is still alive or
>> not. If it is alive, we leave the event alone, otherwise we need to free it.
>> So we need a mark_alive flag in the error event and then do in ->freeing_mark
>> callback something like:
>>
>> 	if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
>> 		struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
>>
>> ###		/* Maybe we could use mark->lock for this? */
>> 		spin_lock(&group->notification_lock);
>> 		if (fa_mark->fee_slot) {
>> 			if (list_empty(&fa_mark->fee_slot->fae.fse.list)) {
>> 				kfree(fa_mark->fee_slot);
>> 				fa_mark->fee_slot = NULL;
>> 			} else {
>> 				fa_mark->fee_slot->mark_alive = 0;
>> 			}
>> 		}
>> 		spin_unlock(&group->notification_lock);
>> 	}
>>
>> And then when queueing and dequeueing event we would have to carefully
>> check what is the mark & event state under appropriate lock (because
>> ->handle_event() callbacks can see marks on the way to be destroyed as they
>> are protected just by SRCU).
>
> Thanks for the review.  That is indeed a subtle race that I hadn't
> noticed.
>
> Option 2 is much more straightforward.  And considering the uABI won't
> be changed if we decide to change to option 1 later, I gave that a try
> and should be able to prepare a new version that leaves the error event
> with a weak association to the mark, without the back reference, and
> allowing it to be deleted by the latest between dequeue and
> ->freeing_mark, as you suggested.

Actually, I don't think this will work for insertion unless we keep a
bounce buffer for the file_handle: we need to hold the
group->notification_lock to ensure the fee doesn't go away with the mark
(since the event is not yet enqueued), but, as discussed before, we don't
want to hold that lock while generating the FH.

I think the correct way is to have some sort of refcount on the error
event slot.  We could use err_count for that and change the suggestion
above to:

if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
	struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);

	spin_lock(&group->notification_lock);
	if (fa_mark->fee_slot) {
		if (!fa_mark->fee_slot->err_count) {
			kfree(fa_mark->fee_slot);
			fa_mark->fee_slot = NULL;
		} else {
			fa_mark->fee_slot->mark_alive = 0;
		}
	}
	spin_unlock(&group->notification_lock);
}

And insertion would look like this:

static int fanotify_handle_error_event(....) {

	spin_lock(&group->notification_lock);

	if (!mark->fee || mark->fee->err_count++) {
		spin_unlock(&group->notification_lock);
		return 0;
	}

	spin_unlock(&group->notification_lock);

	mark->fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;

	/* ... Write report data to error event ... */

	fanotify_encode_fh(&fee->object_fh, fanotify_encode_fh_len(inode),
 			   NULL, 0);

	fsnotify_add_event(group, &fee->fae.fse, NULL);
   }

Unless you think this is too hack-ish.

To be fair, I think it is hack-ish.  I would add a proper refcount_t
to the error event, and let the mark own a reference to it, which is
dropped when the mark goes away.  Enqueue and dequeue will acquire and
drop references, respectively. In this case, err_count is not
overloaded.

Will it work?
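
A sketch of that refcounted variant (the 'refcnt' field and the helpers
are hypothetical, not part of the posted patch):

	struct fanotify_error_event {
		struct fanotify_event fae;
		refcount_t refcnt;	/* 1 for the mark, +1 while queued */
	};

	static void fanotify_get_error_event(struct fanotify_error_event *fee)
	{
		refcount_inc(&fee->refcnt);
	}

	static void fanotify_put_error_event(struct fanotify_error_event *fee)
	{
		/* Last ref dropped: mark is gone and event was dequeued. */
		if (refcount_dec_and_test(&fee->refcnt))
			kfree(fee);
	}
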
Amir Goldstein Sept. 3, 2021, 4:16 a.m. UTC | #5
On Fri, Sep 3, 2021 at 12:24 AM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> Gabriel Krisman Bertazi <krisman@collabora.com> writes:
>
> > Jan Kara <jack@suse.cz> writes:
> >
> >> On Thu 12-08-21 17:40:04, Gabriel Krisman Bertazi wrote:
> >>> Error reporting needs to be done in an atomic context.  This patch
> >>> introduces a single error slot for superblock marks that report the
> >>> FAN_FS_ERROR event, to be used during event submission.
> >>>
> >>> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> >>>
> >>> ---
> >>> Changes v5:
> >>>   - Restore mark references. (jan)
> >>>   - Tie fee slot to the mark lifetime.(jan)
> >>>   - Don't reallocate event(jan)
> >>> ---
> >>>  fs/notify/fanotify/fanotify.c      | 12 ++++++++++++
> >>>  fs/notify/fanotify/fanotify.h      | 13 +++++++++++++
> >>>  fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++++++++++++++++--
> >>>  3 files changed, 54 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> >>> index ebb6c557cea1..3bf6fd85c634 100644
> >>> --- a/fs/notify/fanotify/fanotify.c
> >>> +++ b/fs/notify/fanotify/fanotify.c
> >>> @@ -855,6 +855,14 @@ static void fanotify_free_name_event(struct fanotify_event *event)
> >>>     kfree(FANOTIFY_NE(event));
> >>>  }
> >>>
> >>> +static void fanotify_free_error_event(struct fanotify_event *event)
> >>> +{
> >>> +   /*
> >>> +    * The actual event is tied to a mark, and is released on mark
> >>> +    * removal
> >>> +    */
> >>> +}
> >>> +
> >>
> >> I was pondering about the lifetime rules some more. This is also related to
> >> patch 16/21 but I'll comment here. When we hold mark ref from queued event,
> >> we introduce a subtle race into group destruction logic. There we first
> >> evict all marks, wait for them to be destroyed by worker thread after SRCU
> >> period expires, and then we remove queued events. When we hold mark
> >> reference from an event we break this as mark will exist until the event is
> >> dequeued and then group can get freed before we actually free the mark and
> >> so mark freeing can hit use-after-free issues.
> >>
> >> So we'll have to do this a bit differently. I have two options:
> >>
> >> 1) Instead of preallocating events explicitely like this, we could setup a
> >> mempool to allocate error events from for each notification group. We would
> >> resize the mempool when adding error mark so that it has as many reserved
> >> events as error marks. Upside is error events will be much less special -
> >> no special lifetime rules. We'd just need to setup & resize the mempool. We
> >> would also have to provide proper merge function for error events (to merge
> >> events from the same sb). Also there will be limitation of number of error
> >> marks per group because mempools use kmalloc() for an array tracking
> >> reserved events. But we could certainly manage 512, likely 1024 error marks
> >> per notification group.
> >>
> >> 2) We would keep attaching event to mark as currently. As far as I have
> >> checked the event doesn't actually need a back-ref to sb_mark. It is
> >> really only used for mark reference taking (and then to get to sb from
> >> fanotify_handle_error_event() but we can certainly get to sb by easier
> >> means there). So I would just remove that. What we still need to know in
> >> fanotify_free_error_event() though is whether the sb_mark is still alive or
> >> not. If it is alive, we leave the event alone, otherwise we need to free it.
> >> So we need a mark_alive flag in the error event and then do in ->freeing_mark
> >> callback something like:
> >>
> >>      if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
> >>              struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
> >>
> >> ###          /* Maybe we could use mark->lock for this? */
> >>              spin_lock(&group->notification_lock);
> >>              if (fa_mark->fee_slot) {
> >>                      if (list_empty(&fa_mark->fee_slot->fae.fse.list)) {
> >>                              kfree(fa_mark->fee_slot);
> >>                              fa_mark->fee_slot = NULL;
> >>                      } else {
> >>                              fa_mark->fee_slot->mark_alive = 0;
> >>                      }
> >>              }
> >>              spin_unlock(&group->notification_lock);
> >>      }
> >>
> >> And then when queueing and dequeueing event we would have to carefully

"would have to carefully..." oh oh! there are not words that I like to
read unless
I have to.
I think that fs error events are rare enough case and not performance sensitive
at all, so we should strive to KISS design principle in this case.

> >> check what is the mark & event state under appropriate lock (because
> >> ->handle_event() callbacks can see marks on the way to be destroyed as they
> >> are protected just by SRCU).
> >
> > Thanks for the review.  That is indeed a subtle race that I hadn't
> > noticed.
> >
> > Option 2 is much more straightforward.  And considering the uABI won't
> > be changed if we decide to change to option 1 later, I gave that a try
> > and should be able to prepare a new version that leaves the error event
> > with a weak association to the mark, without the back reference, and
> > allowing it to be deleted by the latest between dequeue and
> > ->freeing_mark, as you suggested.
>
> Actually, I don't think this will work for insertion unless we keep a
> bounce buffer for the file_handle, because we need to keep the
> group->notification_lock to ensure the fee doesn't go away with the mark
> (since it is not yet enqueued) but, as discussed before, we don't want
> to hold that lock when generating the FH.
>
> I think the correct way is to have some sort of refcount of the error
> event slot.  We could use err_count for that and change the suggestion
> above to:
>
> if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
>         struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
>
>         spin_lock(&group->notification_lock);
>         if (fa_mark->fee_slot) {
>                 if (!fa_mark->fee_slot->err_count) {
>                         kfree(fa_mark->fee_slot);
>                         fa_mark->fee_slot = NULL;
>                 } else {
>                         fa_mark->fee_slot->mark_alive = 0;
>                 }
>         }
>         spin_unlock(&group->notification_lock);
> }
>
> And insertion would look like this:
>
> static int fanotify_handle_error_event(....) {
>
>         spin_lock(&group->notification_lock);
>
>         if (!mark->fee || mark->fee->err_count++) {
>                 spin_unlock(&group->notification_lock);
>                 return 0;
>         }
>
>         spin_unlock(&group->notification_lock);
>
>         mark->fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
>
>         /* ... Write report data to error event ... */
>
>         fanotify_encode_fh(&fee->object_fh, fanotify_encode_fh_len(inode),
>                            NULL, 0);
>
>         fsnotify_add_event(group, &fee->fae.fse, NULL);
>    }
>
> Unless you think this is too hack-ish.
>
> To be fair, I think it is hack-ish.

Actually, I wouldn't mind the hack-ish-ness if it would simplify things,
but I do not see how this is the case here.
I still cannot wrap my head around the semantics, which is a big red light.
First of all, a suggestion should start with the lifetime rules:
- Possible states
- State transition rules

Speaking for myself, I simply cannot review a proposal without these
documented rules.

> I would add a proper refcount_t
> to the error event, and let the mark own a reference to it, which is
> dropped when the mark goes away.  Enqueue and Dequeue will acquire and
> drop references, respectively. In this case, err_count is not
> overloaded.
>
> Will it work?

Maybe, I still don't see the full picture, but if this can get us to a state
where error event handling is simpler, then it's a good idea.
Saving the space of a refcount_t in the error event struct is not important
at all.

But if Jan's option #1 (mempool) brings us to less special-casing
of enqueue/dequeue of error events, then I think that would be
my preference.

In any case, I suggest to wait for Jan's inputs before you continue.

Thanks,
Amir.
Jan Kara Sept. 15, 2021, 10:31 a.m. UTC | #6
On Fri 03-09-21 07:16:33, Amir Goldstein wrote:
> On Fri, Sep 3, 2021 at 12:24 AM Gabriel Krisman Bertazi
> <krisman@collabora.com> wrote:
> > Actually, I don't think this will work for insertion unless we keep a
> > bounce buffer for the file_handle, because we need to keep the
> > group->notification_lock to ensure the fee doesn't go away with the mark
> > (since it is not yet enqueued) but, as discussed before, we don't want
> > to hold that lock when generating the FH.
> >
> > I think the correct way is to have some sort of refcount of the error
> > event slot.  We could use err_count for that and change the suggestion
> > above to:
> >
> > if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
> >         struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
> >
> >         spin_lock(&group->notification_lock);
> >         if (fa_mark->fee_slot) {
> >                 if (!fa_mark->fee_slot->err_count) {
> >                         kfree(fa_mark->fee_slot);
> >                         fa_mark->fee_slot = NULL;
> >                 } else {
> >                         fa_mark->fee_slot->mark_alive = 0;
> >                 }
> >         }
> >         spin_unlock(&group->notification_lock);
> > }
> >
> > And insertion would look like this:
> >
> > static int fanotify_handle_error_event(....) {
> >
> >         spin_lock(&group->notification_lock);
> >
> >         if (!mark->fee || mark->fee->err_count++) {
> >                 spin_unlock(&group->notification_lock);
> >                 return 0;
> >         }
> >
> >         spin_unlock(&group->notification_lock);
> >
> >         mark->fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
> >
> >         /* ... Write report data to error event ... */
> >
> >         fanotify_encode_fh(&fee->object_fh, fanotify_encode_fh_len(inode),
> >                            NULL, 0);
> >
> >         fsnotify_add_event(group, &fee->fae.fse, NULL);
> >    }
> >
> > Unless you think this is too hack-ish.
> >
> > To be fair, I think it is hack-ish.
> 
> Actually, I wouldn't mind the hack-ish-ness if it would simplify things,
> but I do not see how this is the case here.
> I still cannot wrap my head around the semantics, which is a big red light.
> First of all a suggestion should start with the lifetime rules:
> - Possible states
> - State transition rules
> 
> Speaking for myself, I simply cannot review a proposal without these
> documented rules.

Hum, getting back up to speed on this after vacation is tough, which
suggests maybe we've indeed overengineered this :) So let's try to simplify
things.

> > I would add a proper refcount_t
> > to the error event, and let the mark own a reference to it, which is
> > dropped when the mark goes away.  Enqueue and Dequeue will acquire and
> > drop references, respectively. In this case, err_count is not
> > overloaded.
> >
> > Will it work?
> 
> Maybe, I still don't see the full picture, but if this can get us to a state
> where error events handling is simpler then it's a good idea.
> Saving the space of refcount_t in error event struct is not important at all.
> 
> But if Jan's option #1 (mempool) brings us to less special casing
> of enqueue/dequeue of error events, then I think that would be
> my preference.

Yes, I think mempools would result in simpler code overall (the
complexity of recycling events would be handled by the mempool for us).
Maybe we would not even need to play tricks with mempool resizing? We could
just make sure it has a couple of events reserved, and if it ever happens
that mempool_alloc() cannot give us an event, we'd report queue overflow
(like we already do for other event types when that happens). I think we
could require that callers generating error events are in a context where
GFP_NOFS allocation is OK - this should be an achievable target for
filesystems, and allocation failures should be rare with such a mask.
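
A sketch of the resulting submission path ('error_events_pool' is the
hypothetical per-group mempool from above; the overflow fallback mirrors
what other event types do on allocation failure):

	fee = mempool_alloc(group->error_events_pool, GFP_NOFS);
	if (!fee) {
		/* No reserved event left: report it as queue overflow. */
		fsnotify_queue_overflow(group);
		return 0;
	}
	fanotify_init_event(&fee->fae, 0, FS_ERROR);
	/* ... fill in the error report and encode the fh ... */
	fsnotify_add_event(group, &fee->fae.fse, NULL);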

									Honza

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index ebb6c557cea1..3bf6fd85c634 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -855,6 +855,14 @@  static void fanotify_free_name_event(struct fanotify_event *event)
 	kfree(FANOTIFY_NE(event));
 }
 
+static void fanotify_free_error_event(struct fanotify_event *event)
+{
+	/*
+	 * The actual event is tied to a mark, and is released on mark
+	 * removal
+	 */
+}
+
 static void fanotify_free_event(struct fsnotify_event *fsn_event)
 {
 	struct fanotify_event *event;
@@ -877,6 +885,9 @@  static void fanotify_free_event(struct fsnotify_event *fsn_event)
 	case FANOTIFY_EVENT_TYPE_OVERFLOW:
 		kfree(event);
 		break;
+	case FANOTIFY_EVENT_TYPE_FS_ERROR:
+		fanotify_free_error_event(event);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 	}
@@ -894,6 +905,7 @@  static void fanotify_free_mark(struct fsnotify_mark *mark)
 	if (mark->flags & FANOTIFY_MARK_FLAG_SB_MARK) {
 		struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark);
 
+		kfree(fa_mark->fee_slot);
 		kmem_cache_free(fanotify_sb_mark_cache, fa_mark);
 	} else {
 		kmem_cache_free(fanotify_mark_cache, mark);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index b3ab620822c2..3f03333df32f 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -139,6 +139,7 @@  enum fanotify_mark_bits {
 
 struct fanotify_sb_mark {
 	struct fsnotify_mark fsn_mark;
+	struct fanotify_error_event *fee_slot;
 };
 
 static inline
@@ -161,6 +162,7 @@  enum fanotify_event_type {
 	FANOTIFY_EVENT_TYPE_PATH,
 	FANOTIFY_EVENT_TYPE_PATH_PERM,
 	FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
+	FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
 	__FANOTIFY_EVENT_TYPE_NUM
 };
 
@@ -216,6 +218,17 @@  FANOTIFY_NE(struct fanotify_event *event)
 	return container_of(event, struct fanotify_name_event, fae);
 }
 
+struct fanotify_error_event {
+	struct fanotify_event fae;
+	struct fanotify_sb_mark *sb_mark; /* Back reference to the mark. */
+};
+
+static inline struct fanotify_error_event *
+FANOTIFY_EE(struct fanotify_event *event)
+{
+	return container_of(event, struct fanotify_error_event, fae);
+}
+
 static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
 {
 	if (event->type == FANOTIFY_EVENT_TYPE_FID)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 54107f1533d5..b77030386d7f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -947,8 +947,10 @@  static struct fsnotify_mark *fanotify_alloc_mark(struct fsnotify_group *group,
 
 	fsnotify_init_mark(mark, group);
 
-	if (type == FSNOTIFY_OBJ_TYPE_SB)
+	if (type == FSNOTIFY_OBJ_TYPE_SB) {
 		mark->flags |= FANOTIFY_MARK_FLAG_SB_MARK;
+		sb_mark->fee_slot = NULL;
+	}
 
 	return mark;
 }
@@ -999,6 +1001,7 @@  static int fanotify_add_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
+	int ret = 0;
 
 	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_mark(connp, group);
@@ -1009,13 +1012,37 @@  static int fanotify_add_mark(struct fsnotify_group *group,
 			return PTR_ERR(fsn_mark);
 		}
 	}
+
+	/*
+	 * Error events are allocated per super-block mark only if
+	 * strictly needed (i.e. FAN_FS_ERROR was requested).
+	 */
+	if (type == FSNOTIFY_OBJ_TYPE_SB && !(flags & FAN_MARK_IGNORED_MASK) &&
+	    (mask & FAN_FS_ERROR)) {
+		struct fanotify_sb_mark *sb_mark = FANOTIFY_SB_MARK(fsn_mark);
+
+		if (!sb_mark->fee_slot) {
+			struct fanotify_error_event *fee =
+				kzalloc(sizeof(*fee), GFP_KERNEL_ACCOUNT);
+			if (!fee) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			fanotify_init_event(&fee->fae, 0, FS_ERROR);
+			fee->sb_mark = sb_mark;
+			sb_mark->fee_slot = fee;
+		}
+	}
+
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
 		fsnotify_recalc_mask(fsn_mark->connector);
+
+out:
 	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
-	return 0;
+	return ret;
 }
 
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,