
[2/7] fsnotify: support hashed notification queue

Message ID 20210202162010.305971-3-amir73il@gmail.com (mailing list archive)
State New, archived
Series Performance improvement for fanotify merge

Commit Message

Amir Goldstein Feb. 2, 2021, 4:20 p.m. UTC
In order to improve event merge performance, support sharding the
notification queue to several notification lists, hashed by an event
merge key.

The fanotify event merge key is the object id reduced to 32bit hash.

At the moment, both inotify and fanotify still use a single notification
list.

At the moment, reading events from the hashed queue is not by event
insertion order.  A non-empty list is read until it is empty.

The max events limit is accounted for total events in all lists.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify.c      |   5 +-
 fs/notify/fanotify/fanotify.h      |   4 +-
 fs/notify/fanotify/fanotify_user.c |  14 ++-
 fs/notify/group.c                  |  37 ++++++--
 fs/notify/inotify/inotify_user.c   |  17 ++--
 fs/notify/notification.c           | 131 ++++++++++++++++++++++++-----
 include/linux/fsnotify_backend.h   |  63 +++++++++++---
 7 files changed, 217 insertions(+), 54 deletions(-)
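
For illustration, a minimal standalone sketch (not part of the patch) of the bucket
selection this series introduces; the 7-bit/128-bucket split and the key values are
assumptions for the example. Events with equal merge keys always map to the same
list, so merge only has to scan one bucket:

#include <stdio.h>

#define Q_HASH_BITS	7			/* e.g. 128 notification lists */
#define MAX_BUCKET	((1u << Q_HASH_BITS) - 1)

/* Mirrors fsnotify_event_bucket() below: use the high bits of the 32-bit key */
static unsigned int event_bucket(unsigned int key)
{
	return (key >> (32 - Q_HASH_BITS)) & MAX_BUCKET;
}

int main(void)
{
	unsigned int keys[] = { 0x8f3a91c2, 0x8f3a91c2, 0x1b00d5e7 };
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("key=%#x -> bucket %u\n", keys[i], event_bucket(keys[i]));
	/* Equal keys land in the same bucket, so they remain merge candidates. */
	return 0;
}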

Comments

Jan Kara Feb. 16, 2021, 3:02 p.m. UTC | #1
On Tue 02-02-21 18:20:05, Amir Goldstein wrote:
> In order to improve event merge performance, support sharding the
> notification queue to several notification lists, hashed by an event
> merge key.
> 
> The fanotify event merge key is the object id reduced to 32bit hash.
> 
> At the moment, both inotify and fanotify still use a single notification
> list.
> 
> At the moment, reading events from the hashed queue is not by event
> insertion order.  A non-empty list is read until it is empty.
> 
> The max events limit is accounted for total events in all lists.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

Style nit: Please try to stay within 80 columns...

Also I think this change would be more comprehensible if it was merged with
the following patch 3/8. Having code with TODOs is kind of awkward and
makes it more difficult to verify the result is actually correct.

> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 8ff27d92d32c..dee12d927f8d 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -635,6 +635,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
>  {
>  	struct fsnotify_group *group;
>  	struct fsnotify_event *fsn_event;
> +	struct list_head *list;
>  	void __user *p;
>  	int ret = -ENOTTY;
>  	size_t send_len = 0;
> @@ -646,8 +647,15 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
>  	switch (cmd) {
>  	case FIONREAD:
>  		spin_lock(&group->notification_lock);
> -		list_for_each_entry(fsn_event, &group->notification_list, list)
> -			send_len += FAN_EVENT_METADATA_LEN;
> +		list = fsnotify_first_notification_list(group);
> +		/*
> +		 * With multi queue, send_len will be a lower bound
> +		 * on total events size.
> +		 */
> +		if (list) {
> +			list_for_each_entry(fsn_event, list, list)
> +				send_len += FAN_EVENT_METADATA_LEN;
> +		}

IMO we should just walk all the lists here. Otherwise reported length would
be just odd. That being said the returned value already is odd because we
don't properly account for different event sizes (unrelated problem). So if
we want to keep it simple for now, we can just return group->num_events *
FAN_EVENT_METADATA_LEN, can't we?
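
A minimal sketch of the simplification suggested here, assuming the group->num_events
counter introduced by this patch (illustration only, not compiled):

	case FIONREAD:
		spin_lock(&group->notification_lock);
		/*
		 * Every fanotify event contributes FAN_EVENT_METADATA_LEN,
		 * so the per-group counter is enough and no list walk
		 * (single or hashed) is needed.
		 */
		send_len = group->num_events * FAN_EVENT_METADATA_LEN;
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;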

> diff --git a/fs/notify/group.c b/fs/notify/group.c
> index ffd723ffe46d..b41ed68f9ff9 100644
> --- a/fs/notify/group.c
> +++ b/fs/notify/group.c
> @@ -111,12 +116,20 @@ void fsnotify_put_group(struct fsnotify_group *group)
>  }
>  EXPORT_SYMBOL_GPL(fsnotify_put_group);
>  
> -static struct fsnotify_group *__fsnotify_alloc_group(
> +static struct fsnotify_group *__fsnotify_alloc_group(unsigned int q_hash_bits,
>  				const struct fsnotify_ops *ops, gfp_t gfp)
>  {
>  	struct fsnotify_group *group;
> +	int i;
> +
> +#ifdef FSNOTIFY_HASHED_QUEUE
> +	if (WARN_ON_ONCE(q_hash_bits > FSNOTIFY_HASHED_QUEUE_MAX_BITS))
> +		q_hash_bits = FSNOTIFY_HASHED_QUEUE_MAX_BITS;
> +#else
> +	q_hash_bits = 0;
> +#endif

Why the check for q_hash_bits? We have in-kernel users only so I would not
be that paranoid :) Maybe I'd just let the group specify whether it wants
hashed queue or not and for hashed queues always set number of buckets to
128. So far we don't need anything more flexible and if we ever do, it's
easy enough to change the in-kernel API...

Also, honestly, I find these ifdefs ugly. I'd just leave hashed queues
unconditionally compiled in.
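
A rough sketch of the simpler in-kernel API suggested above; the bool parameter and
the fixed bucket count are made-up names for illustration, not what the patch
implements:

struct fsnotify_group *fsnotify_alloc_user_group(bool hashed_queue,
						 const struct fsnotify_ops *ops)
{
	/* Fixed implementation detail: 128 buckets whenever hashing is requested */
	unsigned int q_hash_bits = hashed_queue ? 7 : 0;

	return __fsnotify_alloc_group(q_hash_bits, ops, GFP_KERNEL_ACCOUNT);
}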

> -	group = kzalloc(sizeof(struct fsnotify_group), gfp);
> +	group = kzalloc(fsnotify_group_size(q_hash_bits), gfp);
>  	if (!group)
>  		return ERR_PTR(-ENOMEM);
>  
> @@ -126,8 +139,12 @@ static struct fsnotify_group *__fsnotify_alloc_group(
>  	atomic_set(&group->user_waits, 0);
>  
>  	spin_lock_init(&group->notification_lock);
> -	INIT_LIST_HEAD(&group->notification_list);
>  	init_waitqueue_head(&group->notification_waitq);
> +	/* Initialize one or more notification lists */
> +	group->q_hash_bits = q_hash_bits;
> +	group->max_bucket = (1 << q_hash_bits) - 1;
> +	for (i = 0; i <= group->max_bucket; i++)
> +		INIT_LIST_HEAD(&group->notification_list[i]);
>  	group->max_events = UINT_MAX;
>  
>  	mutex_init(&group->mark_mutex);
> @@ -139,20 +156,24 @@ static struct fsnotify_group *__fsnotify_alloc_group(
>  }
>  
>  /*
> - * Create a new fsnotify_group and hold a reference for the group returned.
> + * Create a new fsnotify_group with no events queue.

How come "no events queue"? There will always be at least one queue...

> + * Hold a reference for the group returned.
>   */
>  struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
>  {
> -	return __fsnotify_alloc_group(ops, GFP_KERNEL);
> +	return __fsnotify_alloc_group(0, ops, GFP_KERNEL);
>  }
>  EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
>  
>  /*
> - * Create a new fsnotify_group and hold a reference for the group returned.
> + * Create a new fsnotify_group with an events queue.
> + * If @q_hash_bits > 0, the queue is sharded into several notification lists.
> + * Hold a reference for the group returned.
>   */
> -struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
> +struct fsnotify_group *fsnotify_alloc_user_group(unsigned int q_hash_bits,
> +						 const struct fsnotify_ops *ops)
>  {
> -	return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
> +	return __fsnotify_alloc_group(q_hash_bits, ops, GFP_KERNEL_ACCOUNT);
>  }
>  EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
>  
> diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> index d8830be60a9b..1c476b9485dc 100644
> --- a/fs/notify/inotify/inotify_user.c
> +++ b/fs/notify/inotify/inotify_user.c
> @@ -288,6 +288,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
>  {
>  	struct fsnotify_group *group;
>  	struct fsnotify_event *fsn_event;
> +	struct list_head *list;
>  	void __user *p;
>  	int ret = -ENOTTY;
>  	size_t send_len = 0;
> @@ -300,10 +301,16 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
>  	switch (cmd) {
>  	case FIONREAD:
>  		spin_lock(&group->notification_lock);
> -		list_for_each_entry(fsn_event, &group->notification_list,
> -				    list) {
> -			send_len += sizeof(struct inotify_event);
> -			send_len += round_event_name_len(fsn_event);
> +		list = fsnotify_first_notification_list(group);
> +		/*
> +		 * With multi queue, send_len will be a lower bound
> +		 * on total events size.
> +		 */
> +		if (list) {
> +			list_for_each_entry(fsn_event, list, list) {
> +				send_len += sizeof(struct inotify_event);
> +				send_len += round_event_name_len(fsn_event);
> +			}

As I write below IMO we should enable hashed queues also for inotify (is
there good reason not to?) and here we should just walk through all the
lists.
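
A sketch of the full walk suggested here (fragment, with the loop counter declared
in the surrounding function), assuming inotify used the hashed lists; later in the
thread inotify ends up keeping a single list, so this is only for illustration:

	case FIONREAD:
		spin_lock(&group->notification_lock);
		for (i = 0; i <= group->max_bucket; i++) {
			list_for_each_entry(fsn_event,
					    &group->notification_list[i], list) {
				send_len += sizeof(struct inotify_event);
				send_len += round_event_name_len(fsn_event);
			}
		}
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;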

>  		}
>  		spin_unlock(&group->notification_lock);
>  		ret = put_user(send_len, (int __user *) p);
> @@ -631,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
>  	struct fsnotify_group *group;
>  	struct inotify_event_info *oevent;
>  
> -	group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
> +	group = fsnotify_alloc_user_group(0, &inotify_fsnotify_ops);
>  	if (IS_ERR(group))
>  		return group;
>  
> diff --git a/fs/notify/notification.c b/fs/notify/notification.c
> index fcac2d72cbf5..58c8f6c1be1b 100644
> --- a/fs/notify/notification.c
> +++ b/fs/notify/notification.c
...
> @@ -74,10 +67,75 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
>  	group->ops->free_event(event);
>  }
>  
> +/* Check and fix inconsistencies in hashed queue */
> +static void fsnotify_queue_check(struct fsnotify_group *group)
> +{
> +#ifdef FSNOTIFY_HASHED_QUEUE
> +	struct list_head *list;
> +	int i, nbuckets = 0;
> +	bool first_empty, last_empty;
> +
> +	assert_spin_locked(&group->notification_lock);
> +
> +	pr_debug("%s: group=%p events: num=%u max=%u buckets: first=%u last=%u max=%u\n",
> +		 __func__, group, group->num_events, group->max_events,
> +		 group->first_bucket, group->last_bucket, group->max_bucket);
> +
> +	if (fsnotify_notify_queue_is_empty(group))
> +		return;
> +
> +	first_empty = list_empty(&group->notification_list[group->first_bucket]);
> +	last_empty = list_empty(&group->notification_list[group->last_bucket]);
> +
> +	list = &group->notification_list[0];
> +	for (i = 0; i <= group->max_bucket; i++, list++) {
> +		if (list_empty(list))
> +			continue;
> +		if (nbuckets++)
> +			continue;
> +		if (first_empty)
> +			group->first_bucket = i;
> +		if (last_empty)
> +			group->last_bucket = i;
> +	}
> +
> +	pr_debug("%s: %u non-empty buckets\n", __func__, nbuckets);
> +
> +	/* All buckets are empty, but non-zero num_events? */
> +	if (WARN_ON_ONCE(!nbuckets && group->num_events))
> +		group->num_events = 0;
> +#endif

Hum, what is this function about? I understand you might want to have this
around for debugging when developing the patch but is there a legitimate
reason for this in production?

I understand current patch uses this function for searching for next
non-empty list but after this patch is merged with the next one, is there
still a need for this?

> @@ -147,17 +228,21 @@ void fsnotify_remove_queued_event(struct fsnotify_group *group,
>  struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
>  {
>  	struct fsnotify_event *event;
> +	struct list_head *list;
>  
>  	assert_spin_locked(&group->notification_lock);
>  
> -	if (fsnotify_notify_queue_is_empty(group))
> +	list = fsnotify_first_notification_list(group);
> +	if (!list)
>  		return NULL;
>  
> -	pr_debug("%s: group=%p\n", __func__, group);
> +	pr_debug("%s: group=%p bucket=%u\n", __func__, group, group->first_bucket);
>  
> -	event = list_first_entry(&group->notification_list,
> -				 struct fsnotify_event, list);
> +	event = list_first_entry(list, struct fsnotify_event, list);

Perhaps the above could just reuse fsnotify_peek_first_event()? It is now
complex enough to be worth it I guess.
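
A sketch of the refactor suggested here, reusing fsnotify_peek_first_event() so the
bucket lookup lives in one place (illustration only):

struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
{
	struct fsnotify_event *event;

	event = fsnotify_peek_first_event(group);
	if (!event)
		return NULL;

	pr_debug("%s: group=%p bucket=%u\n", __func__, group,
		 group->first_bucket);

	fsnotify_remove_queued_event(group, event);
	return event;
}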

>  	fsnotify_remove_queued_event(group, event);
> +	/*
> +	 * TODO: update group->first_bucket to next_bucket in first event.
> +	 */
>  	return event;
>  }
>  
> @@ -167,13 +252,15 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
>   */
>  struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
>  {
> +	struct list_head *list;
> +
>  	assert_spin_locked(&group->notification_lock);
>  
> -	if (fsnotify_notify_queue_is_empty(group))
> +	list = fsnotify_first_notification_list(group);
> +	if (!list)
>  		return NULL;
>  
> -	return list_first_entry(&group->notification_list,
> -				struct fsnotify_event, list);
> +	return list_first_entry(list, struct fsnotify_event, list);
>  }
>  
>  /*
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index e5409b83e731..b2a80bc00108 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -160,6 +160,11 @@ struct fsnotify_ops {
>  	void (*free_mark)(struct fsnotify_mark *mark);
>  };
>  
> +#ifdef CONFIG_FANOTIFY
> +#define FSNOTIFY_HASHED_QUEUE
> +#define FSNOTIFY_HASHED_QUEUE_MAX_BITS 8
> +#endif
> +

Would not inotify benefit from this work as well? Also I'd just compile in
hashed queues unconditionally. The ifdefs are ugly and IMHO not worth it.

...

> +static inline size_t fsnotify_group_size(unsigned int q_hash_bits)
> +{
> +	return sizeof(struct fsnotify_group) + (sizeof(struct list_head) << q_hash_bits);
> +}
> +
> +static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
> +						 struct fsnotify_event *event)
> +{
> +	/* High bits are better for hash */
> +	return (event->key >> (32 - group->q_hash_bits)) & group->max_bucket;
> +}

Why not use hash_32() here? IMHO better than just stripping bits...
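
For comparison, a sketch of the hash_32() variant suggested here (hash_32() from
<linux/hash.h> returns a value in [0, 1 << bits)); it folds all bits of the key
instead of keeping only the top q_hash_bits, and like the masking version it
assumes q_hash_bits > 0:

static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
						 struct fsnotify_event *event)
{
	return hash_32(event->key, group->q_hash_bits);
}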

> +
> +static inline struct list_head *fsnotify_event_notification_list(
> +						struct fsnotify_group *group,
> +						struct fsnotify_event *event)
> +{
> +	return &group->notification_list[fsnotify_event_bucket(group, event)];
> +}
> +

Any reason for this to be in a header? Will it ever have other user than
fsnotify_add_event()?

								Honza
Amir Goldstein Feb. 17, 2021, 12:33 p.m. UTC | #2
On Tue, Feb 16, 2021 at 5:02 PM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 02-02-21 18:20:05, Amir Goldstein wrote:
> > In order to improve event merge performance, support sharding the
> > notification queue to several notification lists, hashed by an event
> > merge key.
> >
> > The fanotify event merge key is the object id reduced to 32bit hash.
> >
> > At the moment, both inotify and fanotify still use a single notification
> > list.
> >
> > At the moment, reading events from the hashed queue is not by event
> > insertion order.  A non-empty list is read until it is empty.
> >
> > The max events limit is accounted for total events in all lists.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>
> Style nit: Please try to stay within 80 columns...
>
> Also I think this change would be more comprehensible if it was merged with
> the following patch 3/8. Having code with TODOs is kind of awkward and
> makes it more difficult to verify the result is actually correct.
>

Ok.

> > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> > index 8ff27d92d32c..dee12d927f8d 100644
> > --- a/fs/notify/fanotify/fanotify_user.c
> > +++ b/fs/notify/fanotify/fanotify_user.c
> > @@ -635,6 +635,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
> >  {
> >       struct fsnotify_group *group;
> >       struct fsnotify_event *fsn_event;
> > +     struct list_head *list;
> >       void __user *p;
> >       int ret = -ENOTTY;
> >       size_t send_len = 0;
> > @@ -646,8 +647,15 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
> >       switch (cmd) {
> >       case FIONREAD:
> >               spin_lock(&group->notification_lock);
> > -             list_for_each_entry(fsn_event, &group->notification_list, list)
> > -                     send_len += FAN_EVENT_METADATA_LEN;
> > +             list = fsnotify_first_notification_list(group);
> > +             /*
> > +              * With multi queue, send_len will be a lower bound
> > +              * on total events size.
> > +              */
> > +             if (list) {
> > +                     list_for_each_entry(fsn_event, list, list)
> > +                             send_len += FAN_EVENT_METADATA_LEN;
> > +             }
>
> IMO we should just walk all the lists here. Otherwise reported length would
> be just odd. That being said the returned value already is odd because we
> don't properly account for different event sizes (unrelated problem). So if
> we want to keep it simple for now, we can just return group->num_events *
> FAN_EVENT_METADATA_LEN, can't we?

Sure. Makes sense.

>
> > diff --git a/fs/notify/group.c b/fs/notify/group.c
> > index ffd723ffe46d..b41ed68f9ff9 100644
> > --- a/fs/notify/group.c
> > +++ b/fs/notify/group.c
> > @@ -111,12 +116,20 @@ void fsnotify_put_group(struct fsnotify_group *group)
> >  }
> >  EXPORT_SYMBOL_GPL(fsnotify_put_group);
> >
> > -static struct fsnotify_group *__fsnotify_alloc_group(
> > +static struct fsnotify_group *__fsnotify_alloc_group(unsigned int q_hash_bits,
> >                               const struct fsnotify_ops *ops, gfp_t gfp)
> >  {
> >       struct fsnotify_group *group;
> > +     int i;
> > +
> > +#ifdef FSNOTIFY_HASHED_QUEUE
> > +     if (WARN_ON_ONCE(q_hash_bits > FSNOTIFY_HASHED_QUEUE_MAX_BITS))
> > +             q_hash_bits = FSNOTIFY_HASHED_QUEUE_MAX_BITS;
> > +#else
> > +     q_hash_bits = 0;
> > +#endif
>
> Why the check for q_hash_bits? We have in-kernel users only so I would not
> be that paranoid :) Maybe I'd just let the group specify whether it wants
> hashed queue or not and for hashed queues always set number of buckets to
> 128. So far we don't need anything more flexible and if we ever do, it's
> easy enough to change the in-kernel API...

Ok.

>
> Also, honestly, I find these ifdefs ugly. I'd just leave hashed queues
> unconditionally compiled in.
>

Ok.

> > -     group = kzalloc(sizeof(struct fsnotify_group), gfp);
> > +     group = kzalloc(fsnotify_group_size(q_hash_bits), gfp);
> >       if (!group)
> >               return ERR_PTR(-ENOMEM);
> >
> > @@ -126,8 +139,12 @@ static struct fsnotify_group *__fsnotify_alloc_group(
> >       atomic_set(&group->user_waits, 0);
> >
> >       spin_lock_init(&group->notification_lock);
> > -     INIT_LIST_HEAD(&group->notification_list);
> >       init_waitqueue_head(&group->notification_waitq);
> > +     /* Initialize one or more notification lists */
> > +     group->q_hash_bits = q_hash_bits;
> > +     group->max_bucket = (1 << q_hash_bits) - 1;
> > +     for (i = 0; i <= group->max_bucket; i++)
> > +             INIT_LIST_HEAD(&group->notification_list[i]);
> >       group->max_events = UINT_MAX;
> >
> >       mutex_init(&group->mark_mutex);
> > @@ -139,20 +156,24 @@ static struct fsnotify_group *__fsnotify_alloc_group(
> >  }
> >
> >  /*
> > - * Create a new fsnotify_group and hold a reference for the group returned.
> > + * Create a new fsnotify_group with no events queue.
>
> How come "no events queue"? There will always be at least one queue...
>

Bad phrasing. The backends that use fsnotify_alloc_group() do not
use the events queue.
I considered not allocating any event queue for them but decided it is
not worth the risk of introducing bugs.

> > + * Hold a reference for the group returned.
> >   */
> >  struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> >  {
> > -     return __fsnotify_alloc_group(ops, GFP_KERNEL);
> > +     return __fsnotify_alloc_group(0, ops, GFP_KERNEL);
> >  }
> >  EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
> >
> >  /*
> > - * Create a new fsnotify_group and hold a reference for the group returned.
> > + * Create a new fsnotify_group with an events queue.
> > + * If @q_hash_bits > 0, the queue is sharded into several notification lists.
> > + * Hold a reference for the group returned.
> >   */
> > -struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
> > +struct fsnotify_group *fsnotify_alloc_user_group(unsigned int q_hash_bits,
> > +                                              const struct fsnotify_ops *ops)
> >  {
> > -     return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
> > +     return __fsnotify_alloc_group(q_hash_bits, ops, GFP_KERNEL_ACCOUNT);
> >  }
> >  EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
> >
> > diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> > index d8830be60a9b..1c476b9485dc 100644
> > --- a/fs/notify/inotify/inotify_user.c
> > +++ b/fs/notify/inotify/inotify_user.c
> > @@ -288,6 +288,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
> >  {
> >       struct fsnotify_group *group;
> >       struct fsnotify_event *fsn_event;
> > +     struct list_head *list;
> >       void __user *p;
> >       int ret = -ENOTTY;
> >       size_t send_len = 0;
> > @@ -300,10 +301,16 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
> >       switch (cmd) {
> >       case FIONREAD:
> >               spin_lock(&group->notification_lock);
> > -             list_for_each_entry(fsn_event, &group->notification_list,
> > -                                 list) {
> > -                     send_len += sizeof(struct inotify_event);
> > -                     send_len += round_event_name_len(fsn_event);
> > +             list = fsnotify_first_notification_list(group);
> > +             /*
> > +              * With multi queue, send_len will be a lower bound
> > +              * on total events size.
> > +              */
> > +             if (list) {
> > +                     list_for_each_entry(fsn_event, list, list) {
> > +                             send_len += sizeof(struct inotify_event);
> > +                             send_len += round_event_name_len(fsn_event);
> > +                     }
>
> As I write below IMO we should enable hashed queues also for inotify (is
> there good reason not to?)

I see your perception of inotify_merge() is the same as mine was
when I wrote a patch to support hashed queues for inotify.
It is only after that I realized that inotify_merge() only ever merges
with the last event and I dropped that patch.
I see no reason to change this long time behavior.
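
For reference, the gist of inotify_merge() at this point (simplified paraphrase,
not verbatim): it only compares the new event against the last queued event, so
sharding the list would not shorten any search:

static int inotify_merge(struct list_head *list, struct fsnotify_event *event)
{
	struct fsnotify_event *last_event;

	last_event = list_last_entry(list, struct fsnotify_event, list);
	return event_compare(last_event, event);
}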

> and here we should just walk through all the lists.

Ok.

>
> >               }
> >               spin_unlock(&group->notification_lock);
> >               ret = put_user(send_len, (int __user *) p);
> > @@ -631,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
> >       struct fsnotify_group *group;
> >       struct inotify_event_info *oevent;
> >
> > -     group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
> > +     group = fsnotify_alloc_user_group(0, &inotify_fsnotify_ops);
> >       if (IS_ERR(group))
> >               return group;
> >
> > diff --git a/fs/notify/notification.c b/fs/notify/notification.c
> > index fcac2d72cbf5..58c8f6c1be1b 100644
> > --- a/fs/notify/notification.c
> > +++ b/fs/notify/notification.c
> ...
> > @@ -74,10 +67,75 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
> >       group->ops->free_event(event);
> >  }
> >
> > +/* Check and fix inconsistencies in hashed queue */
> > +static void fsnotify_queue_check(struct fsnotify_group *group)
> > +{
> > +#ifdef FSNOTIFY_HASHED_QUEUE
> > +     struct list_head *list;
> > +     int i, nbuckets = 0;
> > +     bool first_empty, last_empty;
> > +
> > +     assert_spin_locked(&group->notification_lock);
> > +
> > +     pr_debug("%s: group=%p events: num=%u max=%u buckets: first=%u last=%u max=%u\n",
> > +              __func__, group, group->num_events, group->max_events,
> > +              group->first_bucket, group->last_bucket, group->max_bucket);
> > +
> > +     if (fsnotify_notify_queue_is_empty(group))
> > +             return;
> > +
> > +     first_empty = list_empty(&group->notification_list[group->first_bucket]);
> > +     last_empty = list_empty(&group->notification_list[group->last_bucket]);
> > +
> > +     list = &group->notification_list[0];
> > +     for (i = 0; i <= group->max_bucket; i++, list++) {
> > +             if (list_empty(list))
> > +                     continue;
> > +             if (nbuckets++)
> > +                     continue;
> > +             if (first_empty)
> > +                     group->first_bucket = i;
> > +             if (last_empty)
> > +                     group->last_bucket = i;
> > +     }
> > +
> > +     pr_debug("%s: %u non-empty buckets\n", __func__, nbuckets);
> > +
> > +     /* All buckets are empty, but non-zero num_events? */
> > +     if (WARN_ON_ONCE(!nbuckets && group->num_events))
> > +             group->num_events = 0;
> > +#endif
>
> Hum, what is this function about? I understand you might want to have this
> around for debugging when developing the patch but is there a legitimate
> reason for this in production?
>
> I understand current patch uses this function for searching for next
> non-empty list but after this patch is merged with the next one, is there
> still a need for this?

Only needed for debugging (added by last patch).
I wanted to make sure that hash is balanced.
I'll keep this code around only for the debug print on queue overflow.

>
> > @@ -147,17 +228,21 @@ void fsnotify_remove_queued_event(struct fsnotify_group *group,
> >  struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
> >  {
> >       struct fsnotify_event *event;
> > +     struct list_head *list;
> >
> >       assert_spin_locked(&group->notification_lock);
> >
> > -     if (fsnotify_notify_queue_is_empty(group))
> > +     list = fsnotify_first_notification_list(group);
> > +     if (!list)
> >               return NULL;
> >
> > -     pr_debug("%s: group=%p\n", __func__, group);
> > +     pr_debug("%s: group=%p bucket=%u\n", __func__, group, group->first_bucket);
> >
> > -     event = list_first_entry(&group->notification_list,
> > -                              struct fsnotify_event, list);
> > +     event = list_first_entry(list, struct fsnotify_event, list);
>
> Perhaps the above could just reuse fsnotify_peek_first_event()? It is now
> complex enough to be worth it I guess.
>
> >       fsnotify_remove_queued_event(group, event);
> > +     /*
> > +      * TODO: update group->first_bucket to next_bucket in first event.
> > +      */
> >       return event;
> >  }
> >
> > @@ -167,13 +252,15 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
> >   */
> >  struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
> >  {
> > +     struct list_head *list;
> > +
> >       assert_spin_locked(&group->notification_lock);
> >
> > -     if (fsnotify_notify_queue_is_empty(group))
> > +     list = fsnotify_first_notification_list(group);
> > +     if (!list)
> >               return NULL;
> >
> > -     return list_first_entry(&group->notification_list,
> > -                             struct fsnotify_event, list);
> > +     return list_first_entry(list, struct fsnotify_event, list);
> >  }
> >
> >  /*
> > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> > index e5409b83e731..b2a80bc00108 100644
> > --- a/include/linux/fsnotify_backend.h
> > +++ b/include/linux/fsnotify_backend.h
> > @@ -160,6 +160,11 @@ struct fsnotify_ops {
> >       void (*free_mark)(struct fsnotify_mark *mark);
> >  };
> >
> > +#ifdef CONFIG_FANOTIFY
> > +#define FSNOTIFY_HASHED_QUEUE
> > +#define FSNOTIFY_HASHED_QUEUE_MAX_BITS 8
> > +#endif
> > +
>
> Would not inotify benefit from this work as well?

See above. inotify is not doing linear search.

> Also I'd just compile in
> hashed queues unconditionally. The ifdefs are ugly and IMHO not worth it.
>

OK.

> ...
>
> > +static inline size_t fsnotify_group_size(unsigned int q_hash_bits)
> > +{
> > +     return sizeof(struct fsnotify_group) + (sizeof(struct list_head) << q_hash_bits);
> > +}
> > +
> > +static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
> > +                                              struct fsnotify_event *event)
> > +{
> > +     /* High bits are better for hash */
> > +     return (event->key >> (32 - group->q_hash_bits)) & group->max_bucket;
> > +}
>
> Why not use hash_32() here? IMHO better than just stripping bits...

See hash_ptr(). There is a reason to use the highest bits.

>
> > +
> > +static inline struct list_head *fsnotify_event_notification_list(
> > +                                             struct fsnotify_group *group,
> > +                                             struct fsnotify_event *event)
> > +{
> > +     return &group->notification_list[fsnotify_event_bucket(group, event)];
> > +}
> > +
>
> Any reason for this to be in a header? Will it ever have other user than
> fsnotify_add_event()?

I guess not. I went through several alternative patches.
It may have made sense in another variant.

Thanks,
Amir.
Jan Kara Feb. 17, 2021, 1:48 p.m. UTC | #3
On Wed 17-02-21 14:33:46, Amir Goldstein wrote:
> On Tue, Feb 16, 2021 at 5:02 PM Jan Kara <jack@suse.cz> wrote:
> > > @@ -300,10 +301,16 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
> > >       switch (cmd) {
> > >       case FIONREAD:
> > >               spin_lock(&group->notification_lock);
> > > -             list_for_each_entry(fsn_event, &group->notification_list,
> > > -                                 list) {
> > > -                     send_len += sizeof(struct inotify_event);
> > > -                     send_len += round_event_name_len(fsn_event);
> > > +             list = fsnotify_first_notification_list(group);
> > > +             /*
> > > +              * With multi queue, send_len will be a lower bound
> > > +              * on total events size.
> > > +              */
> > > +             if (list) {
> > > +                     list_for_each_entry(fsn_event, list, list) {
> > > +                             send_len += sizeof(struct inotify_event);
> > > +                             send_len += round_event_name_len(fsn_event);
> > > +                     }
> >
> > As I write below IMO we should enable hashed queues also for inotify (is
> > there good reason not to?)
> 
> I see your perception of inotify_merge() is the same as mine was
> when I wrote a patch to support hashed queues for inotify.
> It is only after that I realized that inotify_merge() only ever merges
> with the last event and I dropped that patch.
> I see no reason to change this long time behavior.

Ah, I even briefly looked at that code but didn't notice it merges only
with the last event. I agree that hashing for inotify doesn't make sense
then.

Hum, if the hashing and merging is specific to fanotify and as we decided
to keep the event->list for the global event list, we could easily have the
hash table just in fanotify private group data and hash->next pointer in
fanotify private part of the event? Maybe that would even result in a more
compact code?
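
A rough sketch of that alternative (field names are invented for illustration, and
the fragment assumes the surrounding fsnotify definitions): fsnotify_event.list
keeps the single, global FIFO, while the merge hash is private to fanotify:

#define FANOTIFY_HTABLE_BITS	7
#define FANOTIFY_HTABLE_SIZE	(1 << FANOTIFY_HTABLE_BITS)

struct fanotify_event {
	struct fsnotify_event fse;	/* ->list keeps the global queue order */
	struct hlist_node merge_list;	/* links events within one merge bucket */
	u32 hash;			/* merge-key hash, selects the bucket */
	/* existing type/pid/... fields omitted */
};

/* hash table kept in the fanotify private part of struct fsnotify_group */
struct hlist_head merge_hash[FANOTIFY_HTABLE_SIZE];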

> > > +static inline size_t fsnotify_group_size(unsigned int q_hash_bits)
> > > +{
> > > +     return sizeof(struct fsnotify_group) + (sizeof(struct list_head) << q_hash_bits);
> > > +}
> > > +
> > > +static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
> > > +                                              struct fsnotify_event *event)
> > > +{
> > > +     /* High bits are better for hash */
> > > +     return (event->key >> (32 - group->q_hash_bits)) & group->max_bucket;
> > > +}
> >
> > Why not use hash_32() here? IMHO better than just stripping bits...
> 
> See hash_ptr(). There is a reason to use the highest bits.

Well, but event->key is just a 32-bit number so I don't follow how high
bits used by hash_ptr() matter?

								Honza
Amir Goldstein Feb. 17, 2021, 3:42 p.m. UTC | #4
On Wed, Feb 17, 2021 at 3:48 PM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 17-02-21 14:33:46, Amir Goldstein wrote:
> > On Tue, Feb 16, 2021 at 5:02 PM Jan Kara <jack@suse.cz> wrote:
> > > > @@ -300,10 +301,16 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
> > > >       switch (cmd) {
> > > >       case FIONREAD:
> > > >               spin_lock(&group->notification_lock);
> > > > -             list_for_each_entry(fsn_event, &group->notification_list,
> > > > -                                 list) {
> > > > -                     send_len += sizeof(struct inotify_event);
> > > > -                     send_len += round_event_name_len(fsn_event);
> > > > +             list = fsnotify_first_notification_list(group);
> > > > +             /*
> > > > +              * With multi queue, send_len will be a lower bound
> > > > +              * on total events size.
> > > > +              */
> > > > +             if (list) {
> > > > +                     list_for_each_entry(fsn_event, list, list) {
> > > > +                             send_len += sizeof(struct inotify_event);
> > > > +                             send_len += round_event_name_len(fsn_event);
> > > > +                     }
> > >
> > > As I write below IMO we should enable hashed queues also for inotify (is
> > > there good reason not to?)
> >
> > I see your perception of inotify_merge() is the same as mine was
> > when I wrote a patch to support hashed queues for inotify.
> > It is only after that I realized that inotify_merge() only ever merges
> > with the last event and I dropped that patch.
> > I see no reason to change this long time behavior.
>
> Ah, I even briefly looked at that code but didn't notice it merges only
> with the last event. I agree that hashing for inotify doesn't make sense
> then.
>
> Hum, if the hashing and merging is specific to fanotify and as we decided
> to keep the event->list for the global event list, we could easily have the
> hash table just in fanotify private group data and hash->next pointer in
> fanotify private part of the event? Maybe that would even result in a more
> compact code?
>

Maybe, I am not so sure. I will look into it.

> > > > +static inline size_t fsnotify_group_size(unsigned int q_hash_bits)
> > > > +{
> > > > +     return sizeof(struct fsnotify_group) + (sizeof(struct list_head) << q_hash_bits);
> > > > +}
> > > > +
> > > > +static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
> > > > +                                              struct fsnotify_event *event)
> > > > +{
> > > > +     /* High bits are better for hash */
> > > > +     return (event->key >> (32 - group->q_hash_bits)) & group->max_bucket;
> > > > +}
> > >
> > > Why not use hash_32() here? IMHO better than just stripping bits...
> >
> > See hash_ptr(). There is a reason to use the highest bits.
>
> Well, but event->key is just a 32-bit number so I don't follow how high
> bits used by hash_ptr() matter?

Of course, you are right.
But that 32-bit number was already generated using a xor of several
hash_32() results from hash_ptr() and full_name_hash(), so we do not really
need to mix it anymore to get better entropy in the higher 7 bits.

I do not mind using hash_32() here for clarity.

Thanks,
Amir.
Jan Kara Feb. 17, 2021, 4:49 p.m. UTC | #5
On Wed 17-02-21 17:42:34, Amir Goldstein wrote:
> On Wed, Feb 17, 2021 at 3:48 PM Jan Kara <jack@suse.cz> wrote:
> > > > > +static inline size_t fsnotify_group_size(unsigned int q_hash_bits)
> > > > > +{
> > > > > +     return sizeof(struct fsnotify_group) + (sizeof(struct list_head) << q_hash_bits);
> > > > > +}
> > > > > +
> > > > > +static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
> > > > > +                                              struct fsnotify_event *event)
> > > > > +{
> > > > > +     /* High bits are better for hash */
> > > > > +     return (event->key >> (32 - group->q_hash_bits)) & group->max_bucket;
> > > > > +}
> > > >
> > > > Why not use hash_32() here? IMHO better than just stripping bits...
> > >
> > > See hash_ptr(). There is a reason to use the highest bits.
> >
> > Well, but event->key is just a 32-bit number so I don't follow how high
> > bits used by hash_ptr() matter?
> 
> Of course, you are right.
> But that 32-bit number was already generated using a xor of several
> hash_32() results from hash_ptr() and full_name_hash(), so we do not really
> need to mix it anymore to get better entropy in the higher 7 bits.

True. Just masking it with q_hash_bits is fine as well.

								Honza
Amir Goldstein Feb. 18, 2021, 10:52 a.m. UTC | #6
On Wed, Feb 17, 2021 at 5:42 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Wed, Feb 17, 2021 at 3:48 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Wed 17-02-21 14:33:46, Amir Goldstein wrote:
> > > On Tue, Feb 16, 2021 at 5:02 PM Jan Kara <jack@suse.cz> wrote:
> > > > > @@ -300,10 +301,16 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
> > > > >       switch (cmd) {
> > > > >       case FIONREAD:
> > > > >               spin_lock(&group->notification_lock);
> > > > > -             list_for_each_entry(fsn_event, &group->notification_list,
> > > > > -                                 list) {
> > > > > -                     send_len += sizeof(struct inotify_event);
> > > > > -                     send_len += round_event_name_len(fsn_event);
> > > > > +             list = fsnotify_first_notification_list(group);
> > > > > +             /*
> > > > > +              * With multi queue, send_len will be a lower bound
> > > > > +              * on total events size.
> > > > > +              */
> > > > > +             if (list) {
> > > > > +                     list_for_each_entry(fsn_event, list, list) {
> > > > > +                             send_len += sizeof(struct inotify_event);
> > > > > +                             send_len += round_event_name_len(fsn_event);
> > > > > +                     }
> > > >
> > > > As I write below IMO we should enable hashed queues also for inotify (is
> > > > there good reason not to?)
> > >
> > > I see your perception of inotify_merge() is the same as mine was
> > > when I wrote a patch to support hashed queues for inotify.
> > > It is only after that I realized that inotify_merge() only ever merges
> > > with the last event and I dropped that patch.
> > > I see no reason to change this long time behavior.
> >
> > Ah, I even briefly looked at that code but didn't notice it merges only
> > with the last event. I agree that hashing for inotify doesn't make sense
> > then.
> >
> > Hum, if the hashing and merging is specific to fanotify and as we decided
> > to keep the event->list for the global event list, we could easily have the
> > hash table just in fanotify private group data and hash->next pointer in
> > fanotify private part of the event? Maybe that would even result in a more
> > compact code?
> >
>
> Maybe, I am not so sure. I will look into it.
>

I ended up doing something slightly different:
- The hash table and lists remained in fsnotify (and in a prep patch)
- event->key remains in fsnotify_event (and event->mask moved too)
- backend gets a callback insert() from fsnotify_add_event() to do its thing
- event->next is in fanotify_event
- fanotify_insert() callback takes care of chaining all events by timeline

Hope you will like the result (pushed it to fanotify_merge branch).

Thanks,
Amir.
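
A very rough sketch of the reworked flow described in the last comment (the real
code is on the fanotify_merge branch and may differ; the insert callback and
event->next chaining are guesses based on the description): fsnotify queues the
event on the hashed list and then calls back into the backend, which can keep its
own in-order chain across buckets:

queue:
	fsnotify_queue_event(group, event);	/* add to the hashed bucket list */
	if (insert)
		insert(group, event);		/* e.g. fanotify links the event into
						 * its timeline via event->next */
	spin_unlock(&group->notification_lock);

	wake_up(&group->notification_waitq);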

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 1192c9953620..12df6957e4d8 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -97,7 +97,7 @@  static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
 	old = FANOTIFY_E(old_fsn);
 	new = FANOTIFY_E(new_fsn);
 
-	if (old_fsn->objectid != new_fsn->objectid ||
+	if (old_fsn->key != new_fsn->key ||
 	    old->type != new->type || old->pid != new->pid)
 		return false;
 
@@ -600,8 +600,9 @@  static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
 	 * Use the victim inode instead of the watching inode as the id for
 	 * event queue, so event reported on parent is merged with event
 	 * reported on child when both directory and child watches exist.
+	 * Reduce object id to 32bit hash for hashed queue merge.
 	 */
-	fanotify_init_event(event, (unsigned long)id, mask);
+	fanotify_init_event(event, hash_ptr(id, 32), mask);
 	if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
 		event->pid = get_pid(task_pid(current));
 	else
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 896c819a1786..2e856372ffc8 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -145,9 +145,9 @@  struct fanotify_event {
 };
 
 static inline void fanotify_init_event(struct fanotify_event *event,
-				       unsigned long id, u32 mask)
+				       unsigned int key, u32 mask)
 {
-	fsnotify_init_event(&event->fse, id);
+	fsnotify_init_event(&event->fse, key);
 	event->mask = mask;
 	event->pid = NULL;
 }
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8ff27d92d32c..dee12d927f8d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -635,6 +635,7 @@  static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *fsn_event;
+	struct list_head *list;
 	void __user *p;
 	int ret = -ENOTTY;
 	size_t send_len = 0;
@@ -646,8 +647,15 @@  static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 	switch (cmd) {
 	case FIONREAD:
 		spin_lock(&group->notification_lock);
-		list_for_each_entry(fsn_event, &group->notification_list, list)
-			send_len += FAN_EVENT_METADATA_LEN;
+		list = fsnotify_first_notification_list(group);
+		/*
+		 * With multi queue, send_len will be a lower bound
+		 * on total events size.
+		 */
+		if (list) {
+			list_for_each_entry(fsn_event, list, list)
+				send_len += FAN_EVENT_METADATA_LEN;
+		}
 		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
 		break;
@@ -982,7 +990,7 @@  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		f_flags |= O_NONBLOCK;
 
 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
-	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
+	group = fsnotify_alloc_user_group(0, &fanotify_fsnotify_ops);
 	if (IS_ERR(group)) {
 		free_uid(user);
 		return PTR_ERR(group);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ffd723ffe46d..b41ed68f9ff9 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -21,6 +21,11 @@ 
  */
 static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
+	int i;
+
+	for (i = 0; i <= group->max_bucket; i++)
+		WARN_ON_ONCE(!list_empty(&group->notification_list[i]));
+
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
 
@@ -111,12 +116,20 @@  void fsnotify_put_group(struct fsnotify_group *group)
 }
 EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
-static struct fsnotify_group *__fsnotify_alloc_group(
+static struct fsnotify_group *__fsnotify_alloc_group(unsigned int q_hash_bits,
 				const struct fsnotify_ops *ops, gfp_t gfp)
 {
 	struct fsnotify_group *group;
+	int i;
+
+#ifdef FSNOTIFY_HASHED_QUEUE
+	if (WARN_ON_ONCE(q_hash_bits > FSNOTIFY_HASHED_QUEUE_MAX_BITS))
+		q_hash_bits = FSNOTIFY_HASHED_QUEUE_MAX_BITS;
+#else
+	q_hash_bits = 0;
+#endif
 
-	group = kzalloc(sizeof(struct fsnotify_group), gfp);
+	group = kzalloc(fsnotify_group_size(q_hash_bits), gfp);
 	if (!group)
 		return ERR_PTR(-ENOMEM);
 
@@ -126,8 +139,12 @@  static struct fsnotify_group *__fsnotify_alloc_group(
 	atomic_set(&group->user_waits, 0);
 
 	spin_lock_init(&group->notification_lock);
-	INIT_LIST_HEAD(&group->notification_list);
 	init_waitqueue_head(&group->notification_waitq);
+	/* Initialize one or more notification lists */
+	group->q_hash_bits = q_hash_bits;
+	group->max_bucket = (1 << q_hash_bits) - 1;
+	for (i = 0; i <= group->max_bucket; i++)
+		INIT_LIST_HEAD(&group->notification_list[i]);
 	group->max_events = UINT_MAX;
 
 	mutex_init(&group->mark_mutex);
@@ -139,20 +156,24 @@  static struct fsnotify_group *__fsnotify_alloc_group(
 }
 
 /*
- * Create a new fsnotify_group and hold a reference for the group returned.
+ * Create a new fsnotify_group with no events queue.
+ * Hold a reference for the group returned.
  */
 struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 {
-	return __fsnotify_alloc_group(ops, GFP_KERNEL);
+	return __fsnotify_alloc_group(0, ops, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 /*
- * Create a new fsnotify_group and hold a reference for the group returned.
+ * Create a new fsnotify_group with an events queue.
+ * If @q_hash_bits > 0, the queue is sharded into several notification lists.
+ * Hold a reference for the group returned.
  */
-struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_user_group(unsigned int q_hash_bits,
+						 const struct fsnotify_ops *ops)
 {
-	return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
+	return __fsnotify_alloc_group(q_hash_bits, ops, GFP_KERNEL_ACCOUNT);
 }
 EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d8830be60a9b..1c476b9485dc 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -288,6 +288,7 @@  static long inotify_ioctl(struct file *file, unsigned int cmd,
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *fsn_event;
+	struct list_head *list;
 	void __user *p;
 	int ret = -ENOTTY;
 	size_t send_len = 0;
@@ -300,10 +301,16 @@  static long inotify_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case FIONREAD:
 		spin_lock(&group->notification_lock);
-		list_for_each_entry(fsn_event, &group->notification_list,
-				    list) {
-			send_len += sizeof(struct inotify_event);
-			send_len += round_event_name_len(fsn_event);
+		list = fsnotify_first_notification_list(group);
+		/*
+		 * With multi queue, send_len will be a lower bound
+		 * on total events size.
+		 */
+		if (list) {
+			list_for_each_entry(fsn_event, list, list) {
+				send_len += sizeof(struct inotify_event);
+				send_len += round_event_name_len(fsn_event);
+			}
 		}
 		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
@@ -631,7 +638,7 @@  static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 	struct fsnotify_group *group;
 	struct inotify_event_info *oevent;
 
-	group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
+	group = fsnotify_alloc_user_group(0, &inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index fcac2d72cbf5..58c8f6c1be1b 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -47,13 +47,6 @@  u32 fsnotify_get_cookie(void)
 }
 EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 
-/* return true if the notify queue is empty, false otherwise */
-bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
-{
-	assert_spin_locked(&group->notification_lock);
-	return list_empty(&group->notification_list) ? true : false;
-}
-
 void fsnotify_destroy_event(struct fsnotify_group *group,
 			    struct fsnotify_event *event)
 {
@@ -74,10 +67,75 @@  void fsnotify_destroy_event(struct fsnotify_group *group,
 	group->ops->free_event(event);
 }
 
+/* Check and fix inconsistencies in hashed queue */
+static void fsnotify_queue_check(struct fsnotify_group *group)
+{
+#ifdef FSNOTIFY_HASHED_QUEUE
+	struct list_head *list;
+	int i, nbuckets = 0;
+	bool first_empty, last_empty;
+
+	assert_spin_locked(&group->notification_lock);
+
+	pr_debug("%s: group=%p events: num=%u max=%u buckets: first=%u last=%u max=%u\n",
+		 __func__, group, group->num_events, group->max_events,
+		 group->first_bucket, group->last_bucket, group->max_bucket);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return;
+
+	first_empty = list_empty(&group->notification_list[group->first_bucket]);
+	last_empty = list_empty(&group->notification_list[group->last_bucket]);
+
+	list = &group->notification_list[0];
+	for (i = 0; i <= group->max_bucket; i++, list++) {
+		if (list_empty(list))
+			continue;
+		if (nbuckets++)
+			continue;
+		if (first_empty)
+			group->first_bucket = i;
+		if (last_empty)
+			group->last_bucket = i;
+	}
+
+	pr_debug("%s: %u non-empty buckets\n", __func__, nbuckets);
+
+	/* All buckets are empty, but non-zero num_events? */
+	if (WARN_ON_ONCE(!nbuckets && group->num_events))
+		group->num_events = 0;
+#endif
+}
+
 /*
- * Add an event to the group notification queue.  The group can later pull this
- * event off the queue to deal with.  The function returns 0 if the event was
- * added to the queue, 1 if the event was merged with some other queued event,
+ * Add an event to the group notification queue (no merge and no failure).
+ */
+static void fsnotify_queue_event(struct fsnotify_group *group,
+				struct fsnotify_event *event)
+{
+	/* Choose list to add event to */
+	unsigned int b = fsnotify_event_bucket(group, event);
+	struct list_head *list = &group->notification_list[b];
+
+	assert_spin_locked(&group->notification_lock);
+
+	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, group, event, b);
+
+	/*
+	 * TODO: set next_bucket of last event.
+	 */
+	group->last_bucket = b;
+	if (!group->num_events)
+		group->first_bucket = b;
+	group->num_events++;
+	list_add_tail(&event->list, list);
+}
+
+/*
+ * Try to add an event to the group notification queue.
+ * The group can later pull this event off the queue to deal with.
+ * The function returns 0 if the event was added to a queue,
+ * 1 if the event was merged with some other queued event,
  * 2 if the event was not queued - either the queue of events has overflown
  * or the group is shutting down.
  */
@@ -87,7 +145,7 @@  int fsnotify_add_event(struct fsnotify_group *group,
 				    struct fsnotify_event *))
 {
 	int ret = 0;
-	struct list_head *list = &group->notification_list;
+	struct list_head *list;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
@@ -99,7 +157,7 @@  int fsnotify_add_event(struct fsnotify_group *group,
 	}
 
 	if (event == group->overflow_event ||
-	    group->q_len >= group->max_events) {
+	    group->num_events >= group->max_events) {
 		ret = 2;
 		/* Queue overflow event only if it isn't already queued */
 		if (!list_empty(&group->overflow_event->list)) {
@@ -110,6 +168,7 @@  int fsnotify_add_event(struct fsnotify_group *group,
 		goto queue;
 	}
 
+	list = fsnotify_event_notification_list(group, event);
 	if (!list_empty(list) && merge) {
 		ret = merge(list, event);
 		if (ret) {
@@ -119,8 +178,7 @@  int fsnotify_add_event(struct fsnotify_group *group,
 	}
 
 queue:
-	group->q_len++;
-	list_add_tail(&event->list, list);
+	fsnotify_queue_event(group, event);
 	spin_unlock(&group->notification_lock);
 
 	wake_up(&group->notification_waitq);
@@ -137,7 +195,30 @@  void fsnotify_remove_queued_event(struct fsnotify_group *group,
 	 * check in fsnotify_add_event() works
 	 */
 	list_del_init(&event->list);
-	group->q_len--;
+	group->num_events--;
+}
+
+/* Return the notification list of the first event */
+struct list_head *fsnotify_first_notification_list(struct fsnotify_group *group)
+{
+	struct list_head *list;
+
+	assert_spin_locked(&group->notification_lock);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	list = &group->notification_list[group->first_bucket];
+	if (likely(!list_empty(list)))
+		return list;
+
+	/*
+	 * Look for any non-empty bucket.
+	 */
+	fsnotify_queue_check(group);
+	list = &group->notification_list[group->first_bucket];
+
+	return list_empty(list) ? NULL : list;
 }
 
 /*
@@ -147,17 +228,21 @@  void fsnotify_remove_queued_event(struct fsnotify_group *group,
 struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
+	struct list_head *list;
 
 	assert_spin_locked(&group->notification_lock);
 
-	if (fsnotify_notify_queue_is_empty(group))
+	list = fsnotify_first_notification_list(group);
+	if (!list)
 		return NULL;
 
-	pr_debug("%s: group=%p\n", __func__, group);
+	pr_debug("%s: group=%p bucket=%u\n", __func__, group, group->first_bucket);
 
-	event = list_first_entry(&group->notification_list,
-				 struct fsnotify_event, list);
+	event = list_first_entry(list, struct fsnotify_event, list);
 	fsnotify_remove_queued_event(group, event);
+	/*
+	 * TODO: update group->first_bucket to next_bucket in first event.
+	 */
 	return event;
 }
 
@@ -167,13 +252,15 @@  struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
  */
 struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
 {
+	struct list_head *list;
+
 	assert_spin_locked(&group->notification_lock);
 
-	if (fsnotify_notify_queue_is_empty(group))
+	list = fsnotify_first_notification_list(group);
+	if (!list)
 		return NULL;
 
-	return list_first_entry(&group->notification_list,
-				struct fsnotify_event, list);
+	return list_first_entry(list, struct fsnotify_event, list);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e5409b83e731..b2a80bc00108 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -160,6 +160,11 @@  struct fsnotify_ops {
 	void (*free_mark)(struct fsnotify_mark *mark);
 };
 
+#ifdef CONFIG_FANOTIFY
+#define FSNOTIFY_HASHED_QUEUE
+#define FSNOTIFY_HASHED_QUEUE_MAX_BITS 8
+#endif
+
 /*
  * all of the information about the original object we want to now send to
  * a group.  If you want to carry more info from the accessing task to the
@@ -167,7 +172,7 @@  struct fsnotify_ops {
  */
 struct fsnotify_event {
 	struct list_head list;
-	unsigned long objectid;	/* identifier for queue merges */
+	unsigned int key;		/* Key for hashed queue add/merge */
 };
 
 /*
@@ -189,12 +194,10 @@  struct fsnotify_group {
 	 */
 	refcount_t refcnt;		/* things with interest in this group */
 
-	/* needed to send notification to userspace */
-	spinlock_t notification_lock;		/* protect the notification_list */
-	struct list_head notification_list;	/* list of event_holder this group needs to send to userspace */
-	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
-	unsigned int q_len;			/* events on the queue */
-	unsigned int max_events;		/* maximum events allowed on the list */
+	/* needed to send notification to userspace and wait on group shutdown */
+	spinlock_t notification_lock;		/* protect the event queues */
+	wait_queue_head_t notification_waitq;	/* read() the events blocks on this waitq */
+
 	/*
 	 * Valid fsnotify group priorities.  Events are send in order from highest
 	 * priority to lowest priority.  We default to the lowest priority.
@@ -244,8 +247,36 @@  struct fsnotify_group {
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
+
+	/* Only relevant for groups that use events queue: */
+	unsigned int q_hash_bits;	/* >0 means hashed notification queue */
+	unsigned int max_bucket;	/* notification_list[] range is [0..max_bucket] */
+	unsigned int first_bucket;	/* List to read first event from */
+	unsigned int last_bucket;	/* List of last added event */
+	unsigned int num_events;	/* Number of events in all lists */
+	unsigned int max_events;	/* Maximum events allowed in all lists */
+	struct list_head notification_list[];	/* Queue of events sharded into several lists */
 };
 
+static inline size_t fsnotify_group_size(unsigned int q_hash_bits)
+{
+	return sizeof(struct fsnotify_group) + (sizeof(struct list_head) << q_hash_bits);
+}
+
+static inline unsigned int fsnotify_event_bucket(struct fsnotify_group *group,
+						 struct fsnotify_event *event)
+{
+	/* High bits are better for hash */
+	return (event->key >> (32 - group->q_hash_bits)) & group->max_bucket;
+}
+
+static inline struct list_head *fsnotify_event_notification_list(
+						struct fsnotify_group *group,
+						struct fsnotify_event *event)
+{
+	return &group->notification_list[fsnotify_event_bucket(group, event)];
+}
+
 /* When calling fsnotify tell it if the data is a path or inode */
 enum fsnotify_data_type {
 	FSNOTIFY_EVENT_NONE,
@@ -470,7 +501,8 @@  static inline void fsnotify_update_flags(struct dentry *dentry)
 
 /* create a new group */
 extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
-extern struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops);
+extern struct fsnotify_group *fsnotify_alloc_user_group(unsigned int q_hash_bits,
+							const struct fsnotify_ops *ops);
 /* get reference to a group */
 extern void fsnotify_get_group(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_alloc_group */
@@ -495,8 +527,15 @@  static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
 	fsnotify_add_event(group, group->overflow_event, NULL);
 }
 
-/* true if the group notification queue is empty */
-extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
+/* True if all the group event queues are empty */
+static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+{
+	assert_spin_locked(&group->notification_lock);
+	return group->num_events == 0;
+}
+
+/* Return the notification list of the first event */
+extern struct list_head *fsnotify_first_notification_list(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
 /* return AND dequeue the first event on the notification queue */
@@ -577,10 +616,10 @@  extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
 extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
 
 static inline void fsnotify_init_event(struct fsnotify_event *event,
-				       unsigned long objectid)
+				       unsigned int key)
 {
 	INIT_LIST_HEAD(&event->list);
-	event->objectid = objectid;
+	event->key = key;
 }
 
 #else