diff mbox series

inotify, memcg: account inotify instances to kmemcg

Message ID 20201218221129.851003-1-shakeelb@google.com (mailing list archive)
State New, archived
Headers show
Series inotify, memcg: account inotify instances to kmemcg | expand

Commit Message

Shakeel Butt Dec. 18, 2020, 10:11 p.m. UTC
Currently the fs sysctl inotify/max_user_instances is used to limit the
number of inotify instances on the system. For systems running multiple
workloads, the per-user namespace sysctl max_inotify_instances can be
used to further partition inotify instances. However there is no easy
way to set a sensible system level max limit on inotify instances and
further partition it between the workloads. It is much easier to charge
the underlying resource (i.e. memory) behind the inotify instances to
the memcg of the workload and let their memory limits limit the number
of inotify instances they can create.

Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
 fs/notify/group.c                | 14 ++++++++++++--
 fs/notify/inotify/inotify_user.c |  5 +++--
 include/linux/fsnotify_backend.h |  2 ++
 3 files changed, 17 insertions(+), 4 deletions(-)

Comments

Amir Goldstein Dec. 19, 2020, 9:48 a.m. UTC | #1
On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
>
> Currently the fs sysctl inotify/max_user_instances is used to limit the
> number of inotify instances on the system. For systems running multiple
> workloads, the per-user namespace sysctl max_inotify_instances can be
> used to further partition inotify instances. However there is no easy
> way to set a sensible system level max limit on inotify instances and
> further partition it between the workloads. It is much easier to charge
> the underlying resource (i.e. memory) behind the inotify instances to
> the memcg of the workload and let their memory limits limit the number
> of inotify instances they can create.

Not that I have a problem with this patch, but what problem does it try to
solve? Are you concerned of users depleting system memory by creating
userns's and allocating 128 * (struct fsnotify_group) at a time?

IMO, that is not what max_user_instances was meant to protect against.
There are two reasons I can think of to limit user instances:
1. Pre-memcg, user allocation of events is limited to
    <max_user_instances>*<max_queued_events>
2. Performance penalty. User can place <max_user_instances>
    watches on the same "hot" directory, that will cause any access to
    that directory by any task on the system to pay the penalty of traversing
    <max_user_instances> marks and attempt to queue <max_user_instances>
    events. That cost, including <max_user_instances> inotify_merge() loops
    could be significant

#1 is not a problem anymore, since you already took care of accounting events
to the user's memcg.
#2 is not addressed by your patch.

>
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> ---
>  fs/notify/group.c                | 14 ++++++++++++--
>  fs/notify/inotify/inotify_user.c |  5 +++--
>  include/linux/fsnotify_backend.h |  2 ++
>  3 files changed, 17 insertions(+), 4 deletions(-)
>
> diff --git a/fs/notify/group.c b/fs/notify/group.c
> index a4a4b1c64d32..fab3cfdb4d9e 100644
> --- a/fs/notify/group.c
> +++ b/fs/notify/group.c
> @@ -114,11 +114,12 @@ EXPORT_SYMBOL_GPL(fsnotify_put_group);
>  /*
>   * Create a new fsnotify_group and hold a reference for the group returned.
>   */
> -struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> +struct fsnotify_group *fsnotify_alloc_group_gfp(const struct fsnotify_ops *ops,
> +                                               gfp_t gfp)
>  {
>         struct fsnotify_group *group;
>
> -       group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
> +       group = kzalloc(sizeof(struct fsnotify_group), gfp);
>         if (!group)
>                 return ERR_PTR(-ENOMEM);
>
> @@ -139,6 +140,15 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
>
>         return group;
>  }
> +EXPORT_SYMBOL_GPL(fsnotify_alloc_group_gfp);
> +
> +/*
> + * Create a new fsnotify_group and hold a reference for the group returned.
> + */
> +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> +{
> +       return fsnotify_alloc_group_gfp(ops, GFP_KERNEL);
> +}
>  EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
>
>  int fsnotify_fasync(int fd, struct file *file, int on)
> diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> index 59c177011a0f..7cb528c6154c 100644
> --- a/fs/notify/inotify/inotify_user.c
> +++ b/fs/notify/inotify/inotify_user.c
> @@ -632,11 +632,12 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
>         struct fsnotify_group *group;
>         struct inotify_event_info *oevent;
>
> -       group = fsnotify_alloc_group(&inotify_fsnotify_ops);
> +       group = fsnotify_alloc_group_gfp(&inotify_fsnotify_ops,
> +                                        GFP_KERNEL_ACCOUNT);
>         if (IS_ERR(group))
>                 return group;
>
> -       oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
> +       oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
>         if (unlikely(!oevent)) {
>                 fsnotify_destroy_group(group);
>                 return ERR_PTR(-ENOMEM);

Any reason why you did not include fanotify in this patch?

Thanks,
Amir.
Shakeel Butt Dec. 19, 2020, 2:31 p.m. UTC | #2
On Sat, Dec 19, 2020 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > Currently the fs sysctl inotify/max_user_instances is used to limit the
> > number of inotify instances on the system. For systems running multiple
> > workloads, the per-user namespace sysctl max_inotify_instances can be
> > used to further partition inotify instances. However there is no easy
> > way to set a sensible system level max limit on inotify instances and
> > further partition it between the workloads. It is much easier to charge
> > the underlying resource (i.e. memory) behind the inotify instances to
> > the memcg of the workload and let their memory limits limit the number
> > of inotify instances they can create.
>
> Not that I have a problem with this patch, but what problem does it try to
> solve?

I am aiming for the simplicity to not set another limit which can
indirectly be limited by memcg limits. I just want to set the memcg
limit on our production environment which runs multiple workloads on a
system and not think about setting a sensible value to
max_user_instances in production. I would prefer to set
max_user_instances to max int and let the memcg limits of the
workloads limit their inotify usage.

> Are you concerned of users depleting system memory by creating
> userns's and allocating 128 * (struct fsnotify_group) at a time?
>
> IMO, that is not what max_user_instances was meant to protect against.
> There are two reasons I can think of to limit user instances:
> 1. Pre-memcg, user allocation of events is limited to
>     <max_user_instances>*<max_queued_events>
> 2. Performance penalty. User can place <max_user_instances>
>     watches on the same "hot" directory, that will cause any access to
>     that directory by any task on the system to pay the penalty of traversing
>     <max_user_instances> marks and attempt to queue <max_user_instances>
>     events. That cost, including <max_user_instances> inotify_merge() loops
>     could be significant
>
> #1 is not a problem anymore, since you already took care of accounting events
> to the user's memcg.
> #2 is not addressed by your patch.

Yes, I am not addressing #2. Our workloads in prod have their own
private filesystems, so this is not an issue we observed.

>
> >
> > Signed-off-by: Shakeel Butt <shakeelb@google.com>
> > ---
> >  fs/notify/group.c                | 14 ++++++++++++--
> >  fs/notify/inotify/inotify_user.c |  5 +++--
> >  include/linux/fsnotify_backend.h |  2 ++
> >  3 files changed, 17 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/notify/group.c b/fs/notify/group.c
> > index a4a4b1c64d32..fab3cfdb4d9e 100644
> > --- a/fs/notify/group.c
> > +++ b/fs/notify/group.c
> > @@ -114,11 +114,12 @@ EXPORT_SYMBOL_GPL(fsnotify_put_group);
> >  /*
> >   * Create a new fsnotify_group and hold a reference for the group returned.
> >   */
> > -struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> > +struct fsnotify_group *fsnotify_alloc_group_gfp(const struct fsnotify_ops *ops,
> > +                                               gfp_t gfp)
> >  {
> >         struct fsnotify_group *group;
> >
> > -       group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
> > +       group = kzalloc(sizeof(struct fsnotify_group), gfp);
> >         if (!group)
> >                 return ERR_PTR(-ENOMEM);
> >
> > @@ -139,6 +140,15 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> >
> >         return group;
> >  }
> > +EXPORT_SYMBOL_GPL(fsnotify_alloc_group_gfp);
> > +
> > +/*
> > + * Create a new fsnotify_group and hold a reference for the group returned.
> > + */
> > +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> > +{
> > +       return fsnotify_alloc_group_gfp(ops, GFP_KERNEL);
> > +}
> >  EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
> >
> >  int fsnotify_fasync(int fd, struct file *file, int on)
> > diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> > index 59c177011a0f..7cb528c6154c 100644
> > --- a/fs/notify/inotify/inotify_user.c
> > +++ b/fs/notify/inotify/inotify_user.c
> > @@ -632,11 +632,12 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
> >         struct fsnotify_group *group;
> >         struct inotify_event_info *oevent;
> >
> > -       group = fsnotify_alloc_group(&inotify_fsnotify_ops);
> > +       group = fsnotify_alloc_group_gfp(&inotify_fsnotify_ops,
> > +                                        GFP_KERNEL_ACCOUNT);
> >         if (IS_ERR(group))
> >                 return group;
> >
> > -       oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
> > +       oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
> >         if (unlikely(!oevent)) {
> >                 fsnotify_destroy_group(group);
> >                 return ERR_PTR(-ENOMEM);
>
> Any reason why you did not include fanotify in this patch?

The motivation was inotify's max_user_instances but we can charge
fsnotify_group for fanotify as well. Though I would prefer that to be
a separate patch. Let me know what you prefer?

>
> Thanks,
> Amir.

Thanks for the review. I really appreciate your time.

Shakeel
Amir Goldstein Dec. 19, 2020, 4:25 p.m. UTC | #3
On Sat, Dec 19, 2020 at 4:31 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Sat, Dec 19, 2020 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > Currently the fs sysctl inotify/max_user_instances is used to limit the
> > > number of inotify instances on the system. For systems running multiple
> > > workloads, the per-user namespace sysctl max_inotify_instances can be
> > > used to further partition inotify instances. However there is no easy
> > > way to set a sensible system level max limit on inotify instances and
> > > further partition it between the workloads. It is much easier to charge
> > > the underlying resource (i.e. memory) behind the inotify instances to
> > > the memcg of the workload and let their memory limits limit the number
> > > of inotify instances they can create.
> >
> > Not that I have a problem with this patch, but what problem does it try to
> > solve?
>
> I am aiming for the simplicity to not set another limit which can
> indirectly be limited by memcg limits. I just want to set the memcg
> limit on our production environment which runs multiple workloads on a
> system and not think about setting a sensible value to
> max_user_instances in production. I would prefer to set
> max_user_instances to max int and let the memcg limits of the
> workloads limit their inotify usage.
>

understood.
and I guess the multiple workloads cannot run each in their own userns?
because then you wouldn't need to change max_user_instances limit.

> > Are you concerned of users depleting system memory by creating
> > userns's and allocating 128 * (struct fsnotify_group) at a time?
> >
> > IMO, that is not what max_user_instances was meant to protect against.
> > There are two reasons I can think of to limit user instances:
> > 1. Pre-memcg, user allocation of events is limited to
> >     <max_user_instances>*<max_queued_events>
> > 2. Performance penalty. User can place <max_user_instances>
> >     watches on the same "hot" directory, that will cause any access to
> >     that directory by any task on the system to pay the penalty of traversing
> >     <max_user_instances> marks and attempt to queue <max_user_instances>
> >     events. That cost, including <max_user_instances> inotify_merge() loops
> >     could be significant
> >
> > #1 is not a problem anymore, since you already took care of accounting events
> > to the user's memcg.
> > #2 is not addressed by your patch.
>
> Yes, I am not addressing #2. Our workloads in prod have their own
> private filesystems, so this is not an issue we observed.
>
> >
> > >
> > > Signed-off-by: Shakeel Butt <shakeelb@google.com>
> > > ---
> > >  fs/notify/group.c                | 14 ++++++++++++--
> > >  fs/notify/inotify/inotify_user.c |  5 +++--
> > >  include/linux/fsnotify_backend.h |  2 ++
> > >  3 files changed, 17 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/fs/notify/group.c b/fs/notify/group.c
> > > index a4a4b1c64d32..fab3cfdb4d9e 100644
> > > --- a/fs/notify/group.c
> > > +++ b/fs/notify/group.c
> > > @@ -114,11 +114,12 @@ EXPORT_SYMBOL_GPL(fsnotify_put_group);
> > >  /*
> > >   * Create a new fsnotify_group and hold a reference for the group returned.
> > >   */
> > > -struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> > > +struct fsnotify_group *fsnotify_alloc_group_gfp(const struct fsnotify_ops *ops,
> > > +                                               gfp_t gfp)
> > >  {
> > >         struct fsnotify_group *group;
> > >
> > > -       group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
> > > +       group = kzalloc(sizeof(struct fsnotify_group), gfp);
> > >         if (!group)
> > >                 return ERR_PTR(-ENOMEM);
> > >
> > > @@ -139,6 +140,15 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> > >
> > >         return group;
> > >  }
> > > +EXPORT_SYMBOL_GPL(fsnotify_alloc_group_gfp);
> > > +
> > > +/*
> > > + * Create a new fsnotify_group and hold a reference for the group returned.
> > > + */
> > > +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> > > +{
> > > +       return fsnotify_alloc_group_gfp(ops, GFP_KERNEL);
> > > +}
> > >  EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
> > >
> > >  int fsnotify_fasync(int fd, struct file *file, int on)
> > > diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> > > index 59c177011a0f..7cb528c6154c 100644
> > > --- a/fs/notify/inotify/inotify_user.c
> > > +++ b/fs/notify/inotify/inotify_user.c
> > > @@ -632,11 +632,12 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
> > >         struct fsnotify_group *group;
> > >         struct inotify_event_info *oevent;
> > >
> > > -       group = fsnotify_alloc_group(&inotify_fsnotify_ops);
> > > +       group = fsnotify_alloc_group_gfp(&inotify_fsnotify_ops,
> > > +                                        GFP_KERNEL_ACCOUNT);
> > >         if (IS_ERR(group))
> > >                 return group;
> > >
> > > -       oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
> > > +       oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
> > >         if (unlikely(!oevent)) {
> > >                 fsnotify_destroy_group(group);
> > >                 return ERR_PTR(-ENOMEM);
> >
> > Any reason why you did not include fanotify in this patch?
>
> The motivation was inotify's max_user_instances but we can charge
> fsnotify_group for fanotify as well. Though I would prefer that to be
> a separate patch. Let me know what you prefer?
>

I would prefer to add the helper fsnotify_alloc_user_group()
that will use the GFP_KERNEL_ACCOUNT allocation flags
internally.

fsnotify_alloc_group() is used by all backends that initialize a single
group instance for internal use and fsnotify_alloc_user_group() will be
used by inotify/fanotify when users create instances.
I see no reason to separate that to two patches.

Thanks,
Amir.
Shakeel Butt Dec. 20, 2020, 4:24 a.m. UTC | #4
On Sat, Dec 19, 2020 at 8:25 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Sat, Dec 19, 2020 at 4:31 PM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > On Sat, Dec 19, 2020 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
> > > >
> > > > Currently the fs sysctl inotify/max_user_instances is used to limit the
> > > > number of inotify instances on the system. For systems running multiple
> > > > workloads, the per-user namespace sysctl max_inotify_instances can be
> > > > used to further partition inotify instances. However there is no easy
> > > > way to set a sensible system level max limit on inotify instances and
> > > > further partition it between the workloads. It is much easier to charge
> > > > the underlying resource (i.e. memory) behind the inotify instances to
> > > > the memcg of the workload and let their memory limits limit the number
> > > > of inotify instances they can create.
> > >
> > > Not that I have a problem with this patch, but what problem does it try to
> > > solve?
> >
> > I am aiming for the simplicity to not set another limit which can
> > indirectly be limited by memcg limits. I just want to set the memcg
> > limit on our production environment which runs multiple workloads on a
> > system and not think about setting a sensible value to
> > max_user_instances in production. I would prefer to set
> > max_user_instances to max int and let the memcg limits of the
> > workloads limit their inotify usage.
> >
>
> understood.
> and I guess the multiple workloads cannot run each in their own userns?
> because then you wouldn't need to change max_user_instances limit.
>

No, workloads can run in their own user namespace, but please note that
max_user_instances is shared between all the user namespaces.

>
[snip]
> > > Any reason why you did not include fanotify in this patch?
> >
> > The motivation was inotify's max_user_instances but we can charge
> > fsnotify_group for fanotify as well. Though I would prefer that to be
> > a separate patch. Let me know what you prefer?
> >
>
> I would prefer to add the helper fsnotify_alloc_user_group()
> that will use the GFP_KERNEL_ACCOUNT allocation flags
> internally.
>
> fsnotify_alloc_group() is used by all backends that initialize a single
> group instance for internal use and fsnotify_alloc_user_group() will be
> used by inotify/fanotify when users create instances.
> I see no reason to separate that to two patches.
>

SGTM.
Amir Goldstein Dec. 20, 2020, 11:31 a.m. UTC | #5
On Sun, Dec 20, 2020 at 6:24 AM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Sat, Dec 19, 2020 at 8:25 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Sat, Dec 19, 2020 at 4:31 PM Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > On Sat, Dec 19, 2020 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
> > > > >
> > > > > Currently the fs sysctl inotify/max_user_instances is used to limit the
> > > > > number of inotify instances on the system. For systems running multiple
> > > > > workloads, the per-user namespace sysctl max_inotify_instances can be
> > > > > used to further partition inotify instances. However there is no easy
> > > > > way to set a sensible system level max limit on inotify instances and
> > > > > further partition it between the workloads. It is much easier to charge
> > > > > the underlying resource (i.e. memory) behind the inotify instances to
> > > > > the memcg of the workload and let their memory limits limit the number
> > > > > of inotify instances they can create.
> > > >
> > > > Not that I have a problem with this patch, but what problem does it try to
> > > > solve?
> > >
> > > I am aiming for the simplicity to not set another limit which can
> > > indirectly be limited by memcg limits. I just want to set the memcg
> > > limit on our production environment which runs multiple workloads on a
> > > system and not think about setting a sensible value to
> > > max_user_instances in production. I would prefer to set
> > > max_user_instances to max int and let the memcg limits of the
> > > workloads limit their inotify usage.
> > >
> >
> > understood.
> > and I guess the multiple workloads cannot run each in their own userns?
> > because then you wouldn't need to change max_user_instances limit.
> >
>
> No workloads can run in their own user namespace but please note that
> max_user_instances is shared between all the user namespaces.

/proc/sys/fs/inotify/max_user_instances is shared between all the user
namespaces, but it only controls the init_user_ns limits.
/proc/sys/user/max_inotify_instances is per user ns and it is the one that
actually controls the inotify limits in non init_user_ns.

That said, I see that it is always initialized to MAX_INT on non init user ns,
which is exactly the setup that you are aiming at:

$ unshare -U
$ cat /proc/sys/user/max_inotify_instances
2147483647
$ cat /proc/sys/fs/inotify/max_user_instances
128

Thanks,
Amir.
Shakeel Butt Dec. 20, 2020, 5:56 p.m. UTC | #6
On Sun, Dec 20, 2020 at 3:31 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Sun, Dec 20, 2020 at 6:24 AM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > On Sat, Dec 19, 2020 at 8:25 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Sat, Dec 19, 2020 at 4:31 PM Shakeel Butt <shakeelb@google.com> wrote:
> > > >
> > > > On Sat, Dec 19, 2020 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > >
> > > > > On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
> > > > > >
> > > > > > Currently the fs sysctl inotify/max_user_instances is used to limit the
> > > > > > number of inotify instances on the system. For systems running multiple
> > > > > > workloads, the per-user namespace sysctl max_inotify_instances can be
> > > > > > used to further partition inotify instances. However there is no easy
> > > > > > way to set a sensible system level max limit on inotify instances and
> > > > > > further partition it between the workloads. It is much easier to charge
> > > > > > the underlying resource (i.e. memory) behind the inotify instances to
> > > > > > the memcg of the workload and let their memory limits limit the number
> > > > > > of inotify instances they can create.
> > > > >
> > > > > Not that I have a problem with this patch, but what problem does it try to
> > > > > solve?
> > > >
> > > > I am aiming for the simplicity to not set another limit which can
> > > > indirectly be limited by memcg limits. I just want to set the memcg
> > > > limit on our production environment which runs multiple workloads on a
> > > > system and not think about setting a sensible value to
> > > > max_user_instances in production. I would prefer to set
> > > > max_user_instances to max int and let the memcg limits of the
> > > > workloads limit their inotify usage.
> > > >
> > >
> > > understood.
> > > and I guess the multiple workloads cannot run each in their own userns?
> > > because then you wouldn't need to change max_user_instances limit.
> > >
> >
> > No workloads can run in their own user namespace but please note that
> > max_user_instances is shared between all the user namespaces.
>
> /proc/sys/fs/inotify/max_user_instances is shared between all the user
> namespaces, but it only controls the init_user_ns limits.
> /proc/sys/user/max_inotify_instances is per user ns and it is the one that
> actually controls the inotify limits in non init_user_ns.
>
> That said, I see that it is always initialized to MAX_INT on non init user ns,
> which is exactly the setup that you are aiming at:
>
> $ unshare -U
> $ cat /proc/sys/user/max_inotify_instances
> 2147483647
> $ cat /proc/sys/fs/inotify/max_user_instances
> 128

From what I understand, namespace-based limits are enforced
hierarchically. More specifically in the example above, the
application running in a user namespace with
/proc/sys/user/max_inotify_instances = 2147483647 and
/proc/sys/fs/inotify/max_user_instances = 128 will not be able to
create more than 128 inotify instances. I actually tested this with a
simple program which calls inotify_init() in a loop and it starts
failing before the 128th iteration.
Amir Goldstein Dec. 20, 2020, 6:04 p.m. UTC | #7
On Sun, Dec 20, 2020 at 7:56 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Sun, Dec 20, 2020 at 3:31 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Sun, Dec 20, 2020 at 6:24 AM Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > On Sat, Dec 19, 2020 at 8:25 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Sat, Dec 19, 2020 at 4:31 PM Shakeel Butt <shakeelb@google.com> wrote:
> > > > >
> > > > > On Sat, Dec 19, 2020 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > > >
> > > > > > On Sat, Dec 19, 2020 at 12:11 AM Shakeel Butt <shakeelb@google.com> wrote:
> > > > > > >
> > > > > > > Currently the fs sysctl inotify/max_user_instances is used to limit the
> > > > > > > number of inotify instances on the system. For systems running multiple
> > > > > > > workloads, the per-user namespace sysctl max_inotify_instances can be
> > > > > > > used to further partition inotify instances. However there is no easy
> > > > > > > way to set a sensible system level max limit on inotify instances and
> > > > > > > further partition it between the workloads. It is much easier to charge
> > > > > > > the underlying resource (i.e. memory) behind the inotify instances to
> > > > > > > the memcg of the workload and let their memory limits limit the number
> > > > > > > of inotify instances they can create.
> > > > > >
> > > > > > Not that I have a problem with this patch, but what problem does it try to
> > > > > > solve?
> > > > >
> > > > > I am aiming for the simplicity to not set another limit which can
> > > > > indirectly be limited by memcg limits. I just want to set the memcg
> > > > > limit on our production environment which runs multiple workloads on a
> > > > > system and not think about setting a sensible value to
> > > > > max_user_instances in production. I would prefer to set
> > > > > max_user_instances to max int and let the memcg limits of the
> > > > > workloads limit their inotify usage.
> > > > >
> > > >
> > > > understood.
> > > > and I guess the multiple workloads cannot run each in their own userns?
> > > > because then you wouldn't need to change max_user_instances limit.
> > > >
> > >
> > > No workloads can run in their own user namespace but please note that
> > > max_user_instances is shared between all the user namespaces.
> >
> > /proc/sys/fs/inotify/max_user_instances is shared between all the user
> > namespaces, but it only controls the init_user_ns limits.
> > /proc/sys/user/max_inotify_instances is per user ns and it is the one that
> > actually controls the inotify limits in non init_user_ns.
> >
> > That said, I see that it is always initialized to MAX_INT on non init user ns,
> > which is exactly the setup that you are aiming at:
> >
> > $ unshare -U
> > $ cat /proc/sys/user/max_inotify_instances
> > 2147483647
> > $ cat /proc/sys/fs/inotify/max_user_instances
> > 128
>
> From what I understand, namespace-based limits are enforced
> hierarchically. More specifically in the example above, the
> application running in a user namespace with
> /proc/sys/user/max_inotify_instances = 2147483647 and
> /proc/sys/fs/inotify/max_user_instances = 128 will not be able to
> create more than 128 inotify instances. I actually tested this with a
> simple program which calls inotify_init() in a loop and it starts
> failing before the 128th iteration.

Right, it is.
Thanks for the clarification.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/notify/group.c b/fs/notify/group.c
index a4a4b1c64d32..fab3cfdb4d9e 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -114,11 +114,12 @@  EXPORT_SYMBOL_GPL(fsnotify_put_group);
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
  */
-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group_gfp(const struct fsnotify_ops *ops,
+						gfp_t gfp)
 {
 	struct fsnotify_group *group;
 
-	group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+	group = kzalloc(sizeof(struct fsnotify_group), gfp);
 	if (!group)
 		return ERR_PTR(-ENOMEM);
 
@@ -139,6 +140,15 @@  struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group_gfp);
+
+/*
+ * Create a new fsnotify_group and hold a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+{
+	return fsnotify_alloc_group_gfp(ops, GFP_KERNEL);
+}
 EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 59c177011a0f..7cb528c6154c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -632,11 +632,12 @@  static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 	struct fsnotify_group *group;
 	struct inotify_event_info *oevent;
 
-	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
+	group = fsnotify_alloc_group_gfp(&inotify_fsnotify_ops,
+					 GFP_KERNEL_ACCOUNT);
 	if (IS_ERR(group))
 		return group;
 
-	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
 	if (unlikely(!oevent)) {
 		fsnotify_destroy_group(group);
 		return ERR_PTR(-ENOMEM);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a2e42d3cd87c..a0138267f547 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -470,6 +470,8 @@  static inline void fsnotify_update_flags(struct dentry *dentry)
 
 /* create a new group */
 extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
+extern struct fsnotify_group *fsnotify_alloc_group_gfp(const struct fsnotify_ops *ops,
+						       gfp_t gfp);
 /* get reference to a group */
 extern void fsnotify_get_group(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_alloc_group */