diff mbox series

[1/2] kernfs: add kernfs_ops.free operation to free resources tied to the file

Message ID 20230626201713.1204982-1-surenb@google.com (mailing list archive)
State New, archived
Headers show
Series [1/2] kernfs: add kernfs_ops.free operation to free resources tied to the file | expand

Commit Message

Suren Baghdasaryan June 26, 2023, 8:17 p.m. UTC
kernfs_ops.release operation can be called from kernfs_drain_open_files
which is not tied to the file's real lifecycle. Introduce a new kernfs_ops
free operation which is called only when the last fput() of the file is
performed and therefore is strictly tied to the file's lifecycle. This
operation will be used for freeing resources tied to the file, like
waitqueues used for polling the file.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/kernfs/file.c       | 8 +++++---
 include/linux/kernfs.h | 5 +++++
 2 files changed, 10 insertions(+), 3 deletions(-)

Comments

Suren Baghdasaryan June 26, 2023, 8:21 p.m. UTC | #1
On Mon, Jun 26, 2023 at 1:17 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> kernfs_ops.release operation can be called from kernfs_drain_open_files
> which is not tied to the file's real lifecycle. Introduce a new kernfs_ops
> free operation which is called only when the last fput() of the file is
> performed and therefore is strictly tied to the file's lifecycle. This
> operation will be used for freeing resources tied to the file, like
> waitqueues used for polling the file.

While this patchset touches kernfs, cgroups and psi areas (3 different
maintainers), I think cgroups are the most relevant area for it, so
IMHO Tejun's tree would be the right one to get them once reviewed and
acknowledged. Thanks!

>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/kernfs/file.c       | 8 +++++---
>  include/linux/kernfs.h | 5 +++++
>  2 files changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index 40c4661f15b7..acc52d23d8f6 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -766,7 +766,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
>
>  /* used from release/drain to ensure that ->release() is called exactly once */
>  static void kernfs_release_file(struct kernfs_node *kn,
> -                               struct kernfs_open_file *of)
> +                               struct kernfs_open_file *of, bool final)
>  {
>         /*
>          * @of is guaranteed to have no other file operations in flight and
> @@ -787,6 +787,8 @@ static void kernfs_release_file(struct kernfs_node *kn,
>                 of->released = true;
>                 of_on(of)->nr_to_release--;
>         }
> +       if (final && kn->attr.ops->free)
> +               kn->attr.ops->free(of);
>  }
>
>  static int kernfs_fop_release(struct inode *inode, struct file *filp)
> @@ -798,7 +800,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
>                 struct mutex *mutex;
>
>                 mutex = kernfs_open_file_mutex_lock(kn);
> -               kernfs_release_file(kn, of);
> +               kernfs_release_file(kn, of, true);
>                 mutex_unlock(mutex);
>         }
>
> @@ -852,7 +854,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
>                 }
>
>                 if (kn->flags & KERNFS_HAS_RELEASE)
> -                       kernfs_release_file(kn, of);
> +                       kernfs_release_file(kn, of, false);
>         }
>
>         WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index 73f5c120def8..a7e404ff31bb 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -273,6 +273,11 @@ struct kernfs_ops {
>          */
>         int (*open)(struct kernfs_open_file *of);
>         void (*release)(struct kernfs_open_file *of);
> +       /*
> +        * Free resources tied to the lifecycle of the file, like a
> +        * waitqueue used for polling.
> +        */
> +       void (*free)(struct kernfs_open_file *of);
>
>         /*
>          * Read is handled by either seq_file or raw_read().
> --
> 2.41.0.162.gfafddb0af9-goog
>
Tejun Heo June 26, 2023, 8:31 p.m. UTC | #2
On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index 73f5c120def8..a7e404ff31bb 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -273,6 +273,11 @@ struct kernfs_ops {
>  	 */
>  	int (*open)(struct kernfs_open_file *of);
>  	void (*release)(struct kernfs_open_file *of);
> +	/*
> +	 * Free resources tied to the lifecycle of the file, like a
> +	 * waitqueue used for polling.
> +	 */
> +	void (*free)(struct kernfs_open_file *of);

I think this can use a bit more commenting - ie. explain that release may be
called earlier than the actual freeing of the file and how that can lead to
problems. Othre than that, looks fine to me.

Greg, as Suren suggested, I can route both patches through the cgroup tree
if you're okay with it.

Thanks.
Suren Baghdasaryan June 26, 2023, 8:39 p.m. UTC | #3
On Mon, Jun 26, 2023 at 1:31 PM Tejun Heo <tj@kernel.org> wrote:
>
> On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > index 73f5c120def8..a7e404ff31bb 100644
> > --- a/include/linux/kernfs.h
> > +++ b/include/linux/kernfs.h
> > @@ -273,6 +273,11 @@ struct kernfs_ops {
> >        */
> >       int (*open)(struct kernfs_open_file *of);
> >       void (*release)(struct kernfs_open_file *of);
> > +     /*
> > +      * Free resources tied to the lifecycle of the file, like a
> > +      * waitqueue used for polling.
> > +      */
> > +     void (*free)(struct kernfs_open_file *of);
>
> I think this can use a bit more commenting - ie. explain that release may be
> called earlier than the actual freeing of the file and how that can lead to
> problems. Othre than that, looks fine to me.

Sure, once I get more feedback I'll post the next version with
expanded description.
Thanks!

>
> Greg, as Suren suggested, I can route both patches through the cgroup tree
> if you're okay with it.
>
> Thanks.
>
> --
> tejun
Greg KH June 27, 2023, 6:25 a.m. UTC | #4
On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> kernfs_ops.release operation can be called from kernfs_drain_open_files
> which is not tied to the file's real lifecycle. Introduce a new kernfs_ops
> free operation which is called only when the last fput() of the file is
> performed and therefore is strictly tied to the file's lifecycle. This
> operation will be used for freeing resources tied to the file, like
> waitqueues used for polling the file.

This is confusing, shouldn't release be the "last" time the file is
handled and then all resources attached to it freed?  Why do we need
another callback, shouldn't release handle this?


> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/kernfs/file.c       | 8 +++++---
>  include/linux/kernfs.h | 5 +++++
>  2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index 40c4661f15b7..acc52d23d8f6 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -766,7 +766,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
>  
>  /* used from release/drain to ensure that ->release() is called exactly once */
>  static void kernfs_release_file(struct kernfs_node *kn,
> -				struct kernfs_open_file *of)
> +				struct kernfs_open_file *of, bool final)

Adding flags to functions like this are a pain, now we need to look it
up every time to see what that bool means.

And when we do, we see that it is not documented here so we have no idea
of what it is :(

This is not going to be maintainable as-is, sorry.

>  {
>  	/*
>  	 * @of is guaranteed to have no other file operations in flight and
> @@ -787,6 +787,8 @@ static void kernfs_release_file(struct kernfs_node *kn,
>  		of->released = true;
>  		of_on(of)->nr_to_release--;
>  	}
> +	if (final && kn->attr.ops->free)
> +		kn->attr.ops->free(of);
>  }
>  
>  static int kernfs_fop_release(struct inode *inode, struct file *filp)
> @@ -798,7 +800,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
>  		struct mutex *mutex;
>  
>  		mutex = kernfs_open_file_mutex_lock(kn);
> -		kernfs_release_file(kn, of);
> +		kernfs_release_file(kn, of, true);
>  		mutex_unlock(mutex);
>  	}
>  
> @@ -852,7 +854,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
>  		}
>  
>  		if (kn->flags & KERNFS_HAS_RELEASE)
> -			kernfs_release_file(kn, of);
> +			kernfs_release_file(kn, of, false);

Why isn't this also the "last" time things are touched here?  why is it
false?


>  	}
>  
>  	WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index 73f5c120def8..a7e404ff31bb 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -273,6 +273,11 @@ struct kernfs_ops {
>  	 */
>  	int (*open)(struct kernfs_open_file *of);
>  	void (*release)(struct kernfs_open_file *of);
> +	/*
> +	 * Free resources tied to the lifecycle of the file, like a
> +	 * waitqueue used for polling.
> +	 */
> +	void (*free)(struct kernfs_open_file *of);

I agree with Tejun, this needs to be documented much better and show how
you really should never need to use this :)

thanks,

greg k-h
Christian Brauner June 27, 2023, 8:24 a.m. UTC | #5
On Mon, Jun 26, 2023 at 10:31:49AM -1000, Tejun Heo wrote:
> On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > index 73f5c120def8..a7e404ff31bb 100644
> > --- a/include/linux/kernfs.h
> > +++ b/include/linux/kernfs.h
> > @@ -273,6 +273,11 @@ struct kernfs_ops {
> >  	 */
> >  	int (*open)(struct kernfs_open_file *of);
> >  	void (*release)(struct kernfs_open_file *of);
> > +	/*
> > +	 * Free resources tied to the lifecycle of the file, like a
> > +	 * waitqueue used for polling.
> > +	 */
> > +	void (*free)(struct kernfs_open_file *of);
> 
> I think this can use a bit more commenting - ie. explain that release may be
> called earlier than the actual freeing of the file and how that can lead to
> problems. Othre than that, looks fine to me.

It seems the more natural thing to do would be to introduce a ->drain()
operation and order it before ->release(), no?
Suren Baghdasaryan June 27, 2023, 5:03 p.m. UTC | #6
On Mon, Jun 26, 2023 at 11:25 PM Greg KH <gregkh@linuxfoundation.org> wrote:
>
> On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > kernfs_ops.release operation can be called from kernfs_drain_open_files
> > which is not tied to the file's real lifecycle. Introduce a new kernfs_ops
> > free operation which is called only when the last fput() of the file is
> > performed and therefore is strictly tied to the file's lifecycle. This
> > operation will be used for freeing resources tied to the file, like
> > waitqueues used for polling the file.
>
> This is confusing, shouldn't release be the "last" time the file is
> handled and then all resources attached to it freed?  Why do we need
> another callback, shouldn't release handle this?

That is what I thought too but apparently kernfs_drain_open_files()
can also cause ops->release to be called while the file keeps on
living (see details here:
https://lore.kernel.org/all/CAJuCfpFZ3B4530TgsSHqp5F_gwfrDujwRYewKReJru==MdEHQg@mail.gmail.com/#t).

>
>
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  fs/kernfs/file.c       | 8 +++++---
> >  include/linux/kernfs.h | 5 +++++
> >  2 files changed, 10 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> > index 40c4661f15b7..acc52d23d8f6 100644
> > --- a/fs/kernfs/file.c
> > +++ b/fs/kernfs/file.c
> > @@ -766,7 +766,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
> >
> >  /* used from release/drain to ensure that ->release() is called exactly once */
> >  static void kernfs_release_file(struct kernfs_node *kn,
> > -                             struct kernfs_open_file *of)
> > +                             struct kernfs_open_file *of, bool final)
>
> Adding flags to functions like this are a pain, now we need to look it
> up every time to see what that bool means.
>
> And when we do, we see that it is not documented here so we have no idea
> of what it is :(
>
> This is not going to be maintainable as-is, sorry.

It's a static function with only two places it's used in the same
file. I can add documentation too if that helps.

>
> >  {
> >       /*
> >        * @of is guaranteed to have no other file operations in flight and
> > @@ -787,6 +787,8 @@ static void kernfs_release_file(struct kernfs_node *kn,
> >               of->released = true;
> >               of_on(of)->nr_to_release--;
> >       }
> > +     if (final && kn->attr.ops->free)
> > +             kn->attr.ops->free(of);
> >  }
> >
> >  static int kernfs_fop_release(struct inode *inode, struct file *filp)
> > @@ -798,7 +800,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
> >               struct mutex *mutex;
> >
> >               mutex = kernfs_open_file_mutex_lock(kn);
> > -             kernfs_release_file(kn, of);
> > +             kernfs_release_file(kn, of, true);
> >               mutex_unlock(mutex);
> >       }
> >
> > @@ -852,7 +854,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
> >               }
> >
> >               if (kn->flags & KERNFS_HAS_RELEASE)
> > -                     kernfs_release_file(kn, of);
> > +                     kernfs_release_file(kn, of, false);
>
> Why isn't this also the "last" time things are touched here?  why is it
> false?

Because it's called from the context of the process doing rmdir() and
if another process has the file in the directory opened it will have
that file alive until it calls the last fput(). These are the call
paths:

do_rmdir
  cgroup_rmdir
    kernfs_drain_open_files
      kernfs_release_file(..., false)
        kn->attr.ops->release(), of->released=true

fput()
  kernfs_fop_release()
    kernfs_release_file(..., true), of->released==true,
kn->attr.ops->release() is not called.

So, when kn->attr.ops->release() is called by do_rmdir() the file is
still kept alive by another process holding a reference to the file.
It's a problem in our case because if we free the resources associated
with that file (waitqueue head) from inside our release() operation
then the ongoing poll() operation on that file will step on that freed
resource.

>
>
> >       }
> >
> >       WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > index 73f5c120def8..a7e404ff31bb 100644
> > --- a/include/linux/kernfs.h
> > +++ b/include/linux/kernfs.h
> > @@ -273,6 +273,11 @@ struct kernfs_ops {
> >        */
> >       int (*open)(struct kernfs_open_file *of);
> >       void (*release)(struct kernfs_open_file *of);
> > +     /*
> > +      * Free resources tied to the lifecycle of the file, like a
> > +      * waitqueue used for polling.
> > +      */
> > +     void (*free)(struct kernfs_open_file *of);
>
> I agree with Tejun, this needs to be documented much better and show how
> you really should never need to use this :)

I'll be happy not to use it if we can fix the release() to be called
only when the file is truly going away.
Thanks,
Suren.

>
> thanks,
>
> greg k-h
Suren Baghdasaryan June 27, 2023, 5:09 p.m. UTC | #7
On Tue, Jun 27, 2023 at 1:24 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Jun 26, 2023 at 10:31:49AM -1000, Tejun Heo wrote:
> > On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > > index 73f5c120def8..a7e404ff31bb 100644
> > > --- a/include/linux/kernfs.h
> > > +++ b/include/linux/kernfs.h
> > > @@ -273,6 +273,11 @@ struct kernfs_ops {
> > >      */
> > >     int (*open)(struct kernfs_open_file *of);
> > >     void (*release)(struct kernfs_open_file *of);
> > > +   /*
> > > +    * Free resources tied to the lifecycle of the file, like a
> > > +    * waitqueue used for polling.
> > > +    */
> > > +   void (*free)(struct kernfs_open_file *of);
> >
> > I think this can use a bit more commenting - ie. explain that release may be
> > called earlier than the actual freeing of the file and how that can lead to
> > problems. Othre than that, looks fine to me.
>
> It seems the more natural thing to do would be to introduce a ->drain()
> operation and order it before ->release(), no?

I assume you mean we should add a ->drain() operation and call it when
kernfs_drain_open_files()  causes kernfs_release_file()? That would
work but if any existing release() handler counts on the current
behavior (release() being called while draining) then we should find
and fix these. Hopefully they don't really depend on the current
behavior but I dunno.
Christian Brauner June 27, 2023, 5:23 p.m. UTC | #8
On Tue, Jun 27, 2023 at 10:03:15AM -0700, Suren Baghdasaryan wrote:
> On Mon, Jun 26, 2023 at 11:25 PM Greg KH <gregkh@linuxfoundation.org> wrote:
> >
> > On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > > kernfs_ops.release operation can be called from kernfs_drain_open_files
> > > which is not tied to the file's real lifecycle. Introduce a new kernfs_ops
> > > free operation which is called only when the last fput() of the file is
> > > performed and therefore is strictly tied to the file's lifecycle. This
> > > operation will be used for freeing resources tied to the file, like
> > > waitqueues used for polling the file.
> >
> > This is confusing, shouldn't release be the "last" time the file is
> > handled and then all resources attached to it freed?  Why do we need
> > another callback, shouldn't release handle this?
> 
> That is what I thought too but apparently kernfs_drain_open_files()
> can also cause ops->release to be called while the file keeps on
> living (see details here:
> https://lore.kernel.org/all/CAJuCfpFZ3B4530TgsSHqp5F_gwfrDujwRYewKReJru==MdEHQg@mail.gmail.com/#t).
> 
> >
> >
> > >
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > ---
> > >  fs/kernfs/file.c       | 8 +++++---
> > >  include/linux/kernfs.h | 5 +++++
> > >  2 files changed, 10 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> > > index 40c4661f15b7..acc52d23d8f6 100644
> > > --- a/fs/kernfs/file.c
> > > +++ b/fs/kernfs/file.c
> > > @@ -766,7 +766,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
> > >
> > >  /* used from release/drain to ensure that ->release() is called exactly once */
> > >  static void kernfs_release_file(struct kernfs_node *kn,
> > > -                             struct kernfs_open_file *of)
> > > +                             struct kernfs_open_file *of, bool final)
> >
> > Adding flags to functions like this are a pain, now we need to look it
> > up every time to see what that bool means.
> >
> > And when we do, we see that it is not documented here so we have no idea
> > of what it is :(
> >
> > This is not going to be maintainable as-is, sorry.
> 
> It's a static function with only two places it's used in the same
> file. I can add documentation too if that helps.
> 
> >
> > >  {
> > >       /*
> > >        * @of is guaranteed to have no other file operations in flight and
> > > @@ -787,6 +787,8 @@ static void kernfs_release_file(struct kernfs_node *kn,
> > >               of->released = true;
> > >               of_on(of)->nr_to_release--;
> > >       }
> > > +     if (final && kn->attr.ops->free)
> > > +             kn->attr.ops->free(of);
> > >  }
> > >
> > >  static int kernfs_fop_release(struct inode *inode, struct file *filp)
> > > @@ -798,7 +800,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
> > >               struct mutex *mutex;
> > >
> > >               mutex = kernfs_open_file_mutex_lock(kn);
> > > -             kernfs_release_file(kn, of);
> > > +             kernfs_release_file(kn, of, true);
> > >               mutex_unlock(mutex);
> > >       }
> > >
> > > @@ -852,7 +854,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
> > >               }
> > >
> > >               if (kn->flags & KERNFS_HAS_RELEASE)
> > > -                     kernfs_release_file(kn, of);
> > > +                     kernfs_release_file(kn, of, false);
> >
> > Why isn't this also the "last" time things are touched here?  why is it
> > false?
> 
> Because it's called from the context of the process doing rmdir() and
> if another process has the file in the directory opened it will have
> that file alive until it calls the last fput(). These are the call
> paths:
> 
> do_rmdir
>   cgroup_rmdir
>     kernfs_drain_open_files
>       kernfs_release_file(..., false)
>         kn->attr.ops->release(), of->released=true

This seems weird to me. Why would that trigger a ->release() call. In
general, calling ->release() kinda betrays the name.
So imho, this really wants to be a separate ->drain() or ->shutdown()
call (and seems conceptually related to f_op->flush()).

> 
> fput()
>   kernfs_fop_release()
>     kernfs_release_file(..., true), of->released==true,
> kn->attr.ops->release() is not called.
Christian Brauner June 27, 2023, 5:30 p.m. UTC | #9
On Tue, Jun 27, 2023 at 10:09:27AM -0700, Suren Baghdasaryan wrote:
> On Tue, Jun 27, 2023 at 1:24 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Mon, Jun 26, 2023 at 10:31:49AM -1000, Tejun Heo wrote:
> > > On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > > > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > > > index 73f5c120def8..a7e404ff31bb 100644
> > > > --- a/include/linux/kernfs.h
> > > > +++ b/include/linux/kernfs.h
> > > > @@ -273,6 +273,11 @@ struct kernfs_ops {
> > > >      */
> > > >     int (*open)(struct kernfs_open_file *of);
> > > >     void (*release)(struct kernfs_open_file *of);
> > > > +   /*
> > > > +    * Free resources tied to the lifecycle of the file, like a
> > > > +    * waitqueue used for polling.
> > > > +    */
> > > > +   void (*free)(struct kernfs_open_file *of);
> > >
> > > I think this can use a bit more commenting - ie. explain that release may be
> > > called earlier than the actual freeing of the file and how that can lead to
> > > problems. Othre than that, looks fine to me.
> >
> > It seems the more natural thing to do would be to introduce a ->drain()
> > operation and order it before ->release(), no?
> 
> I assume you mean we should add a ->drain() operation and call it when
> kernfs_drain_open_files()  causes kernfs_release_file()? That would
> work but if any existing release() handler counts on the current
> behavior (release() being called while draining) then we should find
> and fix these. Hopefully they don't really depend on the current
> behavior but I dunno.

Before I wrote that I did a naive

        > git grep -A 20 kernfs_ops | grep \\.release
        kernel/cgroup/cgroup.c- .release                = cgroup_file_release,
        kernel/cgroup/cgroup.c- .release                = cgroup_file_release,

which only gave cgroup_release_file(). Might be I'm missing some convoluted
callchains though or macro magic...

->release() was added in

    commit 0e67db2f9fe91937e798e3d7d22c50a8438187e1
    kernfs: add kernfs_ops->open/release() callbacks

    Add ->open/release() methods to kernfs_ops.  ->open() is called when
    the file is opened and ->release() when the file is either released or
    severed.  These callbacks can be used, for example, to manage
    persistent caching objects over multiple seq_file iterations.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
    Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>


which mentions "either releases or severed" which imho already points to
separate methods.
Suren Baghdasaryan June 27, 2023, 5:36 p.m. UTC | #10
On Tue, Jun 27, 2023 at 10:30 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Jun 27, 2023 at 10:09:27AM -0700, Suren Baghdasaryan wrote:
> > On Tue, Jun 27, 2023 at 1:24 AM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Mon, Jun 26, 2023 at 10:31:49AM -1000, Tejun Heo wrote:
> > > > On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > > > > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > > > > index 73f5c120def8..a7e404ff31bb 100644
> > > > > --- a/include/linux/kernfs.h
> > > > > +++ b/include/linux/kernfs.h
> > > > > @@ -273,6 +273,11 @@ struct kernfs_ops {
> > > > >      */
> > > > >     int (*open)(struct kernfs_open_file *of);
> > > > >     void (*release)(struct kernfs_open_file *of);
> > > > > +   /*
> > > > > +    * Free resources tied to the lifecycle of the file, like a
> > > > > +    * waitqueue used for polling.
> > > > > +    */
> > > > > +   void (*free)(struct kernfs_open_file *of);
> > > >
> > > > I think this can use a bit more commenting - ie. explain that release may be
> > > > called earlier than the actual freeing of the file and how that can lead to
> > > > problems. Othre than that, looks fine to me.
> > >
> > > It seems the more natural thing to do would be to introduce a ->drain()
> > > operation and order it before ->release(), no?
> >
> > I assume you mean we should add a ->drain() operation and call it when
> > kernfs_drain_open_files()  causes kernfs_release_file()? That would
> > work but if any existing release() handler counts on the current
> > behavior (release() being called while draining) then we should find
> > and fix these. Hopefully they don't really depend on the current
> > behavior but I dunno.
>
> Before I wrote that I did a naive
>
>         > git grep -A 20 kernfs_ops | grep \\.release
>         kernel/cgroup/cgroup.c- .release                = cgroup_file_release,
>         kernel/cgroup/cgroup.c- .release                = cgroup_file_release,
>
> which only gave cgroup_release_file(). Might be I'm missing some convoluted
> callchains though or macro magic...
>
> ->release() was added in
>
>     commit 0e67db2f9fe91937e798e3d7d22c50a8438187e1
>     kernfs: add kernfs_ops->open/release() callbacks
>
>     Add ->open/release() methods to kernfs_ops.  ->open() is called when
>     the file is opened and ->release() when the file is either released or
>     severed.  These callbacks can be used, for example, to manage
>     persistent caching objects over multiple seq_file iterations.
>
>     Signed-off-by: Tejun Heo <tj@kernel.org>
>     Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>     Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
>
>
> which mentions "either releases or severed" which imho already points to
> separate methods.

Interesting. I guess we can add op->drain() and make all existing
handlers of ops->release() to handle ops->drain() with the same
handler. That should keep them happy and for my case I'll be releasing
resources only inside ops->release(). Does that sound good?

>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Matthew Wilcox (Oracle) June 27, 2023, 5:36 p.m. UTC | #11
On Tue, Jun 27, 2023 at 10:03:15AM -0700, Suren Baghdasaryan wrote:
> On Mon, Jun 26, 2023 at 11:25 PM Greg KH <gregkh@linuxfoundation.org> wrote:
> >
> > On Mon, Jun 26, 2023 at 01:17:12PM -0700, Suren Baghdasaryan wrote:
> > > kernfs_ops.release operation can be called from kernfs_drain_open_files
> > > which is not tied to the file's real lifecycle. Introduce a new kernfs_ops
> > > free operation which is called only when the last fput() of the file is
> > > performed and therefore is strictly tied to the file's lifecycle. This
> > > operation will be used for freeing resources tied to the file, like
> > > waitqueues used for polling the file.
> >
> > This is confusing, shouldn't release be the "last" time the file is
> > handled and then all resources attached to it freed?  Why do we need
> > another callback, shouldn't release handle this?
> 
> That is what I thought too but apparently kernfs_drain_open_files()
> can also cause ops->release to be called while the file keeps on
> living (see details here:
> https://lore.kernel.org/all/CAJuCfpFZ3B4530TgsSHqp5F_gwfrDujwRYewKReJru==MdEHQg@mail.gmail.com/#t).

If we're splitting these two functions apart, can we use the same naming
as the VFS please?

``flush``
        called by the close(2) system call to flush a file

``release``
        called when the last reference to an open file is closed
Tejun Heo June 27, 2023, 6:42 p.m. UTC | #12
Hello, Christian.

On Tue, Jun 27, 2023 at 07:30:26PM +0200, Christian Brauner wrote:
...
> ->release() was added in
> 
>     commit 0e67db2f9fe91937e798e3d7d22c50a8438187e1
>     kernfs: add kernfs_ops->open/release() callbacks
> 
>     Add ->open/release() methods to kernfs_ops.  ->open() is called when
>     the file is opened and ->release() when the file is either released or
>     severed.  These callbacks can be used, for example, to manage
>     persistent caching objects over multiple seq_file iterations.
> 
>     Signed-off-by: Tejun Heo <tj@kernel.org>
>     Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>     Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
> 
> which mentions "either releases or severed" which imho already points to
> separate methods.

This is because kernfs has revoking operation which doesn't exist for other
filesystems. Other filesystem implemenations can't just say "I'm done. Bye!"
and go away. Even if the underlying filesystem has completely failed, the
code still has to remain attached and keep aborting operations.

However, kernfs serves as the midlayer to a lot of device drivers and other
internal subsystems and it'd be really inconvenient for each of them to have
to implement "I want to go away but I gotta wait out this user who's holding
onto my tuning knob file". So, kernfs exposes a revoke or severing semantics
something that's exposing interface through kernfs wants to stop doing so.

If you look at it from file operation implementation POV, this seems exactly
like ->release. All open files are shutdown and there won't be any future
operations. After all, revoke is forced closing of all fd's. So, for most
users, treating severing just like ->release is the right thing to do.

The PSI file which caused this is a special case because it attaches
something to its kernfs file which outlives the severing operation bypassing
kernfs infra. A more complete way to fix this would be supporting the
required behavior from kernfs side, so that the PSI file operates on kernfs
interface which knows the severing event and detaches properly. That said,
currently, this is very much an one-off.

Suren, if you're interested, it might make sense to pipe poll through kernfs
properly so that it has its kernfs operation and kernfs can sever it. That
said, as this is a fix for something which is currently causing crashes,
it'd be better to merge this simpler fix first no matter what.

Thanks.
Suren Baghdasaryan June 27, 2023, 8:09 p.m. UTC | #13
On Tue, Jun 27, 2023 at 11:42 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, Christian.
>
> On Tue, Jun 27, 2023 at 07:30:26PM +0200, Christian Brauner wrote:
> ...
> > ->release() was added in
> >
> >     commit 0e67db2f9fe91937e798e3d7d22c50a8438187e1
> >     kernfs: add kernfs_ops->open/release() callbacks
> >
> >     Add ->open/release() methods to kernfs_ops.  ->open() is called when
> >     the file is opened and ->release() when the file is either released or
> >     severed.  These callbacks can be used, for example, to manage
> >     persistent caching objects over multiple seq_file iterations.
> >
> >     Signed-off-by: Tejun Heo <tj@kernel.org>
> >     Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> >     Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
> >
> > which mentions "either releases or severed" which imho already points to
> > separate methods.
>
> This is because kernfs has revoking operation which doesn't exist for other
> filesystems. Other filesystem implemenations can't just say "I'm done. Bye!"
> and go away. Even if the underlying filesystem has completely failed, the
> code still has to remain attached and keep aborting operations.
>
> However, kernfs serves as the midlayer to a lot of device drivers and other
> internal subsystems and it'd be really inconvenient for each of them to have
> to implement "I want to go away but I gotta wait out this user who's holding
> onto my tuning knob file". So, kernfs exposes a revoke or severing semantics
> something that's exposing interface through kernfs wants to stop doing so.
>
> If you look at it from file operation implementation POV, this seems exactly
> like ->release. All open files are shutdown and there won't be any future
> operations. After all, revoke is forced closing of all fd's. So, for most
> users, treating severing just like ->release is the right thing to do.
>
> The PSI file which caused this is a special case because it attaches
> something to its kernfs file which outlives the severing operation bypassing
> kernfs infra. A more complete way to fix this would be supporting the
> required behavior from kernfs side, so that the PSI file operates on kernfs
> interface which knows the severing event and detaches properly. That said,
> currently, this is very much an one-off.
>
> Suren, if you're interested, it might make sense to pipe poll through kernfs
> properly so that it has its kernfs operation and kernfs can sever it. That
> said, as this is a fix for something which is currently causing crashes,
> it'd be better to merge this simpler fix first no matter what.

I'm happy to implement the right fix if you go into more details.
AFAIKT kernfs_ops already has poll() operation, we are hooking
cgroup_file_poll() to it and using kernfs_generic_poll(). I thought
this is the right way to pipe poll through kernfs but if that's
incorrect, please let me know. I'm happy to fix that.
Thanks,
Suren.

>
> Thanks.

>
> --
> tejun
Suren Baghdasaryan June 27, 2023, 9:43 p.m. UTC | #14
On Tue, Jun 27, 2023 at 1:09 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jun 27, 2023 at 11:42 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello, Christian.
> >
> > On Tue, Jun 27, 2023 at 07:30:26PM +0200, Christian Brauner wrote:
> > ...
> > > ->release() was added in
> > >
> > >     commit 0e67db2f9fe91937e798e3d7d22c50a8438187e1
> > >     kernfs: add kernfs_ops->open/release() callbacks
> > >
> > >     Add ->open/release() methods to kernfs_ops.  ->open() is called when
> > >     the file is opened and ->release() when the file is either released or
> > >     severed.  These callbacks can be used, for example, to manage
> > >     persistent caching objects over multiple seq_file iterations.
> > >
> > >     Signed-off-by: Tejun Heo <tj@kernel.org>
> > >     Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > >     Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
> > >
> > > which mentions "either releases or severed" which imho already points to
> > > separate methods.
> >
> > This is because kernfs has revoking operation which doesn't exist for other
> > filesystems. Other filesystem implemenations can't just say "I'm done. Bye!"
> > and go away. Even if the underlying filesystem has completely failed, the
> > code still has to remain attached and keep aborting operations.
> >
> > However, kernfs serves as the midlayer to a lot of device drivers and other
> > internal subsystems and it'd be really inconvenient for each of them to have
> > to implement "I want to go away but I gotta wait out this user who's holding
> > onto my tuning knob file". So, kernfs exposes a revoke or severing semantics
> > something that's exposing interface through kernfs wants to stop doing so.
> >
> > If you look at it from file operation implementation POV, this seems exactly
> > like ->release. All open files are shutdown and there won't be any future
> > operations. After all, revoke is forced closing of all fd's. So, for most
> > users, treating severing just like ->release is the right thing to do.
> >
> > The PSI file which caused this is a special case because it attaches
> > something to its kernfs file which outlives the severing operation bypassing
> > kernfs infra. A more complete way to fix this would be supporting the
> > required behavior from kernfs side, so that the PSI file operates on kernfs
> > interface which knows the severing event and detaches properly. That said,
> > currently, this is very much an one-off.
> >
> > Suren, if you're interested, it might make sense to pipe poll through kernfs
> > properly so that it has its kernfs operation and kernfs can sever it. That
> > said, as this is a fix for something which is currently causing crashes,
> > it'd be better to merge this simpler fix first no matter what.
>
> I'm happy to implement the right fix if you go into more details.
> AFAIKT kernfs_ops already has poll() operation, we are hooking
> cgroup_file_poll() to it and using kernfs_generic_poll(). I thought
> this is the right way to pipe poll through kernfs but if that's
> incorrect, please let me know. I'm happy to fix that.

Ah, sorry, for PSI we are not using kernfs_generic_poll(), so my claim
misrepresents the situation. Let me look into how
kernfs_generic_poll() is implemented and maybe I can find a better
solution for PSI.
Thanks,
Suren.

> Thanks,
> Suren.
>
> >
> > Thanks.
>
> >
> > --
> > tejun
Suren Baghdasaryan June 27, 2023, 9:58 p.m. UTC | #15
On Tue, Jun 27, 2023 at 2:43 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jun 27, 2023 at 1:09 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Jun 27, 2023 at 11:42 AM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > Hello, Christian.
> > >
> > > On Tue, Jun 27, 2023 at 07:30:26PM +0200, Christian Brauner wrote:
> > > ...
> > > > ->release() was added in
> > > >
> > > >     commit 0e67db2f9fe91937e798e3d7d22c50a8438187e1
> > > >     kernfs: add kernfs_ops->open/release() callbacks
> > > >
> > > >     Add ->open/release() methods to kernfs_ops.  ->open() is called when
> > > >     the file is opened and ->release() when the file is either released or
> > > >     severed.  These callbacks can be used, for example, to manage
> > > >     persistent caching objects over multiple seq_file iterations.
> > > >
> > > >     Signed-off-by: Tejun Heo <tj@kernel.org>
> > > >     Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > > >     Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
> > > >
> > > > which mentions "either releases or severed" which imho already points to
> > > > separate methods.
> > >
> > > This is because kernfs has revoking operation which doesn't exist for other
> > > filesystems. Other filesystem implemenations can't just say "I'm done. Bye!"
> > > and go away. Even if the underlying filesystem has completely failed, the
> > > code still has to remain attached and keep aborting operations.
> > >
> > > However, kernfs serves as the midlayer to a lot of device drivers and other
> > > internal subsystems and it'd be really inconvenient for each of them to have
> > > to implement "I want to go away but I gotta wait out this user who's holding
> > > onto my tuning knob file". So, kernfs exposes a revoke or severing semantics
> > > something that's exposing interface through kernfs wants to stop doing so.
> > >
> > > If you look at it from file operation implementation POV, this seems exactly
> > > like ->release. All open files are shutdown and there won't be any future
> > > operations. After all, revoke is forced closing of all fd's. So, for most
> > > users, treating severing just like ->release is the right thing to do.
> > >
> > > The PSI file which caused this is a special case because it attaches
> > > something to its kernfs file which outlives the severing operation bypassing
> > > kernfs infra. A more complete way to fix this would be supporting the
> > > required behavior from kernfs side, so that the PSI file operates on kernfs
> > > interface which knows the severing event and detaches properly. That said,
> > > currently, this is very much an one-off.
> > >
> > > Suren, if you're interested, it might make sense to pipe poll through kernfs
> > > properly so that it has its kernfs operation and kernfs can sever it. That
> > > said, as this is a fix for something which is currently causing crashes,
> > > it'd be better to merge this simpler fix first no matter what.
> >
> > I'm happy to implement the right fix if you go into more details.
> > AFAIKT kernfs_ops already has poll() operation, we are hooking
> > cgroup_file_poll() to it and using kernfs_generic_poll(). I thought
> > this is the right way to pipe poll through kernfs but if that's
> > incorrect, please let me know. I'm happy to fix that.
>
> Ah, sorry, for PSI we are not using kernfs_generic_poll(), so my claim
> misrepresents the situation. Let me look into how
> kernfs_generic_poll() is implemented and maybe I can find a better
> solution for PSI.

Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
waitqueue head for polling and kernfs_open_node is freed from inside
kernfs_unlink_open_file() which is called from kernfs_fop_release().
So, it is destroyed only when the last fput() is done, unlike the
ops->release() operation which we are using for destroying PSI
trigger's waitqueue. So, it seems we still need an operation which
would indicate that the file is truly going away.

Christian's suggestion to rename current ops->release() operation into
ops->drain() (or ops->flush() per Matthew's request) and introduce a
"new" ops->release() which is called only when the last fput() is done
seems sane to me. Would everyone be happy with that approach?


> Thanks,
> Suren.
>
> > Thanks,
> > Suren.
> >
> > >
> > > Thanks.
> >
> > >
> > > --
> > > tejun
Tejun Heo June 28, 2023, 1:54 a.m. UTC | #16
Hello,

On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> waitqueue head for polling and kernfs_open_node is freed from inside
> kernfs_unlink_open_file() which is called from kernfs_fop_release().
> So, it is destroyed only when the last fput() is done, unlike the
> ops->release() operation which we are using for destroying PSI
> trigger's waitqueue. So, it seems we still need an operation which
> would indicate that the file is truly going away.

If we want to stay consistent with how kernfs behaves w.r.t. severing, the
right thing to do would be preventing any future polling at severing and
waking up everyone currently waiting, which sounds fine from cgroup behavior
POV too.

Now, the challenge is designing an interface which is difficult to make
mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
is implemented without kernfs users doing anything, or at least make it
pretty obvious what the correct usage pattern is.

> Christian's suggestion to rename current ops->release() operation into
> ops->drain() (or ops->flush() per Matthew's request) and introduce a
> "new" ops->release() which is called only when the last fput() is done
> seems sane to me. Would everyone be happy with that approach?

I'm not sure I'd go there. The contract is that once ->release() is called,
the code backing that file can go away (e.g. rmmod'd). It really should
behave just like the last put from kernfs users' POV. For this specific fix,
it's safe because we know the ops is always built into the kernel and won't
go away but it'd be really bad if the interface says "this is a normal thing
to do". We'd be calling into rmmod'd text pages in no time.

So, I mean, even for temporary fix, we have to make it abundantly clear that
this is not for usual usage and can only be used if the code backing the ops
is built into the kernel and so on.

Thanks.
Suren Baghdasaryan June 28, 2023, 3:09 a.m. UTC | #17
On Tue, Jun 27, 2023 at 6:54 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> > Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> > waitqueue head for polling and kernfs_open_node is freed from inside
> > kernfs_unlink_open_file() which is called from kernfs_fop_release().
> > So, it is destroyed only when the last fput() is done, unlike the
> > ops->release() operation which we are using for destroying PSI
> > trigger's waitqueue. So, it seems we still need an operation which
> > would indicate that the file is truly going away.
>
> If we want to stay consistent with how kernfs behaves w.r.t. severing, the
> right thing to do would be preventing any future polling at severing and
> waking up everyone currently waiting, which sounds fine from cgroup behavior
> POV too.

That's actually what we are currently doing for PSI triggers.
->release() is handled by cgroup_pressure_release() which signals the
waiters, waits for RCU grace period to pass (per
https://elixir.bootlin.com/linux/latest/source/include/linux/wait.h#L258)
and then releases all the trigger resources including the waitqueue
head. However as reported in
https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com
this does not save us from the synchronous polling case:

                                                  do_select
                                                      vfs_poll
cgroup_pressure_release
    psi_trigger_destroy
        wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
        synchronize_rcu()
        kfree(t) -> frees waitqueue head
                                                     poll_freewait()
-> uses waitqueue head


This happens because we release the resources associated with the file
while there are still file users (the file's refcount is non-zero).
And that happens because kernfs can call ->release() before the last
fput().

>
> Now, the challenge is designing an interface which is difficult to make
> mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
> is implemented without kernfs users doing anything, or at least make it
> pretty obvious what the correct usage pattern is.
>
> > Christian's suggestion to rename current ops->release() operation into
> > ops->drain() (or ops->flush() per Matthew's request) and introduce a
> > "new" ops->release() which is called only when the last fput() is done
> > seems sane to me. Would everyone be happy with that approach?
>
> I'm not sure I'd go there. The contract is that once ->release() is called,
> the code backing that file can go away (e.g. rmmod'd). It really should
> behave just like the last put from kernfs users' POV.

I 100% agree with the above statement.

> For this specific fix,
> it's safe because we know the ops is always built into the kernel and won't
> go away but it'd be really bad if the interface says "this is a normal thing
> to do". We'd be calling into rmmod'd text pages in no time.
>
> So, I mean, even for temporary fix, we have to make it abundantly clear that
> this is not for usual usage and can only be used if the code backing the ops
> is built into the kernel and so on.

I think the root cause of this problem is that ->release() in kernfs
does not adhere to the common rule that ->release() is called only
when the file is going away and has no users left. Am I wrong?
Thanks,
Suren.

>
> Thanks.
>
> --
> tejun
Christian Brauner June 28, 2023, 7:26 a.m. UTC | #18
On Tue, Jun 27, 2023 at 08:09:46PM -0700, Suren Baghdasaryan wrote:
> On Tue, Jun 27, 2023 at 6:54 PM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello,
> >
> > On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> > > Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> > > waitqueue head for polling and kernfs_open_node is freed from inside
> > > kernfs_unlink_open_file() which is called from kernfs_fop_release().
> > > So, it is destroyed only when the last fput() is done, unlike the
> > > ops->release() operation which we are using for destroying PSI
> > > trigger's waitqueue. So, it seems we still need an operation which
> > > would indicate that the file is truly going away.
> >
> > If we want to stay consistent with how kernfs behaves w.r.t. severing, the
> > right thing to do would be preventing any future polling at severing and
> > waking up everyone currently waiting, which sounds fine from cgroup behavior
> > POV too.
> 
> That's actually what we are currently doing for PSI triggers.
> ->release() is handled by cgroup_pressure_release() which signals the
> waiters, waits for RCU grace period to pass (per
> https://elixir.bootlin.com/linux/latest/source/include/linux/wait.h#L258)
> and then releases all the trigger resources including the waitqueue
> head. However as reported in
> https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com
> this does not save us from the synchronous polling case:
> 
>                                                   do_select
>                                                       vfs_poll
> cgroup_pressure_release
>     psi_trigger_destroy
>         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
>         synchronize_rcu()
>         kfree(t) -> frees waitqueue head
>                                                      poll_freewait()
> -> uses waitqueue head
> 
> 
> This happens because we release the resources associated with the file
> while there are still file users (the file's refcount is non-zero).
> And that happens because kernfs can call ->release() before the last
> fput().
> 
> >
> > Now, the challenge is designing an interface which is difficult to make
> > mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
> > is implemented without kernfs users doing anything, or at least make it
> > pretty obvious what the correct usage pattern is.
> >
> > > Christian's suggestion to rename current ops->release() operation into
> > > ops->drain() (or ops->flush() per Matthew's request) and introduce a
> > > "new" ops->release() which is called only when the last fput() is done
> > > seems sane to me. Would everyone be happy with that approach?
> >
> > I'm not sure I'd go there. The contract is that once ->release() is called,
> > the code backing that file can go away (e.g. rmmod'd). It really should
> > behave just like the last put from kernfs users' POV.
> 
> I 100% agree with the above statement.
> 
> > For this specific fix,
> > it's safe because we know the ops is always built into the kernel and won't
> > go away but it'd be really bad if the interface says "this is a normal thing
> > to do". We'd be calling into rmmod'd text pages in no time.
> >
> > So, I mean, even for temporary fix, we have to make it abundantly clear that
> > this is not for usual usage and can only be used if the code backing the ops
> > is built into the kernel and so on.
> 
> I think the root cause of this problem is that ->release() in kernfs
> does not adhere to the common rule that ->release() is called only
> when the file is going away and has no users left. Am I wrong?

So imho, ultimately this all comes down to rmdir() having special
semantics in kernfs. On any regular filesystem an rmdir() on a directory
which is still referenced by a struct file doesn't trigger an
f_op->release() operation. It's just that directory is unlinked and
you get some sort of errno like ENOENT when you try to create new files
in there or whatever. The actual f_op->release) however is triggered
on last fput().

But in essence, kernfs treats an rmdir() operation as being equivalent
to a final fput() such that it somehow magically kills all file
references. And that's just wrong and not supported.
Suren Baghdasaryan June 28, 2023, 7:46 a.m. UTC | #19
On Wed, Jun 28, 2023 at 12:26 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Jun 27, 2023 at 08:09:46PM -0700, Suren Baghdasaryan wrote:
> > On Tue, Jun 27, 2023 at 6:54 PM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > Hello,
> > >
> > > On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> > > > Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> > > > waitqueue head for polling and kernfs_open_node is freed from inside
> > > > kernfs_unlink_open_file() which is called from kernfs_fop_release().
> > > > So, it is destroyed only when the last fput() is done, unlike the
> > > > ops->release() operation which we are using for destroying PSI
> > > > trigger's waitqueue. So, it seems we still need an operation which
> > > > would indicate that the file is truly going away.
> > >
> > > If we want to stay consistent with how kernfs behaves w.r.t. severing, the
> > > right thing to do would be preventing any future polling at severing and
> > > waking up everyone currently waiting, which sounds fine from cgroup behavior
> > > POV too.
> >
> > That's actually what we are currently doing for PSI triggers.
> > ->release() is handled by cgroup_pressure_release() which signals the
> > waiters, waits for RCU grace period to pass (per
> > https://elixir.bootlin.com/linux/latest/source/include/linux/wait.h#L258)
> > and then releases all the trigger resources including the waitqueue
> > head. However as reported in
> > https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com
> > this does not save us from the synchronous polling case:
> >
> >                                                   do_select
> >                                                       vfs_poll
> > cgroup_pressure_release
> >     psi_trigger_destroy
> >         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
> >         synchronize_rcu()
> >         kfree(t) -> frees waitqueue head
> >                                                      poll_freewait()
> > -> uses waitqueue head
> >
> >
> > This happens because we release the resources associated with the file
> > while there are still file users (the file's refcount is non-zero).
> > And that happens because kernfs can call ->release() before the last
> > fput().
> >
> > >
> > > Now, the challenge is designing an interface which is difficult to make
> > > mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
> > > is implemented without kernfs users doing anything, or at least make it
> > > pretty obvious what the correct usage pattern is.
> > >
> > > > Christian's suggestion to rename current ops->release() operation into
> > > > ops->drain() (or ops->flush() per Matthew's request) and introduce a
> > > > "new" ops->release() which is called only when the last fput() is done
> > > > seems sane to me. Would everyone be happy with that approach?
> > >
> > > I'm not sure I'd go there. The contract is that once ->release() is called,
> > > the code backing that file can go away (e.g. rmmod'd). It really should
> > > behave just like the last put from kernfs users' POV.
> >
> > I 100% agree with the above statement.
> >
> > > For this specific fix,
> > > it's safe because we know the ops is always built into the kernel and won't
> > > go away but it'd be really bad if the interface says "this is a normal thing
> > > to do". We'd be calling into rmmod'd text pages in no time.
> > >
> > > So, I mean, even for temporary fix, we have to make it abundantly clear that
> > > this is not for usual usage and can only be used if the code backing the ops
> > > is built into the kernel and so on.
> >
> > I think the root cause of this problem is that ->release() in kernfs
> > does not adhere to the common rule that ->release() is called only
> > when the file is going away and has no users left. Am I wrong?
>
> So imho, ultimately this all comes down to rmdir() having special
> semantics in kernfs. On any regular filesystem an rmdir() on a directory
> which is still referenced by a struct file doesn't trigger an
> f_op->release() operation. It's just that directory is unlinked and
> you get some sort of errno like ENOENT when you try to create new files
> in there or whatever. The actual f_op->release) however is triggered
> on last fput().
>
> But in essence, kernfs treats an rmdir() operation as being equivalent
> to a final fput() such that it somehow magically kills all file
> references. And that's just wrong and not supported.

Thanks for the explanation, Christian!
If kernfs is special and needs different rules for calling
f_op->release() then fine, but I need an operation which tells me
there are no users of the file so that I can free the resources.
What's the best way to do that?

>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Christian Brauner June 28, 2023, 8:41 a.m. UTC | #20
On Wed, Jun 28, 2023 at 12:46:43AM -0700, Suren Baghdasaryan wrote:
> On Wed, Jun 28, 2023 at 12:26 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Tue, Jun 27, 2023 at 08:09:46PM -0700, Suren Baghdasaryan wrote:
> > > On Tue, Jun 27, 2023 at 6:54 PM Tejun Heo <tj@kernel.org> wrote:
> > > >
> > > > Hello,
> > > >
> > > > On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> > > > > Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> > > > > waitqueue head for polling and kernfs_open_node is freed from inside
> > > > > kernfs_unlink_open_file() which is called from kernfs_fop_release().
> > > > > So, it is destroyed only when the last fput() is done, unlike the
> > > > > ops->release() operation which we are using for destroying PSI
> > > > > trigger's waitqueue. So, it seems we still need an operation which
> > > > > would indicate that the file is truly going away.
> > > >
> > > > If we want to stay consistent with how kernfs behaves w.r.t. severing, the
> > > > right thing to do would be preventing any future polling at severing and
> > > > waking up everyone currently waiting, which sounds fine from cgroup behavior
> > > > POV too.
> > >
> > > That's actually what we are currently doing for PSI triggers.
> > > ->release() is handled by cgroup_pressure_release() which signals the
> > > waiters, waits for RCU grace period to pass (per
> > > https://elixir.bootlin.com/linux/latest/source/include/linux/wait.h#L258)
> > > and then releases all the trigger resources including the waitqueue
> > > head. However as reported in
> > > https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com
> > > this does not save us from the synchronous polling case:
> > >
> > >                                                   do_select
> > >                                                       vfs_poll
> > > cgroup_pressure_release
> > >     psi_trigger_destroy
> > >         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
> > >         synchronize_rcu()
> > >         kfree(t) -> frees waitqueue head
> > >                                                      poll_freewait()
> > > -> uses waitqueue head
> > >
> > >
> > > This happens because we release the resources associated with the file
> > > while there are still file users (the file's refcount is non-zero).
> > > And that happens because kernfs can call ->release() before the last
> > > fput().
> > >
> > > >
> > > > Now, the challenge is designing an interface which is difficult to make
> > > > mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
> > > > is implemented without kernfs users doing anything, or at least make it
> > > > pretty obvious what the correct usage pattern is.
> > > >
> > > > > Christian's suggestion to rename current ops->release() operation into
> > > > > ops->drain() (or ops->flush() per Matthew's request) and introduce a
> > > > > "new" ops->release() which is called only when the last fput() is done
> > > > > seems sane to me. Would everyone be happy with that approach?
> > > >
> > > > I'm not sure I'd go there. The contract is that once ->release() is called,
> > > > the code backing that file can go away (e.g. rmmod'd). It really should
> > > > behave just like the last put from kernfs users' POV.
> > >
> > > I 100% agree with the above statement.
> > >
> > > > For this specific fix,
> > > > it's safe because we know the ops is always built into the kernel and won't

I don't know if this talks about kernfs_ops (likely) or talks about
f_ops but fyi for f_ops this isn't a problem. See fops_get() in
do_dentry_open() which takes a reference on the mode that provides the
fops. And debugfs - a little more elaborately - handles this as well.

> > > > go away but it'd be really bad if the interface says "this is a normal thing
> > > > to do". We'd be calling into rmmod'd text pages in no time.
> > > >
> > > > So, I mean, even for temporary fix, we have to make it abundantly clear that
> > > > this is not for usual usage and can only be used if the code backing the ops
> > > > is built into the kernel and so on.
> > >
> > > I think the root cause of this problem is that ->release() in kernfs
> > > does not adhere to the common rule that ->release() is called only
> > > when the file is going away and has no users left. Am I wrong?
> >
> > So imho, ultimately this all comes down to rmdir() having special
> > semantics in kernfs. On any regular filesystem an rmdir() on a directory
> > which is still referenced by a struct file doesn't trigger an
> > f_op->release() operation. It's just that directory is unlinked and
> > you get some sort of errno like ENOENT when you try to create new files
> > in there or whatever. The actual f_op->release) however is triggered
> > on last fput().
> >
> > But in essence, kernfs treats an rmdir() operation as being equivalent
> > to a final fput() such that it somehow magically kills all file
> > references. And that's just wrong and not supported.
> 
> Thanks for the explanation, Christian!
> If kernfs is special and needs different rules for calling
> f_op->release() then fine, but I need an operation which tells me
> there are no users of the file so that I can free the resources.
> What's the best way to do that?

Imho, if there's still someone with an fd referencing that file then
there's still a user. That's unlink() while holding an fd in a nutshell.

But generically, afaui what you seem to want is:

(1) a way to shutdown functionality provided by a kernfs node on removal
    of that node
(2) a way to release the resources of a kernfs node

So (2) is seemingly what kernfs_ops->release() is about but it's also
used for (1). So while I initially thought about ->drain() or whatever
it seems what you really want is for struct kernfs_ops to gain an
unlink()/remove()/rmdir() method(s). The method can be implemented by
interested callers and should be called when the kernfs node is removed.

And that's when you can shutdown any functionality without freeing the
resources.

Imho, if you add struct gimmegimme_ops with same names as f_op you
really to mirror them as close as possible otherwise you're asking for
problems.
Suren Baghdasaryan June 28, 2023, 4:28 p.m. UTC | #21
On Wed, Jun 28, 2023 at 1:41 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Wed, Jun 28, 2023 at 12:46:43AM -0700, Suren Baghdasaryan wrote:
> > On Wed, Jun 28, 2023 at 12:26 AM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Tue, Jun 27, 2023 at 08:09:46PM -0700, Suren Baghdasaryan wrote:
> > > > On Tue, Jun 27, 2023 at 6:54 PM Tejun Heo <tj@kernel.org> wrote:
> > > > >
> > > > > Hello,
> > > > >
> > > > > On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> > > > > > Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> > > > > > waitqueue head for polling and kernfs_open_node is freed from inside
> > > > > > kernfs_unlink_open_file() which is called from kernfs_fop_release().
> > > > > > So, it is destroyed only when the last fput() is done, unlike the
> > > > > > ops->release() operation which we are using for destroying PSI
> > > > > > trigger's waitqueue. So, it seems we still need an operation which
> > > > > > would indicate that the file is truly going away.
> > > > >
> > > > > If we want to stay consistent with how kernfs behaves w.r.t. severing, the
> > > > > right thing to do would be preventing any future polling at severing and
> > > > > waking up everyone currently waiting, which sounds fine from cgroup behavior
> > > > > POV too.
> > > >
> > > > That's actually what we are currently doing for PSI triggers.
> > > > ->release() is handled by cgroup_pressure_release() which signals the
> > > > waiters, waits for RCU grace period to pass (per
> > > > https://elixir.bootlin.com/linux/latest/source/include/linux/wait.h#L258)
> > > > and then releases all the trigger resources including the waitqueue
> > > > head. However as reported in
> > > > https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com
> > > > this does not save us from the synchronous polling case:
> > > >
> > > >                                                   do_select
> > > >                                                       vfs_poll
> > > > cgroup_pressure_release
> > > >     psi_trigger_destroy
> > > >         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
> > > >         synchronize_rcu()
> > > >         kfree(t) -> frees waitqueue head
> > > >                                                      poll_freewait()
> > > > -> uses waitqueue head
> > > >
> > > >
> > > > This happens because we release the resources associated with the file
> > > > while there are still file users (the file's refcount is non-zero).
> > > > And that happens because kernfs can call ->release() before the last
> > > > fput().
> > > >
> > > > >
> > > > > Now, the challenge is designing an interface which is difficult to make
> > > > > mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
> > > > > is implemented without kernfs users doing anything, or at least make it
> > > > > pretty obvious what the correct usage pattern is.
> > > > >
> > > > > > Christian's suggestion to rename current ops->release() operation into
> > > > > > ops->drain() (or ops->flush() per Matthew's request) and introduce a
> > > > > > "new" ops->release() which is called only when the last fput() is done
> > > > > > seems sane to me. Would everyone be happy with that approach?
> > > > >
> > > > > I'm not sure I'd go there. The contract is that once ->release() is called,
> > > > > the code backing that file can go away (e.g. rmmod'd). It really should
> > > > > behave just like the last put from kernfs users' POV.
> > > >
> > > > I 100% agree with the above statement.
> > > >
> > > > > For this specific fix,
> > > > > it's safe because we know the ops is always built into the kernel and won't
>
> I don't know if this talks about kernfs_ops (likely) or talks about
> f_ops but fyi for f_ops this isn't a problem. See fops_get() in
> do_dentry_open() which takes a reference on the mode that provides the
> fops. And debugfs - a little more elaborately - handles this as well.
>
> > > > > go away but it'd be really bad if the interface says "this is a normal thing
> > > > > to do". We'd be calling into rmmod'd text pages in no time.
> > > > >
> > > > > So, I mean, even for temporary fix, we have to make it abundantly clear that
> > > > > this is not for usual usage and can only be used if the code backing the ops
> > > > > is built into the kernel and so on.
> > > >
> > > > I think the root cause of this problem is that ->release() in kernfs
> > > > does not adhere to the common rule that ->release() is called only
> > > > when the file is going away and has no users left. Am I wrong?
> > >
> > > So imho, ultimately this all comes down to rmdir() having special
> > > semantics in kernfs. On any regular filesystem an rmdir() on a directory
> > > which is still referenced by a struct file doesn't trigger an
> > > f_op->release() operation. It's just that directory is unlinked and
> > > you get some sort of errno like ENOENT when you try to create new files
> > > in there or whatever. The actual f_op->release) however is triggered
> > > on last fput().
> > >
> > > But in essence, kernfs treats an rmdir() operation as being equivalent
> > > to a final fput() such that it somehow magically kills all file
> > > references. And that's just wrong and not supported.
> >
> > Thanks for the explanation, Christian!
> > If kernfs is special and needs different rules for calling
> > f_op->release() then fine, but I need an operation which tells me
> > there are no users of the file so that I can free the resources.
> > What's the best way to do that?
>
> Imho, if there's still someone with an fd referencing that file then
> there's still a user. That's unlink() while holding an fd in a nutshell.
>
> But generically, afaui what you seem to want is:
>
> (1) a way to shutdown functionality provided by a kernfs node on removal
>     of that node
> (2) a way to release the resources of a kernfs node
>
> So (2) is seemingly what kernfs_ops->release() is about but it's also
> used for (1). So while I initially thought about ->drain() or whatever
> it seems what you really want is for struct kernfs_ops to gain an
> unlink()/remove()/rmdir() method(s). The method can be implemented by
> interested callers and should be called when the kernfs node is removed.
>
> And that's when you can shutdown any functionality without freeing the
> resources.
>
> Imho, if you add struct gimmegimme_ops with same names as f_op you
> really to mirror them as close as possible otherwise you're asking for
> problems.

Thanks for the feedback!
To summarize my understanding of your proposal, you suggest adding new
kernfs_ops for the case you marked (1) and change ->release() to do
only (2). Please correct me if I misunderstood. Greg, Tejun, WDYT?
Christian Brauner June 28, 2023, 5:35 p.m. UTC | #22
On Wed, Jun 28, 2023 at 09:28:03AM -0700, Suren Baghdasaryan wrote:
> On Wed, Jun 28, 2023 at 1:41 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Wed, Jun 28, 2023 at 12:46:43AM -0700, Suren Baghdasaryan wrote:
> > > On Wed, Jun 28, 2023 at 12:26 AM Christian Brauner <brauner@kernel.org> wrote:
> > > >
> > > > On Tue, Jun 27, 2023 at 08:09:46PM -0700, Suren Baghdasaryan wrote:
> > > > > On Tue, Jun 27, 2023 at 6:54 PM Tejun Heo <tj@kernel.org> wrote:
> > > > > >
> > > > > > Hello,
> > > > > >
> > > > > > On Tue, Jun 27, 2023 at 02:58:08PM -0700, Suren Baghdasaryan wrote:
> > > > > > > Ok in kernfs_generic_poll() we are using kernfs_open_node.poll
> > > > > > > waitqueue head for polling and kernfs_open_node is freed from inside
> > > > > > > kernfs_unlink_open_file() which is called from kernfs_fop_release().
> > > > > > > So, it is destroyed only when the last fput() is done, unlike the
> > > > > > > ops->release() operation which we are using for destroying PSI
> > > > > > > trigger's waitqueue. So, it seems we still need an operation which
> > > > > > > would indicate that the file is truly going away.
> > > > > >
> > > > > > If we want to stay consistent with how kernfs behaves w.r.t. severing, the
> > > > > > right thing to do would be preventing any future polling at severing and
> > > > > > waking up everyone currently waiting, which sounds fine from cgroup behavior
> > > > > > POV too.
> > > > >
> > > > > That's actually what we are currently doing for PSI triggers.
> > > > > ->release() is handled by cgroup_pressure_release() which signals the
> > > > > waiters, waits for RCU grace period to pass (per
> > > > > https://elixir.bootlin.com/linux/latest/source/include/linux/wait.h#L258)
> > > > > and then releases all the trigger resources including the waitqueue
> > > > > head. However as reported in
> > > > > https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com
> > > > > this does not save us from the synchronous polling case:
> > > > >
> > > > >                                                   do_select
> > > > >                                                       vfs_poll
> > > > > cgroup_pressure_release
> > > > >     psi_trigger_destroy
> > > > >         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
> > > > >         synchronize_rcu()
> > > > >         kfree(t) -> frees waitqueue head
> > > > >                                                      poll_freewait()
> > > > > -> uses waitqueue head
> > > > >
> > > > >
> > > > > This happens because we release the resources associated with the file
> > > > > while there are still file users (the file's refcount is non-zero).
> > > > > And that happens because kernfs can call ->release() before the last
> > > > > fput().
> > > > >
> > > > > >
> > > > > > Now, the challenge is designing an interface which is difficult to make
> > > > > > mistake with. IOW, it'd be great if kernfs wraps poll call so that severing
> > > > > > is implemented without kernfs users doing anything, or at least make it
> > > > > > pretty obvious what the correct usage pattern is.
> > > > > >
> > > > > > > Christian's suggestion to rename current ops->release() operation into
> > > > > > > ops->drain() (or ops->flush() per Matthew's request) and introduce a
> > > > > > > "new" ops->release() which is called only when the last fput() is done
> > > > > > > seems sane to me. Would everyone be happy with that approach?
> > > > > >
> > > > > > I'm not sure I'd go there. The contract is that once ->release() is called,
> > > > > > the code backing that file can go away (e.g. rmmod'd). It really should
> > > > > > behave just like the last put from kernfs users' POV.
> > > > >
> > > > > I 100% agree with the above statement.
> > > > >
> > > > > > For this specific fix,
> > > > > > it's safe because we know the ops is always built into the kernel and won't
> >
> > I don't know if this talks about kernfs_ops (likely) or talks about
> > f_ops but fyi for f_ops this isn't a problem. See fops_get() in
> > do_dentry_open() which takes a reference on the mode that provides the
> > fops. And debugfs - a little more elaborately - handles this as well.
> >
> > > > > > go away but it'd be really bad if the interface says "this is a normal thing
> > > > > > to do". We'd be calling into rmmod'd text pages in no time.
> > > > > >
> > > > > > So, I mean, even for temporary fix, we have to make it abundantly clear that
> > > > > > this is not for usual usage and can only be used if the code backing the ops
> > > > > > is built into the kernel and so on.
> > > > >
> > > > > I think the root cause of this problem is that ->release() in kernfs
> > > > > does not adhere to the common rule that ->release() is called only
> > > > > when the file is going away and has no users left. Am I wrong?
> > > >
> > > > So imho, ultimately this all comes down to rmdir() having special
> > > > semantics in kernfs. On any regular filesystem an rmdir() on a directory
> > > > which is still referenced by a struct file doesn't trigger an
> > > > f_op->release() operation. It's just that directory is unlinked and
> > > > you get some sort of errno like ENOENT when you try to create new files
> > > > in there or whatever. The actual f_op->release) however is triggered
> > > > on last fput().
> > > >
> > > > But in essence, kernfs treats an rmdir() operation as being equivalent
> > > > to a final fput() such that it somehow magically kills all file
> > > > references. And that's just wrong and not supported.
> > >
> > > Thanks for the explanation, Christian!
> > > If kernfs is special and needs different rules for calling
> > > f_op->release() then fine, but I need an operation which tells me
> > > there are no users of the file so that I can free the resources.
> > > What's the best way to do that?
> >
> > Imho, if there's still someone with an fd referencing that file then
> > there's still a user. That's unlink() while holding an fd in a nutshell.
> >
> > But generically, afaui what you seem to want is:
> >
> > (1) a way to shutdown functionality provided by a kernfs node on removal
> >     of that node
> > (2) a way to release the resources of a kernfs node
> >
> > So (2) is seemingly what kernfs_ops->release() is about but it's also
> > used for (1). So while I initially thought about ->drain() or whatever
> > it seems what you really want is for struct kernfs_ops to gain an
> > unlink()/remove()/rmdir() method(s). The method can be implemented by
> > interested callers and should be called when the kernfs node is removed.
> >
> > And that's when you can shutdown any functionality without freeing the
> > resources.
> >
> > Imho, if you add struct gimmegimme_ops with same names as f_op you
> > really to mirror them as close as possible otherwise you're asking for
> > problems.
> 
> Thanks for the feedback!

Hopefully it helped more than it confused.

> To summarize my understanding of your proposal, you suggest adding new
> kernfs_ops for the case you marked (1) and change ->release() to do
> only (2). Please correct me if I misunderstood. Greg, Tejun, WDYT?

Yes. I can't claim to know all the intricate implementation details of
kernfs ofc but this seems sane to me.
Tejun Heo June 28, 2023, 5:58 p.m. UTC | #23
Hello,

On Wed, Jun 28, 2023 at 09:26:07AM +0200, Christian Brauner wrote:
> > I think the root cause of this problem is that ->release() in kernfs
> > does not adhere to the common rule that ->release() is called only
> > when the file is going away and has no users left. Am I wrong?
> 
> So imho, ultimately this all comes down to rmdir() having special
> semantics in kernfs. On any regular filesystem an rmdir() on a directory

Yeap, rmdir needs to revoke all the existing open files for kernfs to allow
the subsystem to disappear afterwards.

> which is still referenced by a struct file doesn't trigger an
> f_op->release() operation. It's just that directory is unlinked and
> you get some sort of errno like ENOENT when you try to create new files
> in there or whatever. The actual f_op->release) however is triggered
> on last fput().
> 
> But in essence, kernfs treats an rmdir() operation as being equivalent
> to a final fput() such that it somehow magically kills all file
> references. And that's just wrong and not supported.

It is not supported in linux vfs but kernfs users need it, so it's a
semantic implemented in kernfs, which does add some complications but that's
the cost we pay for solving the problem of allowing device drivers or
whatever backing kernfs to go away when they want to.

I'm not sure what classifying a behavior requirement as wrong means. Do you
mean that we shouldn't allow device drives to be unloaded if someone forget
to close a sysfs file?

Thanks.
Tejun Heo June 28, 2023, 6:02 p.m. UTC | #24
On Wed, Jun 28, 2023 at 07:35:20PM +0200, Christian Brauner wrote:
> > To summarize my understanding of your proposal, you suggest adding new
> > kernfs_ops for the case you marked (1) and change ->release() to do
> > only (2). Please correct me if I misunderstood. Greg, Tejun, WDYT?
> 
> Yes. I can't claim to know all the intricate implementation details of
> kernfs ofc but this seems sane to me.

This is going to be massively confusing for vast majority of kernfs users.
The contract kernfs provides is that you can tell kernfs that you want out
and then you can do so synchronously in a finite amount of time (you still
have to wait for in-flight operations to finish but that's under your
control). Adding an operation which outlives that contract as something
usual to use is guaranteed to lead to obscure future crnashes. For a
temporary fix, it's fine as long as it's marked clearly but please don't
make it something seemingly widely useable.

We have a long history of modules causing crashes because of this. The
severing semantics is not there just for fun.

Thanks.
Suren Baghdasaryan June 28, 2023, 6:18 p.m. UTC | #25
On Wed, Jun 28, 2023 at 11:02 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Wed, Jun 28, 2023 at 07:35:20PM +0200, Christian Brauner wrote:
> > > To summarize my understanding of your proposal, you suggest adding new
> > > kernfs_ops for the case you marked (1) and change ->release() to do
> > > only (2). Please correct me if I misunderstood. Greg, Tejun, WDYT?
> >
> > Yes. I can't claim to know all the intricate implementation details of
> > kernfs ofc but this seems sane to me.
>
> This is going to be massively confusing for vast majority of kernfs users.
> The contract kernfs provides is that you can tell kernfs that you want out
> and then you can do so synchronously in a finite amount of time (you still
> have to wait for in-flight operations to finish but that's under your
> control). Adding an operation which outlives that contract as something
> usual to use is guaranteed to lead to obscure future crnashes. For a
> temporary fix, it's fine as long as it's marked clearly but please don't
> make it something seemingly widely useable.
>
> We have a long history of modules causing crashes because of this. The
> severing semantics is not there just for fun.

I'm sure there are reasons things are working as they do today. Sounds
like we can't change the ->release() logic from what it is today...
Then the question is how do we fix this case needing to release a
resource which can be released only when there are no users of the
file? My original suggestion was to add a kernfs_ops operation which
would indicate there are no more users but that seems to be confusing.
Are there better ways to fix this issue?
Thanks,
Suren.

>
> Thanks.
>
> --
> tejun
Greg KH June 28, 2023, 6:42 p.m. UTC | #26
On Wed, Jun 28, 2023 at 11:18:20AM -0700, Suren Baghdasaryan wrote:
> On Wed, Jun 28, 2023 at 11:02 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > On Wed, Jun 28, 2023 at 07:35:20PM +0200, Christian Brauner wrote:
> > > > To summarize my understanding of your proposal, you suggest adding new
> > > > kernfs_ops for the case you marked (1) and change ->release() to do
> > > > only (2). Please correct me if I misunderstood. Greg, Tejun, WDYT?
> > >
> > > Yes. I can't claim to know all the intricate implementation details of
> > > kernfs ofc but this seems sane to me.
> >
> > This is going to be massively confusing for vast majority of kernfs users.
> > The contract kernfs provides is that you can tell kernfs that you want out
> > and then you can do so synchronously in a finite amount of time (you still
> > have to wait for in-flight operations to finish but that's under your
> > control). Adding an operation which outlives that contract as something
> > usual to use is guaranteed to lead to obscure future crnashes. For a
> > temporary fix, it's fine as long as it's marked clearly but please don't
> > make it something seemingly widely useable.
> >
> > We have a long history of modules causing crashes because of this. The
> > severing semantics is not there just for fun.
> 
> I'm sure there are reasons things are working as they do today. Sounds
> like we can't change the ->release() logic from what it is today...
> Then the question is how do we fix this case needing to release a
> resource which can be released only when there are no users of the
> file? My original suggestion was to add a kernfs_ops operation which
> would indicate there are no more users but that seems to be confusing.
> Are there better ways to fix this issue?

Just make sure that you really only remove the file when all users are
done with it?  Do you have control of that from the driver side?

But, why is this kernfs file so "special" that it must have this special
construct?  Why not do what all other files that handle polling do and
just remove and get out of there when done?

thanks,

greg k-h
Suren Baghdasaryan June 28, 2023, 8:12 p.m. UTC | #27
On Wed, Jun 28, 2023 at 11:42 AM Greg KH <gregkh@linuxfoundation.org> wrote:
>
> On Wed, Jun 28, 2023 at 11:18:20AM -0700, Suren Baghdasaryan wrote:
> > On Wed, Jun 28, 2023 at 11:02 AM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > On Wed, Jun 28, 2023 at 07:35:20PM +0200, Christian Brauner wrote:
> > > > > To summarize my understanding of your proposal, you suggest adding new
> > > > > kernfs_ops for the case you marked (1) and change ->release() to do
> > > > > only (2). Please correct me if I misunderstood. Greg, Tejun, WDYT?
> > > >
> > > > Yes. I can't claim to know all the intricate implementation details of
> > > > kernfs ofc but this seems sane to me.
> > >
> > > This is going to be massively confusing for vast majority of kernfs users.
> > > The contract kernfs provides is that you can tell kernfs that you want out
> > > and then you can do so synchronously in a finite amount of time (you still
> > > have to wait for in-flight operations to finish but that's under your
> > > control). Adding an operation which outlives that contract as something
> > > usual to use is guaranteed to lead to obscure future crnashes. For a
> > > temporary fix, it's fine as long as it's marked clearly but please don't
> > > make it something seemingly widely useable.
> > >
> > > We have a long history of modules causing crashes because of this. The
> > > severing semantics is not there just for fun.
> >
> > I'm sure there are reasons things are working as they do today. Sounds
> > like we can't change the ->release() logic from what it is today...
> > Then the question is how do we fix this case needing to release a
> > resource which can be released only when there are no users of the
> > file? My original suggestion was to add a kernfs_ops operation which
> > would indicate there are no more users but that seems to be confusing.
> > Are there better ways to fix this issue?
>
> Just make sure that you really only remove the file when all users are
> done with it?  Do you have control of that from the driver side?

I'm a bit confused. In my case it's not a driver, it's the cgroup
subsystem and the issue is not that we are removing the file while
there are other users. The issue is that kernfs today has no operation
which is called when the last user is gone. I need such an operation
to be able to free the resources knowing that no users are left.

>
> But, why is this kernfs file so "special" that it must have this special
> construct?  Why not do what all other files that handle polling do and
> just remove and get out of there when done?

AFAIU all other files that handle polling rely on f_op->release()
being called after all the users are gone, therefore they can safely
free their resources. However kernfs can call ->release() while there
are still active users of the file. I can't use that operation for
resource cleanup therefore I was suggesting to add a new operation
which would be called only after the last fput() and would guarantee
no users. Again, I'm not an expert in this, so there might be a better
way to handle it. Please advise.
Thanks,
Suren.

>
> thanks,
>
> greg k-h
Tejun Heo June 28, 2023, 8:34 p.m. UTC | #28
Hello, Suren.

On Wed, Jun 28, 2023 at 01:12:23PM -0700, Suren Baghdasaryan wrote:
> AFAIU all other files that handle polling rely on f_op->release()
> being called after all the users are gone, therefore they can safely
> free their resources. However kernfs can call ->release() while there
> are still active users of the file. I can't use that operation for
> resource cleanup therefore I was suggesting to add a new operation
> which would be called only after the last fput() and would guarantee
> no users. Again, I'm not an expert in this, so there might be a better
> way to handle it. Please advise.

So, w/ kernfs, the right thing to do is making sure that whatever is exposed
to the kernfs user is terminated on removal - ie. after kernfs_ops->release
is called, the ops table should be considered dead and there shouldn't be
anything left to clean up from the kernfs user side. You can add abstraction
kernfs so that kernfs can terminate the calls coming down from the higher
layers on its own. That's how every other operation is handled and what
should happen with the psi polling too.

Thanks.
Suren Baghdasaryan June 28, 2023, 9:50 p.m. UTC | #29
On Wed, Jun 28, 2023 at 1:34 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, Suren.
>
> On Wed, Jun 28, 2023 at 01:12:23PM -0700, Suren Baghdasaryan wrote:
> > AFAIU all other files that handle polling rely on f_op->release()
> > being called after all the users are gone, therefore they can safely
> > free their resources. However kernfs can call ->release() while there
> > are still active users of the file. I can't use that operation for
> > resource cleanup therefore I was suggesting to add a new operation
> > which would be called only after the last fput() and would guarantee
> > no users. Again, I'm not an expert in this, so there might be a better
> > way to handle it. Please advise.
>
> So, w/ kernfs, the right thing to do is making sure that whatever is exposed
> to the kernfs user is terminated on removal - ie. after kernfs_ops->release
> is called, the ops table should be considered dead and there shouldn't be
> anything left to clean up from the kernfs user side. You can add abstraction
> kernfs so that kernfs can terminate the calls coming down from the higher
> layers on its own. That's how every other operation is handled and what
> should happen with the psi polling too.

I'm not sure I understand. The waitqueue head we are freeing in
->release() can be accessed asynchronously and does not require any
kernfs_op call. Here is a recap of that race:

                                                do_select
                                                      vfs_poll
cgroup_pressure_release
    psi_trigger_destroy
        wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
        synchronize_rcu()
        kfree(t) -> frees waitqueue head
                                                     poll_freewait() -> UAF

Note that poll_freewait() is not part of any kernel_op, so I'm not
sure how adding an abstraction kernfs would help, but again, this is
new territory for me and I might be missing something.

On a different note, I think there might be an easy way to fix this.
What if psi triggers reuse kernfs_open_node->poll waitqueue head?
Since we are overriding the ->poll() method, that waitqueue head is
unused AFAIKT. And best of all, its lifecycle is tied to the file's
lifecycle, so it does not have the issue that trigger waitqueue head
has. In the trigger I could simply store a pointer to that waitqueue
and use it. Then in ->release() freeing trigger would not affect the
waitqueue at all. Does that sound sane?


>
> Thanks.
>
> --
> tejun
Suren Baghdasaryan June 30, 2023, 12:59 a.m. UTC | #30
On Wed, Jun 28, 2023 at 2:50 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Jun 28, 2023 at 1:34 PM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello, Suren.
> >
> > On Wed, Jun 28, 2023 at 01:12:23PM -0700, Suren Baghdasaryan wrote:
> > > AFAIU all other files that handle polling rely on f_op->release()
> > > being called after all the users are gone, therefore they can safely
> > > free their resources. However kernfs can call ->release() while there
> > > are still active users of the file. I can't use that operation for
> > > resource cleanup therefore I was suggesting to add a new operation
> > > which would be called only after the last fput() and would guarantee
> > > no users. Again, I'm not an expert in this, so there might be a better
> > > way to handle it. Please advise.
> >
> > So, w/ kernfs, the right thing to do is making sure that whatever is exposed
> > to the kernfs user is terminated on removal - ie. after kernfs_ops->release
> > is called, the ops table should be considered dead and there shouldn't be
> > anything left to clean up from the kernfs user side. You can add abstraction
> > kernfs so that kernfs can terminate the calls coming down from the higher
> > layers on its own. That's how every other operation is handled and what
> > should happen with the psi polling too.
>
> I'm not sure I understand. The waitqueue head we are freeing in
> ->release() can be accessed asynchronously and does not require any
> kernfs_op call. Here is a recap of that race:
>
>                                                 do_select
>                                                       vfs_poll
> cgroup_pressure_release
>     psi_trigger_destroy
>         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
>         synchronize_rcu()
>         kfree(t) -> frees waitqueue head
>                                                      poll_freewait() -> UAF
>
> Note that poll_freewait() is not part of any kernel_op, so I'm not
> sure how adding an abstraction kernfs would help, but again, this is
> new territory for me and I might be missing something.
>
> On a different note, I think there might be an easy way to fix this.
> What if psi triggers reuse kernfs_open_node->poll waitqueue head?
> Since we are overriding the ->poll() method, that waitqueue head is
> unused AFAIKT. And best of all, its lifecycle is tied to the file's
> lifecycle, so it does not have the issue that trigger waitqueue head
> has. In the trigger I could simply store a pointer to that waitqueue
> and use it. Then in ->release() freeing trigger would not affect the
> waitqueue at all. Does that sound sane?

I think this approach is much cleaner and I'm guessing that's in line
with what Tejun was describing (maybe it's exactly what he was telling
me but it took time for me to get it). Posted the patch implementing
this approach here:
https://lore.kernel.org/all/20230630005612.1014540-1-surenb@google.com/

>
>
> >
> > Thanks.
> >
> > --
> > tejun
Christian Brauner June 30, 2023, 8:21 a.m. UTC | #31
On Thu, Jun 29, 2023 at 05:59:07PM -0700, Suren Baghdasaryan wrote:
> On Wed, Jun 28, 2023 at 2:50 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Wed, Jun 28, 2023 at 1:34 PM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > Hello, Suren.
> > >
> > > On Wed, Jun 28, 2023 at 01:12:23PM -0700, Suren Baghdasaryan wrote:
> > > > AFAIU all other files that handle polling rely on f_op->release()
> > > > being called after all the users are gone, therefore they can safely
> > > > free their resources. However kernfs can call ->release() while there
> > > > are still active users of the file. I can't use that operation for
> > > > resource cleanup therefore I was suggesting to add a new operation
> > > > which would be called only after the last fput() and would guarantee
> > > > no users. Again, I'm not an expert in this, so there might be a better
> > > > way to handle it. Please advise.
> > >
> > > So, w/ kernfs, the right thing to do is making sure that whatever is exposed
> > > to the kernfs user is terminated on removal - ie. after kernfs_ops->release
> > > is called, the ops table should be considered dead and there shouldn't be
> > > anything left to clean up from the kernfs user side. You can add abstraction
> > > kernfs so that kernfs can terminate the calls coming down from the higher
> > > layers on its own. That's how every other operation is handled and what
> > > should happen with the psi polling too.
> >
> > I'm not sure I understand. The waitqueue head we are freeing in
> > ->release() can be accessed asynchronously and does not require any
> > kernfs_op call. Here is a recap of that race:
> >
> >                                                 do_select
> >                                                       vfs_poll
> > cgroup_pressure_release
> >     psi_trigger_destroy
> >         wake_up_pollfree(&t->event_wait) -> unblocks vfs_poll
> >         synchronize_rcu()
> >         kfree(t) -> frees waitqueue head
> >                                                      poll_freewait() -> UAF
> >
> > Note that poll_freewait() is not part of any kernel_op, so I'm not
> > sure how adding an abstraction kernfs would help, but again, this is
> > new territory for me and I might be missing something.
> >
> > On a different note, I think there might be an easy way to fix this.
> > What if psi triggers reuse kernfs_open_node->poll waitqueue head?
> > Since we are overriding the ->poll() method, that waitqueue head is
> > unused AFAIKT. And best of all, its lifecycle is tied to the file's
> > lifecycle, so it does not have the issue that trigger waitqueue head
> > has. In the trigger I could simply store a pointer to that waitqueue
> > and use it. Then in ->release() freeing trigger would not affect the
> > waitqueue at all. Does that sound sane?
> 
> I think this approach is much cleaner and I'm guessing that's in line
> with what Tejun was describing (maybe it's exactly what he was telling
> me but it took time for me to get it). Posted the patch implementing
> this approach here:
> https://lore.kernel.org/all/20230630005612.1014540-1-surenb@google.com/

I'm sure that how things work today in kernfs are there for a reason.A

What I'm mostly reacting to is that there's a kernfs_ops->release()
method which mirrors f_op->release() but can be called when there are
still users which is counterintuitive for release semantics. And that
ultimately caused this UAF issue which was rather subtle given how long
it took to track down the root cause.

A rmdir() isn't triggering a f_op->release() if there are still file
references but it's apparently triggering a kernfs_ops->release(). It
feels like this should at least be documented in struct kernfs_ops...
Tejun Heo July 10, 2023, 8:38 p.m. UTC | #32
Hello,

On Fri, Jun 30, 2023 at 10:21:17AM +0200, Christian Brauner wrote:
> What I'm mostly reacting to is that there's a kernfs_ops->release()
> method which mirrors f_op->release() but can be called when there are
> still users which is counterintuitive for release semantics. And that
> ultimately caused this UAF issue which was rather subtle given how long
> it took to track down the root cause.
> 
> A rmdir() isn't triggering a f_op->release() if there are still file
> references but it's apparently triggering a kernfs_ops->release(). It
> feels like this should at least be documented in struct kernfs_ops...

Oh yeah, better documentation would be great. The core part here is that
kernfs is the layer which is implementing the revoke-like semantics
specifically to allow kernfs users (the ones that implement kernfs_ops) can
synchronously abort their involvement at will. So, from those users' POV,
->release is being called when it should be. The problem here was that PSI
was mixing objects from two layers with different lifetime rules, which
obviously causes issues.

As Suren's new fix shows, the fix is just using the matching object whose
lifetime is governed by kernfs. While this shows up in a subtle way for
poll, for all other operations, this is almost completely transprent.

Thanks.
diff mbox series

Patch

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 40c4661f15b7..acc52d23d8f6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -766,7 +766,7 @@  static int kernfs_fop_open(struct inode *inode, struct file *file)
 
 /* used from release/drain to ensure that ->release() is called exactly once */
 static void kernfs_release_file(struct kernfs_node *kn,
-				struct kernfs_open_file *of)
+				struct kernfs_open_file *of, bool final)
 {
 	/*
 	 * @of is guaranteed to have no other file operations in flight and
@@ -787,6 +787,8 @@  static void kernfs_release_file(struct kernfs_node *kn,
 		of->released = true;
 		of_on(of)->nr_to_release--;
 	}
+	if (final && kn->attr.ops->free)
+		kn->attr.ops->free(of);
 }
 
 static int kernfs_fop_release(struct inode *inode, struct file *filp)
@@ -798,7 +800,7 @@  static int kernfs_fop_release(struct inode *inode, struct file *filp)
 		struct mutex *mutex;
 
 		mutex = kernfs_open_file_mutex_lock(kn);
-		kernfs_release_file(kn, of);
+		kernfs_release_file(kn, of, true);
 		mutex_unlock(mutex);
 	}
 
@@ -852,7 +854,7 @@  void kernfs_drain_open_files(struct kernfs_node *kn)
 		}
 
 		if (kn->flags & KERNFS_HAS_RELEASE)
-			kernfs_release_file(kn, of);
+			kernfs_release_file(kn, of, false);
 	}
 
 	WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 73f5c120def8..a7e404ff31bb 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -273,6 +273,11 @@  struct kernfs_ops {
 	 */
 	int (*open)(struct kernfs_open_file *of);
 	void (*release)(struct kernfs_open_file *of);
+	/*
+	 * Free resources tied to the lifecycle of the file, like a
+	 * waitqueue used for polling.
+	 */
+	void (*free)(struct kernfs_open_file *of);
 
 	/*
 	 * Read is handled by either seq_file or raw_read().