diff mbox series

[v3,03/10] eventfs: adding eventfs dir add functions

Message ID 1685610013-33478-4-git-send-email-akaher@vmware.com (mailing list archive)
State Superseded
Headers show
Series tracing: introducing eventfs | expand

Commit Message

Ajay Kaher June 1, 2023, 9 a.m. UTC
Adding eventfs_file structure which will hold properties of file or dir.

Adding following functions to add dir in eventfs:

eventfs_create_events_dir() directly creates events dir with-in
tracing folder.

eventfs_add_subsystem_dir() adds the information of subsystem_dir to
eventfs and dynamically creates subsystem_dir as and when requires.

eventfs_add_dir() adds the information of dir (which is with-in
subsystem_dir) to eventfs and dynamically creates these dir as
and when requires.

Signed-off-by: Ajay Kaher <akaher@vmware.com>
Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Ching-lin Yu <chinglinyu@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com
---
 fs/tracefs/Makefile      |   1 +
 fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++
 include/linux/tracefs.h  |  29 +++++
 kernel/trace/trace.h     |   1 +
 4 files changed, 303 insertions(+)
 create mode 100644 fs/tracefs/event_inode.c

Comments

Steven Rostedt July 1, 2023, 1:54 p.m. UTC | #1
FYI, all subjects should start with a capital letter:

 "eventfs: Implement eventfs dir creation functions"

On Thu,  1 Jun 2023 14:30:06 +0530
Ajay Kaher <akaher@vmware.com> wrote:

> Adding eventfs_file structure which will hold properties of file or dir.
> 
> Adding following functions to add dir in eventfs:
> 
> eventfs_create_events_dir() directly creates events dir with-in

			"within" is a proper word.

> tracing folder.
> 
> eventfs_add_subsystem_dir() adds the information of subsystem_dir to
> eventfs and dynamically creates subsystem_dir as and when requires.

  "as and when requires" does not make sense.

> 
> eventfs_add_dir() adds the information of dir (which is with-in

   "within"

> subsystem_dir) to eventfs and dynamically creates these dir as
> and when requires.

I'm guessing you want to say:

	eventfs_add_dir() adds the information of the dir, within a
	subsystem_dir, to eventfs and dynamically creates these
	directories when they are accessed.

> 
> Signed-off-by: Ajay Kaher <akaher@vmware.com>
> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Tested-by: Ching-lin Yu <chinglinyu@google.com>
> Reported-by: kernel test robot <lkp@intel.com>
> Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com
> ---
>  fs/tracefs/Makefile      |   1 +
>  fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++
>  include/linux/tracefs.h  |  29 +++++
>  kernel/trace/trace.h     |   1 +
>  4 files changed, 303 insertions(+)
>  create mode 100644 fs/tracefs/event_inode.c
> 
> diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
> index 7c35a282b..73c56da8e 100644
> --- a/fs/tracefs/Makefile
> +++ b/fs/tracefs/Makefile
> @@ -1,5 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  tracefs-objs	:= inode.o
> +tracefs-objs	+= event_inode.o
>  
>  obj-$(CONFIG_TRACING)	+= tracefs.o
>  
> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
> new file mode 100644
> index 000000000..a48ce23c0
> --- /dev/null
> +++ b/fs/tracefs/event_inode.c
> @@ -0,0 +1,272 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + *  event_inode.c - part of tracefs, a pseudo file system for activating tracing
> + *
> + *  Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> + *  Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
> + *
> + *  eventfs is used to show trace events with one set of dentries
> + *
> + *  eventfs stores meta-data of files/dirs and skip to create object of
> + *  inodes/dentries. As and when requires, eventfs will create the
> + *  inodes/dentries for only required files/directories. Also eventfs
> + *  would delete the inodes/dentries once no more requires but preserve
> + *  the meta data.
> + */
> +#include <linux/fsnotify.h>
> +#include <linux/fs.h>
> +#include <linux/namei.h>
> +#include <linux/security.h>
> +#include <linux/tracefs.h>
> +#include <linux/kref.h>
> +#include <linux/delay.h>
> +#include "internal.h"
> +
> +/**
> + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
> + * @dentry: a pointer to dentry
> + *
> + * helper function to return crossponding eventfs_rwsem for given dentry
> + */
> +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
> +{
> +	if (S_ISDIR(dentry->d_inode->i_mode))
> +		return (struct rw_semaphore *)dentry->d_inode->i_private;
> +	else
> +		return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
> +}
> +
> +/**
> + * eventfs_down_read - acquire read lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to perform read lock. Nested locking requires because
> + * lookup(), release() requires read lock, these could be called directly
> + * or from open(), remove() which already hold the read/write lock.
> + */
> +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
> +{
> +	down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
> +}
> +
> +/**
> + * eventfs_up_read - release read lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to release eventfs_rwsem lock if locked
> + */
> +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
> +{
> +	up_read(eventfs_rwsem);
> +}
> +
> +/**
> + * eventfs_down_write - acquire write lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to perform write lock on eventfs_rwsem
> + */
> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
> +{
> +	while (!down_write_trylock(eventfs_rwsem))
> +		msleep(10);

What's this loop for? Something like that needs a very good explanation
in a comment. Loops like these are usually a sign of a workaround for a
bug in the design, or worse, simply hides an existing bug.

> +}
> +
> +/**
> + * eventfs_up_write - release write lock function
> + * @eventfs_rwsem: a pointer to rw_semaphore
> + *
> + * helper function to perform write lock on eventfs_rwsem
> + */
> +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
> +{
> +	up_write(eventfs_rwsem);
> +}
> +
> +static const struct file_operations eventfs_file_operations = {
> +};
> +
> +static const struct inode_operations eventfs_root_dir_inode_operations = {
> +};
> +
> +/**
> + * eventfs_prepare_ef - helper function to prepare eventfs_file
> + * @name: a pointer to a string containing the name of the file/directory
> + *        to create.
> + * @mode: the permission that the file should have.
> + * @fop: a pointer to a struct file_operations that should be used for
> + *        this file/directory.
> + * @iop: a pointer to a struct inode_operations that should be used for
> + *        this file/directory.
> + * @data: a pointer to something that the caller will want to get to later
> + *        on.  The inode.i_private pointer will point to this value on
> + *        the open() call.
> + *
> + * This function allocate the fill eventfs_file structure.

   "allocates and fills the" ?

> + */
> +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
> +					const struct file_operations *fop,
> +					const struct inode_operations *iop,
> +					void *data)
> +{
> +	struct eventfs_file *ef;
> +
> +	ef = kzalloc(sizeof(*ef), GFP_KERNEL);
> +	if (!ef)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ef->name = kstrdup(name, GFP_KERNEL);
> +	if (!ef->name) {
> +		kfree(ef);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	if (S_ISDIR(mode)) {
> +		ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
> +		if (!ef->ei) {
> +			kfree(ef->name);
> +			kfree(ef);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +		INIT_LIST_HEAD(&ef->ei->e_top_files);
> +	} else {
> +		ef->ei = NULL;
> +	}
> +
> +	ef->iop = iop;
> +	ef->fop = fop;
> +	ef->mode = mode;
> +	ef->data = data;
> +	ef->dentry = NULL;
> +	ef->d_parent = NULL;
> +	ef->created = false;

No need for the initialization to NULL or even the false, as the
kzalloc() already did that.

> +	return ef;
> +}
> +
> +/**
> + * eventfs_create_events_dir - create the trace event structure
> + * @name: a pointer to a string containing the name of the directory to
> + *        create.

You don't need to add "a pointer" we can see it's a pointer. Just say:

 * @name: The name of the directory to create

Adding more makes it confusing to read.

> + * @parent: a pointer to the parent dentry for this file.  This should be a
> + *          directory dentry if set.  If this parameter is NULL, then the
> + *          directory will be created in the root of the tracefs filesystem.
> + * @eventfs_rwsem: a pointer to rw_semaphore

Same with all the descriptions.


> + *
> + * This function creates the top of the trace event directory.
> + */
> +struct dentry *eventfs_create_events_dir(const char *name,
> +					 struct dentry *parent,
> +					 struct rw_semaphore *eventfs_rwsem)

OK, I'm going to have to really look at this. Passing in a lock to the
API is just broken. We need to find a way to solve this another way.

I'm about to board a plane to JFK shortly, I'm hoping to play with this
while flying back.

-- Steve


> +{
> +	struct dentry *dentry = tracefs_start_creating(name, parent);
> +	struct eventfs_inode *ei;
> +	struct tracefs_inode *ti;
> +	struct inode *inode;
> +
> +	if (IS_ERR(dentry))
> +		return dentry;
> +
> +	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
> +	if (!ei)
> +		return ERR_PTR(-ENOMEM);
> +	inode = tracefs_get_inode(dentry->d_sb);
> +	if (unlikely(!inode)) {
> +		kfree(ei);
> +		tracefs_failed_creating(dentry);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	init_rwsem(eventfs_rwsem);
> +	INIT_LIST_HEAD(&ei->e_top_files);
> +
> +	ti = get_tracefs(inode);
> +	ti->flags |= TRACEFS_EVENT_INODE;
> +	ti->private = ei;
> +
> +	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
> +	inode->i_op = &eventfs_root_dir_inode_operations;
> +	inode->i_fop = &eventfs_file_operations;
> +	inode->i_private = eventfs_rwsem;
> +
> +	/* directory inodes start off with i_nlink == 2 (for "." entry) */
> +	inc_nlink(inode);
> +	d_instantiate(dentry, inode);
> +	inc_nlink(dentry->d_parent->d_inode);
> +	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
> +	return tracefs_end_creating(dentry);
> +}
Ajay Kaher July 3, 2023, 10:13 a.m. UTC | #2
> On 01-Jul-2023, at 7:24 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> !! External Email
>
> FYI, all subjects should start with a capital letter:
>
> "eventfs: Implement eventfs dir creation functions"
>
> On Thu,  1 Jun 2023 14:30:06 +0530
> Ajay Kaher <akaher@vmware.com> wrote:
>
>> Adding eventfs_file structure which will hold properties of file or dir.
>>
>> Adding following functions to add dir in eventfs:
>>
>> eventfs_create_events_dir() directly creates events dir with-in
>
>                        "within" is a proper word.
>
>> tracing folder.
>>
>> eventfs_add_subsystem_dir() adds the information of subsystem_dir to
>> eventfs and dynamically creates subsystem_dir as and when requires.
>
>  "as and when requires" does not make sense.
>
>>
>> eventfs_add_dir() adds the information of dir (which is with-in
>
>   "within"
>
>> subsystem_dir) to eventfs and dynamically creates these dir as
>> and when requires.
>
> I'm guessing you want to say:
>
>        eventfs_add_dir() adds the information of the dir, within a
>        subsystem_dir, to eventfs and dynamically creates these
>        directories when they are accessed.
>
>>
>> Signed-off-by: Ajay Kaher <akaher@vmware.com>
>> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>> Tested-by: Ching-lin Yu <chinglinyu@google.com>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com
>> ---
>> fs/tracefs/Makefile      |   1 +
>> fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++
>> include/linux/tracefs.h  |  29 +++++
>> kernel/trace/trace.h     |   1 +
>> 4 files changed, 303 insertions(+)
>> create mode 100644 fs/tracefs/event_inode.c
>>
>> diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
>> index 7c35a282b..73c56da8e 100644
>> --- a/fs/tracefs/Makefile
>> +++ b/fs/tracefs/Makefile
>> @@ -1,5 +1,6 @@
>> # SPDX-License-Identifier: GPL-2.0-only
>> tracefs-objs := inode.o
>> +tracefs-objs += event_inode.o
>>
>> obj-$(CONFIG_TRACING)        += tracefs.o
>>
>> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
>> new file mode 100644
>> index 000000000..a48ce23c0
>> --- /dev/null
>> +++ b/fs/tracefs/event_inode.c
>> @@ -0,0 +1,272 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + *  event_inode.c - part of tracefs, a pseudo file system for activating tracing
>> + *
>> + *  Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
>> + *  Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
>> + *
>> + *  eventfs is used to show trace events with one set of dentries
>> + *
>> + *  eventfs stores meta-data of files/dirs and skip to create object of
>> + *  inodes/dentries. As and when requires, eventfs will create the
>> + *  inodes/dentries for only required files/directories. Also eventfs
>> + *  would delete the inodes/dentries once no more requires but preserve
>> + *  the meta data.
>> + */
>> +#include <linux/fsnotify.h>
>> +#include <linux/fs.h>
>> +#include <linux/namei.h>
>> +#include <linux/security.h>
>> +#include <linux/tracefs.h>
>> +#include <linux/kref.h>
>> +#include <linux/delay.h>
>> +#include "internal.h"
>> +
>> +/**
>> + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
>> + * @dentry: a pointer to dentry
>> + *
>> + * helper function to return crossponding eventfs_rwsem for given dentry
>> + */
>> +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
>> +{
>> +     if (S_ISDIR(dentry->d_inode->i_mode))
>> +             return (struct rw_semaphore *)dentry->d_inode->i_private;
>> +     else
>> +             return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
>> +}
>> +
>> +/**
>> + * eventfs_down_read - acquire read lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to perform read lock. Nested locking requires because
>> + * lookup(), release() requires read lock, these could be called directly
>> + * or from open(), remove() which already hold the read/write lock.
>> + */
>> +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
>> +{
>> +     down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
>> +}
>> +
>> +/**
>> + * eventfs_up_read - release read lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to release eventfs_rwsem lock if locked
>> + */
>> +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
>> +{
>> +     up_read(eventfs_rwsem);
>> +}
>> +
>> +/**
>> + * eventfs_down_write - acquire write lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to perform write lock on eventfs_rwsem
>> + */
>> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
>> +{
>> +     while (!down_write_trylock(eventfs_rwsem))
>> +             msleep(10);
>
> What's this loop for? Something like that needs a very good explanation
> in a comment. Loops like these are usually a sign of a workaround for a
> bug in the design, or worse, simply hides an existing bug.
>

Yes correct, this logic is to solve deadlock:

Thread 1                        Thread 2
down_read_nested()                                      - read lock acquired
                                down_write()            - waiting to acquire the write lock
down_read_nested()                                      - deadlock

The deadlock occurs because an rwsem does not allow a new read lock to be acquired while a
writer is waiting. down_write_trylock() does not add the writer to the wait queue, and hence
helps to prevent this deadlock scenario.

I was stuck on this deadlock, tried a few methods, and finally borrowed this approach from
cifs, since it is upstream, tested, and working there. Please refer to:
https://elixir.bootlin.com/linux/v6.3.1/source/fs/cifs/file.c#L438

Looking forward to your input. I will add an explanation in v4.


>> +}
>> +
>> +/**
>> + * eventfs_up_write - release write lock function
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>> + *
>> + * helper function to perform write lock on eventfs_rwsem
>> + */
>> +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
>> +{
>> +     up_write(eventfs_rwsem);
>> +}
>> +
>> +static const struct file_operations eventfs_file_operations = {
>> +};
>> +
>> +static const struct inode_operations eventfs_root_dir_inode_operations = {
>> +};
>> +
>> +/**
>> + * eventfs_prepare_ef - helper function to prepare eventfs_file
>> + * @name: a pointer to a string containing the name of the file/directory
>> + *        to create.
>> + * @mode: the permission that the file should have.
>> + * @fop: a pointer to a struct file_operations that should be used for
>> + *        this file/directory.
>> + * @iop: a pointer to a struct inode_operations that should be used for
>> + *        this file/directory.
>> + * @data: a pointer to something that the caller will want to get to later
>> + *        on.  The inode.i_private pointer will point to this value on
>> + *        the open() call.
>> + *
>> + * This function allocate the fill eventfs_file structure.
>
>   "allocates and fills the" ?
>
>> + */
>> +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
>> +                                     const struct file_operations *fop,
>> +                                     const struct inode_operations *iop,
>> +                                     void *data)
>> +{
>> +     struct eventfs_file *ef;
>> +
>> +     ef = kzalloc(sizeof(*ef), GFP_KERNEL);
>> +     if (!ef)
>> +             return ERR_PTR(-ENOMEM);
>> +
>> +     ef->name = kstrdup(name, GFP_KERNEL);
>> +     if (!ef->name) {
>> +             kfree(ef);
>> +             return ERR_PTR(-ENOMEM);
>> +     }
>> +
>> +     if (S_ISDIR(mode)) {
>> +             ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
>> +             if (!ef->ei) {
>> +                     kfree(ef->name);
>> +                     kfree(ef);
>> +                     return ERR_PTR(-ENOMEM);
>> +             }
>> +             INIT_LIST_HEAD(&ef->ei->e_top_files);
>> +     } else {
>> +             ef->ei = NULL;
>> +     }
>> +
>> +     ef->iop = iop;
>> +     ef->fop = fop;
>> +     ef->mode = mode;
>> +     ef->data = data;
>> +     ef->dentry = NULL;
>> +     ef->d_parent = NULL;
>> +     ef->created = false;
>
> No need for the initialization to NULL or even the false, as the
> kzalloc() already did that.
>
>> +     return ef;
>> +}
>> +
>> +/**
>> + * eventfs_create_events_dir - create the trace event structure
>> + * @name: a pointer to a string containing the name of the directory to
>> + *        create.
>
> You don't need to add "a pointer" we can see it's a pointer. Just say:
>
> * @name: The name of the directory to create
>
> Adding more makes it confusing to read.
>
>> + * @parent: a pointer to the parent dentry for this file.  This should be a
>> + *          directory dentry if set.  If this parameter is NULL, then the
>> + *          directory will be created in the root of the tracefs filesystem.
>> + * @eventfs_rwsem: a pointer to rw_semaphore
>
> Same with all the descriptions.
>
>
>> + *
>> + * This function creates the top of the trace event directory.
>> + */
>> +struct dentry *eventfs_create_events_dir(const char *name,
>> +                                      struct dentry *parent,
>> +                                      struct rw_semaphore *eventfs_rwsem)
>
> OK, I'm going to have to really look at this. Passing in a lock to the
> API is just broken. We need to find a way to solve this another way.

eventfs_rwsem is a member of struct trace_array, so I guess we should
pass a pointer to the trace_array instead.


> I'm about to board a plane to JFK shortly, I'm hoping to play with this
> while flying back.
>

I have replied to the major concerns. I will take care of all the other minor ones in v4.

Thanks a lot for giving time to eventfs patches.

- Ajay


> -- Steve
>
>
>> +{
>> +     struct dentry *dentry = tracefs_start_creating(name, parent);
>> +     struct eventfs_inode *ei;
>> +     struct tracefs_inode *ti;
>> +     struct inode *inode;
>> +
>> +     if (IS_ERR(dentry))
>> +             return dentry;
>> +
>> +     ei = kzalloc(sizeof(*ei), GFP_KERNEL);
>> +     if (!ei)
>> +             return ERR_PTR(-ENOMEM);
>> +     inode = tracefs_get_inode(dentry->d_sb);
>> +     if (unlikely(!inode)) {
>> +             kfree(ei);
>> +             tracefs_failed_creating(dentry);
>> +             return ERR_PTR(-ENOMEM);
>> +     }
>> +
>> +     init_rwsem(eventfs_rwsem);
>> +     INIT_LIST_HEAD(&ei->e_top_files);
>> +
>> +     ti = get_tracefs(inode);
>> +     ti->flags |= TRACEFS_EVENT_INODE;
>> +     ti->private = ei;
>> +
>> +     inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
>> +     inode->i_op = &eventfs_root_dir_inode_operations;
>> +     inode->i_fop = &eventfs_file_operations;
>> +     inode->i_private = eventfs_rwsem;
>> +
>> +     /* directory inodes start off with i_nlink == 2 (for "." entry) */
>> +     inc_nlink(inode);
>> +     d_instantiate(dentry, inode);
>> +     inc_nlink(dentry->d_parent->d_inode);
>> +     fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
>> +     return tracefs_end_creating(dentry);
>> +}
>
> !! External Email: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender.
Steven Rostedt July 3, 2023, 3:08 p.m. UTC | #3
On Mon, 3 Jul 2023 10:13:22 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> >> +/**
> >> + * eventfs_down_write - acquire write lock function
> >> + * @eventfs_rwsem: a pointer to rw_semaphore
> >> + *
> >> + * helper function to perform write lock on eventfs_rwsem
> >> + */
> >> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
> >> +{
> >> +     while (!down_write_trylock(eventfs_rwsem))
> >> +             msleep(10);  
> >
> > What's this loop for? Something like that needs a very good explanation
> > in a comment. Loops like these are usually a sign of a workaround for a
> > bug in the design, or worse, simply hides an existing bug.
> >  
> 
> Yes correct, this logic is to solve deadlock:
> 
> Thread 1                             Thread 2
> down_read_nested()                                 - read lock acquired
>                                          down_write()     - waiting for write lock to acquire
> down_read_nested()                                  - deadlock
> 
> Deadlock is because rwlock wouldn’t allow read lock to be acquired if write lock is waiting.
> down_write_trylock() wouldn’t add the write lock in waiting queue, hence helps to prevent
> deadlock scenario.
> 
> I was stuck with this Deadlock, tried few methods and finally borrowed from cifs, as it’s
> upstreamed, tested and working in cifs, please refer:
> https://elixir.bootlin.com/linux/v6.3.1/source/fs/cifs/file.c#L438

I just looked at that code and the commit, and I honestly believe that
is a horrible hack, and very fragile. It's in the smb code, so it is
unlikely to have been reviewed by anyone outside that subsystem. I
really do not want to proliferate that solution around the kernel. We
need to come up with something else.

I also think it's buggy (yes the cifs code is buggy!) because in the
comment above the down_read_nested() it says:

/*
 * nested locking. NOTE: rwsems are not allowed to recurse
 * (which occurs if the same task tries to acquire the same
 * lock instance multiple times), but multiple locks of the
 * same lock class might be taken, if the order of the locks
 * is always the same. This ordering rule can be expressed
 * to lockdep via the _nested() APIs, but enumerating the
 * subclasses that are used. (If the nesting relationship is
 * static then another method for expressing nested locking is
 * the explicit definition of lock class keys and the use of
 * lockdep_set_class() at lock initialization time.
 * See Documentation/locking/lockdep-design.rst for more details.)
 */

So this is NOT a solution (and the cifs code should be fixed too!)

Can you show me the exact backtrace where the reader lock gets taken
again? We will have to come up with a way to not take the same lock
twice.

We can also look to see if we can implement this with RCU. What exactly
is this rwsem protecting?


> 
> Looking further for your input. I will add explanation in v4.
> 
> 
> >> +}
> >> +

[..]

> >> + *
> >> + * This function creates the top of the trace event directory.
> >> + */
> >> +struct dentry *eventfs_create_events_dir(const char *name,
> >> +                                      struct dentry *parent,
> >> +                                      struct rw_semaphore *eventfs_rwsem)  
> >
> > OK, I'm going to have to really look at this. Passing in a lock to the
> > API is just broken. We need to find a way to solve this another way.  
> 
> eventfs_rwsem is a member of struct trace_array, I guess we should
> pass pointer to trace_array.

No, it should not be part of the trace_array. If we can't do this with
RCU, then we need to add a descriptor that contains the dentry that is
returned above, and have the lock held there. The caller of the
eventfs_create_events_dir() should not care about locking. That's an
implementation detail that should *not* be part of the API.

That is, if you need a lock:

struct eventfs_dentry {
	struct dentry		*dentry;
	struct rwsem		*rwsem;
};

And then get to that lock by using the container_of() macro. All
created eventfs dentries could have this structure, where the rwsem
points to the top one. Again, that's only if we can't do this with RCU.

-- Steve


> 
> 
> > I'm about to board a plane to JFK shortly, I'm hoping to play with this
> > while flying back.
> >  
> 
> I have replied for major concerns. All other minor I will take care in v4.
> 
> Thanks a lot for giving time to eventfs patches.
> 
> - Ajay
>
Ajay Kaher July 3, 2023, 6:51 p.m. UTC | #4
> On 03-Jul-2023, at 8:38 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> I just looked at that code and the commit, and I honestly believe that
> is a horrible hack, and very fragile. It's in the smb code, so it was
> unlikely reviewed by anyone outside that subsystem. I really do not
> want to prolificate that solution around the kernel. We need to come up
> with something else.
> 
> I also think it's buggy (yes the cifs code is buggy!) because in the
> comment above the down_read_nested() it says:
> 
> /*
> * nested locking. NOTE: rwsems are not allowed to recurse
> * (which occurs if the same task tries to acquire the same
> * lock instance multiple times), but multiple locks of the
> * same lock class might be taken, if the order of the locks
> * is always the same. This ordering rule can be expressed
> * to lockdep via the _nested() APIs, but enumerating the
> * subclasses that are used. (If the nesting relationship is
> * static then another method for expressing nested locking is
> * the explicit definition of lock class keys and the use of
> * lockdep_set_class() at lock initialization time.
> * See Documentation/locking/lockdep-design.rst for more details.)
> */
> 
> So this is NOT a solution (and the cifs code should be fixed too!)
> 
> Can you show me the exact backtrace where the reader lock gets taken
> again? We will have to come up with a way to not take the same lock
> twice.


[ 244.185505] eventfs_root_lookup+0x37/0x1f0                          <--- require read lock
[ 244.185509] __lookup_slow+0x72/0x100
[ 244.185511] lookup_one_len+0x6a/0x70
[ 244.185513] eventfs_start_creating+0x58/0xd0
[ 244.185515] ? security_locked_down+0x2e/0x50
[ 244.185518] eventfs_create_file+0x57/0x150
[ 244.185521] dcache_dir_open_wrapper+0x1c6/0x260             <--- require read lock
[ 244.185524] ? __pfx_dcache_dir_open_wrapper+0x10/0x10
[ 244.185526] do_dentry_open+0x1ed/0x420
[ 244.185529] vfs_open+0x2d/0x40


> 
> We can also look to see if we can implement this with RCU. What exactly
> is this rwsem protecting?
> 

- struct eventfs_file holds the meta-data for file or dir.
https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28
- eventfs_rwsem is supposed to protect the linked list made of struct eventfs_file
entries, as well as the members of each struct eventfs_file.

I tried one more solution, i.e. checking the owner of the lock:
static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
{
    return (struct task_struct *)
    (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
}

But rwsem_owner() is static, so it cannot be called from the eventfs code.

> 
>> 
>> Looking further for your input. I will add explanation in v4.
>> 
>> 
>>>> +}
>>>> +
> 
> [..]
> 
>>>> + *
>>>> + * This function creates the top of the trace event directory.
>>>> + */
>>>> +struct dentry *eventfs_create_events_dir(const char *name,
>>>> +                                      struct dentry *parent,
>>>> +                                      struct rw_semaphore *eventfs_rwsem)
>>> 
>>> OK, I'm going to have to really look at this. Passing in a lock to the
>>> API is just broken. We need to find a way to solve this another way.
>> 
>> eventfs_rwsem is a member of struct trace_array, I guess we should
>> pass pointer to trace_array.
> 
> No, it should not be part of the trace_array. If we can't do this with
> RCU, then we need to add a descriptor that contains the dentry that is
> returned above, and have the lock held there. The caller of the
> eventfs_create_events_dir() should not care about locking. That's an
> implementation detail that should *not* be part of the API.
> 
> That is, if you need a lock:
> 
> struct eventfs_dentry {
>        struct dentry           *dentry;
>        struct rwsem            *rwsem;
> };
> 
> And then get to that lock by using the container_of() macro. All
> created eventfs dentry's could have this structure, where the rwsem
> points to the top one. Again, that's only if we can't do this with RCU.

OK. Let's first fix the locking issue.

-Ajay
Steven Rostedt July 3, 2023, 7:52 p.m. UTC | #5
On Mon, 3 Jul 2023 18:51:22 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> > 
> > We can also look to see if we can implement this with RCU. What exactly
> > is this rwsem protecting?
> >   
> 
> - struct eventfs_file holds the meta-data for file or dir.
> https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28
> - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file
> ' and elements of struct eventfs_file.

RCU is usually the perfect solution for protecting linked lists, though. I'll
take a look at this when I get back to work.

-- Steve
Nadav Amit July 10, 2023, 2:17 a.m. UTC | #6
> On Jul 9, 2023, at 6:54 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> +       union {
> +               struct rcu_head         rcu;
> +               struct llist_node       llist;  /* For freeing after RCU */
> +       };

The memory savings from using a union might not be worth the potential impact
of type confusion and bugs.
Steven Rostedt July 10, 2023, 2:53 a.m. UTC | #7
On Mon, 10 Jul 2023 02:17:01 +0000
Nadav Amit <namit@vmware.com> wrote:

> > On Jul 9, 2023, at 6:54 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > +       union {
> > +               struct rcu_head         rcu;
> > +               struct llist_node       llist;  /* For freeing after RCU */
> > +       };  
> 
> The memory savings from using a union might not be worth the potential impact
> of type confusion and bugs.

It's also documentation. The two are related, as one is the hand off to
the other. It's not a random union, and I'd like to leave it that way.

-- Steve
Ajay Kaher July 10, 2023, 6:53 p.m. UTC | #8
> On 10-Jul-2023, at 7:24 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> !! External Email
>
> On Mon, 3 Jul 2023 15:52:26 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Mon, 3 Jul 2023 18:51:22 +0000
>> Ajay Kaher <akaher@vmware.com> wrote:
>>
>>>>
>>>> We can also look to see if we can implement this with RCU. What exactly
>>>> is this rwsem protecting?
>>>>
>>>
>>> - struct eventfs_file holds the meta-data for file or dir.
>>> https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28
>>> - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file
>>> ' and elements of struct eventfs_file.
>>
>> RCU is usually the perfect solution for protecting link lists though. I'll
>> take a look at this when I get back to work.
>>
>
> So I did the below patch on top of this series. If you could fold this
> into the appropriate patches, it should get us closer to an acceptable
> solution.
>
> What I did was:
>
> 1. Moved the struct eventfs_file and eventfs_inode into event_inode.c as it
>   really should not be exposed to all users.
>
> 2. Added a recursion check to eventfs_remove_rec() as it is really
>   dangerous to have unchecked recursion in the kernel (we do have a fixed
>   size stack).
>
> 3. Removed all the eventfs_rwsem code and replaced it with an srcu lock for
>   the readers, and a mutex to synchronize the writers of the list.
>
> 4. Added a eventfs_mutex that is used for the modifications of the
>   dentry itself (as well as modifying the list from 3 above).
>
> 5. Have the free use srcu callbacks. After the srcu grace periods are done,
>   it adds the eventfs_file onto a llist (lockless link list) and wakes up a
>   work queue. Then the work queue does the freeing (this needs to be done in
>   task/workqueue context, as srcu callbacks are done in softirq context).
>
> This appears to pass through some of my instance stress tests as well as
> the in tree ftrace selftests.
>

Awesome :)

I have manually applied the patch, and the ftracetest results are the same as for v3.
No more complaints from lockdep.

I will merge this into the appropriate patches of v3 and send v4 soon.

You have renamed eventfs_create_dir() to create_dir() and kept eventfs_create_dir()
as just a wrapper that takes the lock; the same goes for eventfs_create_file(). However,
these wrappers are not used anywhere, so I will drop them.

I was trying to have an independent lock for each instance of events, as a common lock
shared by every instance of events is not a must.

Something was broken in your mail (I guess the Cc list), and it either couldn't reach lkml or
was ignored by lkml. I just wanted to track the automated test results from linux-kselftest.

-Ajay


>
> ---
> fs/tracefs/event_inode.c    |  333 ++++++++++++++++++++++----------------------
> include/linux/tracefs.h     |   26 ---
> kernel/trace/trace.h        |    1
> kernel/trace/trace_events.c |    6
> 4 files changed, 179 insertions(+), 187 deletions(-)
>
> Index: linux-trace.git/fs/tracefs/event_inode.c
> ===================================================================
> --- linux-trace.git.orig/fs/tracefs/event_inode.c       2023-07-07 22:04:44.490812310 -0400
> +++ linux-trace.git/fs/tracefs/event_inode.c    2023-07-09 21:48:28.162874719 -0400
> @@ -16,71 +16,69 @@
> #include <linux/fsnotify.h>
> #include <linux/fs.h>
> #include <linux/namei.h>
> +#include <linux/workqueue.h>
> #include <linux/security.h>
> #include <linux/tracefs.h>
> #include <linux/kref.h>
> #include <linux/delay.h>
> #include "internal.h"
>
> -/**
> - * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
> - * @dentry: a pointer to dentry
> - *
> - * helper function to return crossponding eventfs_rwsem for given dentry
> - */
> -static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
> -{
> -       if (S_ISDIR(dentry->d_inode->i_mode))
> -               return (struct rw_semaphore *)dentry->d_inode->i_private;
> -       else
> -               return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
> -}
> +struct eventfs_inode {
> +       struct list_head                e_top_files;
> +};
>
> -/**
> - * eventfs_down_read - acquire read lock function
> - * @eventfs_rwsem: a pointer to rw_semaphore
> - *
> - * helper function to perform read lock. Nested locking requires because
> - * lookup(), release() requires read lock, these could be called directly
> - * or from open(), remove() which already hold the read/write lock.
> - */
> -static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
> -{
> -       down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
> -}
> +struct eventfs_file {
> +       const char                      *name;
> +       struct dentry                   *d_parent;
> +       struct dentry                   *dentry;
> +       struct list_head                list;
> +       struct eventfs_inode            *ei;
> +       const struct file_operations    *fop;
> +       const struct inode_operations   *iop;
> +       union {
> +               struct rcu_head         rcu;
> +               struct llist_node       llist;  /* For freeing after RCU */
> +       };
> +       void                            *data;
> +       umode_t                         mode;
> +       bool                            created;
> +};
>
> -/**
> - * eventfs_up_read - release read lock function
> - * @eventfs_rwsem: a pointer to rw_semaphore
> - *
> - * helper function to release eventfs_rwsem lock if locked
> - */
> -static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
> -{
> -       up_read(eventfs_rwsem);
> -}
> +static DEFINE_MUTEX(eventfs_mutex);
> +DEFINE_STATIC_SRCU(eventfs_srcu);
>
> -/**
> - * eventfs_down_write - acquire write lock function
> - * @eventfs_rwsem: a pointer to rw_semaphore
> - *
> - * helper function to perform write lock on eventfs_rwsem
> - */
> -static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
> +static struct dentry *create_file(const char *name, umode_t mode,
> +                                 struct dentry *parent, void *data,
> +                                 const struct file_operations *fop)
> {
> -       while (!down_write_trylock(eventfs_rwsem))
> -               msleep(10);
> -}
> +       struct tracefs_inode *ti;
> +       struct dentry *dentry;
> +       struct inode *inode;
>
> -/**
> - * eventfs_up_write - release write lock function
> - * @eventfs_rwsem: a pointer to rw_semaphore
> - *
> - * helper function to perform write lock on eventfs_rwsem
> - */
> -static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
> -{
> -       up_write(eventfs_rwsem);
> +       if (!(mode & S_IFMT))
> +               mode |= S_IFREG;
> +
> +       if (WARN_ON_ONCE(!S_ISREG(mode)))
> +               return NULL;
> +
> +       dentry = eventfs_start_creating(name, parent);
> +
> +       if (IS_ERR(dentry))
> +               return dentry;
> +
> +       inode = tracefs_get_inode(dentry->d_sb);
> +       if (unlikely(!inode))
> +               return eventfs_failed_creating(dentry);
> +
> +       inode->i_mode = mode;
> +       inode->i_fop = fop;
> +       inode->i_private = data;
> +
> +       ti = get_tracefs(inode);
> +       ti->flags |= TRACEFS_EVENT_INODE;
> +       d_instantiate(dentry, inode);
> +       fsnotify_create(dentry->d_parent->d_inode, dentry);
> +       return eventfs_end_creating(dentry);
> }
>
> /**
> @@ -111,21 +109,30 @@ static struct dentry *eventfs_create_fil
>                                          struct dentry *parent, void *data,
>                                          const struct file_operations *fop)
> {
> -       struct tracefs_inode *ti;
>        struct dentry *dentry;
> -       struct inode *inode;
>
>        if (security_locked_down(LOCKDOWN_TRACEFS))
>                return NULL;
>
> -       if (!(mode & S_IFMT))
> -               mode |= S_IFREG;
> +       mutex_lock(&eventfs_mutex);
> +       dentry = create_file(name, mode, parent, data, fop);
> +       mutex_unlock(&eventfs_mutex);
>
> -       if (WARN_ON_ONCE(!S_ISREG(mode)))
> -               return NULL;
> +       return dentry;
> +}
>
> -       dentry = eventfs_start_creating(name, parent);
> +static struct dentry *create_dir(const char *name, umode_t mode,
> +                                struct dentry *parent, void *data,
> +                                const struct file_operations *fop,
> +                                const struct inode_operations *iop)
> +{
> +       struct tracefs_inode *ti;
> +       struct dentry *dentry;
> +       struct inode *inode;
>
> +       WARN_ON(!S_ISDIR(mode));
> +
> +       dentry = eventfs_start_creating(name, parent);
>        if (IS_ERR(dentry))
>                return dentry;
>
> @@ -134,13 +141,17 @@ static struct dentry *eventfs_create_fil
>                return eventfs_failed_creating(dentry);
>
>        inode->i_mode = mode;
> +       inode->i_op = iop;
>        inode->i_fop = fop;
>        inode->i_private = data;
>
>        ti = get_tracefs(inode);
>        ti->flags |= TRACEFS_EVENT_INODE;
> +
> +       inc_nlink(inode);
>        d_instantiate(dentry, inode);
> -       fsnotify_create(dentry->d_parent->d_inode, dentry);
> +       inc_nlink(dentry->d_parent->d_inode);
> +       fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
>        return eventfs_end_creating(dentry);
> }
>
> @@ -175,37 +186,18 @@ static struct dentry *eventfs_create_dir
>                                         const struct file_operations *fop,
>                                         const struct inode_operations *iop)
> {
> -       struct tracefs_inode *ti;
>        struct dentry *dentry;
> -       struct inode *inode;
>
>        if (security_locked_down(LOCKDOWN_TRACEFS))
>                return NULL;
>
>        WARN_ON(!S_ISDIR(mode));
>
> -       dentry = eventfs_start_creating(name, parent);
> -
> -       if (IS_ERR(dentry))
> -               return dentry;
> -
> -       inode = tracefs_get_inode(dentry->d_sb);
> -       if (unlikely(!inode))
> -               return eventfs_failed_creating(dentry);
> +       mutex_lock(&eventfs_mutex);
> +       dentry = create_dir(name, mode, parent, data, fop, iop);
> +       mutex_unlock(&eventfs_mutex);
>
> -       inode->i_mode = mode;
> -       inode->i_op = iop;
> -       inode->i_fop = fop;
> -       inode->i_private = data;
> -
> -       ti = get_tracefs(inode);
> -       ti->flags |= TRACEFS_EVENT_INODE;
> -
> -       inc_nlink(inode);
> -       d_instantiate(dentry, inode);
> -       inc_nlink(dentry->d_parent->d_inode);
> -       fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
> -       return eventfs_end_creating(dentry);
> +       return dentry;
> }
>
> /**
> @@ -241,13 +233,14 @@ static void eventfs_post_create_dir(stru
> {
>        struct eventfs_file *ef_child;
>        struct tracefs_inode *ti;
> +       int idx;
>
> -       eventfs_down_read((struct rw_semaphore *) ef->data);
> +       /* srcu lock already held */
>        /* fill parent-child relation */
> -       list_for_each_entry(ef_child, &ef->ei->e_top_files, list) {
> +       list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
> +                                srcu_read_lock_held(&eventfs_srcu)) {
>                ef_child->d_parent = ef->dentry;
>        }
> -       eventfs_up_read((struct rw_semaphore *) ef->data);
>
>        ti = get_tracefs(ef->dentry->d_inode);
>        ti->private = ef->ei;
> @@ -271,40 +264,43 @@ static struct dentry *eventfs_root_looku
>        struct eventfs_inode *ei;
>        struct eventfs_file *ef;
>        struct dentry *ret = NULL;
> -       struct rw_semaphore *eventfs_rwsem;
> +       int idx;
>
>        ti = get_tracefs(dir);
>        if (!(ti->flags & TRACEFS_EVENT_INODE))
>                return NULL;
>
>        ei = ti->private;
> -       eventfs_rwsem = (struct rw_semaphore *) dir->i_private;
> -       eventfs_down_read(eventfs_rwsem);
> -       list_for_each_entry(ef, &ei->e_top_files, list) {
> +       idx = srcu_read_lock(&eventfs_srcu);
> +       list_for_each_entry_srcu(ef, &ei->e_top_files, list,
> +                                srcu_read_lock_held(&eventfs_srcu)) {
>                if (strcmp(ef->name, dentry->d_name.name))
>                        continue;
>                ret = simple_lookup(dir, dentry, flags);
>                if (ef->created)
>                        continue;
> +               mutex_lock(&eventfs_mutex);
>                ef->created = true;
>                if (ef->ei)
> -                       ef->dentry = eventfs_create_dir(ef->name, ef->mode, ef->d_parent,
> -                                                       ef->data, ef->fop, ef->iop);
> +                       ef->dentry = create_dir(ef->name, ef->mode, ef->d_parent,
> +                                               ef->data, ef->fop, ef->iop);
>                else
> -                       ef->dentry = eventfs_create_file(ef->name, ef->mode, ef->d_parent,
> -                                                        ef->data, ef->fop);
> +                       ef->dentry = create_file(ef->name, ef->mode, ef->d_parent,
> +                                                ef->data, ef->fop);
>
>                if (IS_ERR_OR_NULL(ef->dentry)) {
>                        ef->created = false;
> +                       mutex_unlock(&eventfs_mutex);
>                } else {
>                        if (ef->ei)
>                                eventfs_post_create_dir(ef);
>                        ef->dentry->d_fsdata = ef;
> +                       mutex_unlock(&eventfs_mutex);
>                        dput(ef->dentry);
>                }
>                break;
>        }
> -       eventfs_up_read(eventfs_rwsem);
> +       srcu_read_unlock(&eventfs_srcu, idx);
>        return ret;
> }
>
> @@ -318,21 +314,20 @@ static int eventfs_release(struct inode
>        struct tracefs_inode *ti;
>        struct eventfs_inode *ei;
>        struct eventfs_file *ef;
> -       struct dentry *dentry = file_dentry(file);
> -       struct rw_semaphore *eventfs_rwsem;
> +       int idx;
>
>        ti = get_tracefs(inode);
>        if (!(ti->flags & TRACEFS_EVENT_INODE))
>                return -EINVAL;
>
>        ei = ti->private;
> -       eventfs_rwsem = eventfs_dentry_to_rwsem(dentry);
> -       eventfs_down_read(eventfs_rwsem);
> -       list_for_each_entry(ef, &ei->e_top_files, list) {
> +       idx = srcu_read_lock(&eventfs_srcu);
> +       list_for_each_entry_srcu(ef, &ei->e_top_files, list,
> +                                srcu_read_lock_held(&eventfs_srcu)) {
>                if (ef->created)
>                        dput(ef->dentry);
>        }
> -       eventfs_up_read(eventfs_rwsem);
> +       srcu_read_unlock(&eventfs_srcu, idx);
>        return dcache_dir_close(inode, file);
> }
>
> @@ -352,30 +347,30 @@ static int dcache_dir_open_wrapper(struc
>        struct eventfs_file *ef;
>        struct inode *f_inode = file_inode(file);
>        struct dentry *dentry = file_dentry(file);
> -       struct rw_semaphore *eventfs_rwsem;
> +       int idx;
>
>        ti = get_tracefs(f_inode);
>        if (!(ti->flags & TRACEFS_EVENT_INODE))
>                return -EINVAL;
>
>        ei = ti->private;
> -       eventfs_rwsem = eventfs_dentry_to_rwsem(dentry);
> -       eventfs_down_read(eventfs_rwsem);
> -       list_for_each_entry(ef, &ei->e_top_files, list) {
> +       idx = srcu_read_lock(&eventfs_srcu);
> +       list_for_each_entry_rcu(ef, &ei->e_top_files, list) {
>                if (ef->created) {
>                        dget(ef->dentry);
>                        continue;
>                }
>
> +               mutex_lock(&eventfs_mutex);
>                ef->created = true;
>
>                inode_lock(dentry->d_inode);
>                if (ef->ei)
> -                       ef->dentry = eventfs_create_dir(ef->name, ef->mode, dentry,
> -                                                       ef->data, ef->fop, ef->iop);
> +                       ef->dentry = create_dir(ef->name, ef->mode, dentry,
> +                                               ef->data, ef->fop, ef->iop);
>                else
> -                       ef->dentry = eventfs_create_file(ef->name, ef->mode, dentry,
> -                                                        ef->data, ef->fop);
> +                       ef->dentry = create_file(ef->name, ef->mode, dentry,
> +                                                ef->data, ef->fop);
>                inode_unlock(dentry->d_inode);
>
>                if (IS_ERR_OR_NULL(ef->dentry)) {
> @@ -385,8 +380,9 @@ static int dcache_dir_open_wrapper(struc
>                                eventfs_post_create_dir(ef);
>                        ef->dentry->d_fsdata = ef;
>                }
> +               mutex_unlock(&eventfs_mutex);
>        }
> -       eventfs_up_read(eventfs_rwsem);
> +       srcu_read_unlock(&eventfs_srcu, idx);
>        return dcache_dir_open(inode, file);
> }
>
> @@ -463,13 +459,11 @@ static struct eventfs_file *eventfs_prep
>  * @parent: a pointer to the parent dentry for this file.  This should be a
>  *          directory dentry if set.  If this parameter is NULL, then the
>  *          directory will be created in the root of the tracefs filesystem.
> - * @eventfs_rwsem: a pointer to rw_semaphore
>  *
>  * This function creates the top of the trace event directory.
>  */
> struct dentry *eventfs_create_events_dir(const char *name,
> -                                        struct dentry *parent,
> -                                        struct rw_semaphore *eventfs_rwsem)
> +                                        struct dentry *parent)
> {
>        struct dentry *dentry = tracefs_start_creating(name, parent);
>        struct eventfs_inode *ei;
> @@ -489,7 +483,6 @@ struct dentry *eventfs_create_events_dir
>                return ERR_PTR(-ENOMEM);
>        }
>
> -       init_rwsem(eventfs_rwsem);
>        INIT_LIST_HEAD(&ei->e_top_files);
>
>        ti = get_tracefs(inode);
> @@ -499,7 +492,6 @@ struct dentry *eventfs_create_events_dir
>        inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
>        inode->i_op = &eventfs_root_dir_inode_operations;
>        inode->i_fop = &eventfs_file_operations;
> -       inode->i_private = eventfs_rwsem;
>
>        /* directory inodes start off with i_nlink == 2 (for "." entry) */
>        inc_nlink(inode);
> @@ -513,15 +505,13 @@ struct dentry *eventfs_create_events_dir
>  * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
>  * @name: a pointer to a string containing the name of the file to create.
>  * @parent: a pointer to the parent dentry for this dir.
> - * @eventfs_rwsem: a pointer to rw_semaphore
>  *
>  * This function adds eventfs subsystem dir to list.
>  * And all these dirs are created on the fly when they are looked up,
>  * and the dentry and inodes will be removed when they are done.
>  */
> struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
> -                                              struct dentry *parent,
> -                                              struct rw_semaphore *eventfs_rwsem)
> +                                              struct dentry *parent)
> {
>        struct tracefs_inode *ti_parent;
>        struct eventfs_inode *ei_parent;
> @@ -536,16 +526,15 @@ struct eventfs_file *eventfs_add_subsyst
>        ef = eventfs_prepare_ef(name,
>                S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
>                &eventfs_file_operations,
> -               &eventfs_root_dir_inode_operations,
> -               (void *) eventfs_rwsem);
> +               &eventfs_root_dir_inode_operations, NULL);
>
>        if (IS_ERR(ef))
>                return ef;
>
> -       eventfs_down_write(eventfs_rwsem);
> +       mutex_lock(&eventfs_mutex);
>        list_add_tail(&ef->list, &ei_parent->e_top_files);
>        ef->d_parent = parent;
> -       eventfs_up_write(eventfs_rwsem);
> +       mutex_unlock(&eventfs_mutex);
>        return ef;
> }
>
> @@ -553,15 +542,13 @@ struct eventfs_file *eventfs_add_subsyst
>  * eventfs_add_dir - add eventfs dir to list to create later
>  * @name: a pointer to a string containing the name of the file to create.
>  * @ef_parent: a pointer to the parent eventfs_file for this dir.
> - * @eventfs_rwsem: a pointer to rw_semaphore
>  *
>  * This function adds eventfs dir to list.
>  * And all these dirs are created on the fly when they are looked up,
>  * and the dentry and inodes will be removed when they are done.
>  */
> struct eventfs_file *eventfs_add_dir(const char *name,
> -                                    struct eventfs_file *ef_parent,
> -                                    struct rw_semaphore *eventfs_rwsem)
> +                                    struct eventfs_file *ef_parent)
> {
>        struct eventfs_file *ef;
>
> @@ -571,16 +558,15 @@ struct eventfs_file *eventfs_add_dir(con
>        ef = eventfs_prepare_ef(name,
>                S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
>                &eventfs_file_operations,
> -               &eventfs_root_dir_inode_operations,
> -               (void *) eventfs_rwsem);
> +               &eventfs_root_dir_inode_operations, NULL);
>
>        if (IS_ERR(ef))
>                return ef;
>
> -       eventfs_down_write(eventfs_rwsem);
> +       mutex_lock(&eventfs_mutex);
>        list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
>        ef->d_parent = ef_parent->dentry;
> -       eventfs_up_write(eventfs_rwsem);
> +       mutex_unlock(&eventfs_mutex);
>        return ef;
> }
>
> @@ -608,7 +594,6 @@ int eventfs_add_top_file(const char *nam
>        struct tracefs_inode *ti;
>        struct eventfs_inode *ei;
>        struct eventfs_file *ef;
> -       struct rw_semaphore *eventfs_rwsem;
>
>        if (!parent)
>                return -EINVAL;
> @@ -629,11 +614,10 @@ int eventfs_add_top_file(const char *nam
>        if (IS_ERR(ef))
>                return -ENOMEM;
>
> -       eventfs_rwsem = (struct rw_semaphore *) parent->d_inode->i_private;
> -       eventfs_down_write(eventfs_rwsem);
> +       mutex_lock(&eventfs_mutex);
>        list_add_tail(&ef->list, &ei->e_top_files);
>        ef->d_parent = parent;
> -       eventfs_up_write(eventfs_rwsem);
> +       mutex_unlock(&eventfs_mutex);
>        return 0;
> }
>
> @@ -658,7 +642,6 @@ int eventfs_add_file(const char *name, u
>                     const struct file_operations *fop)
> {
>        struct eventfs_file *ef;
> -       struct rw_semaphore *eventfs_rwsem;
>
>        if (!ef_parent)
>                return -EINVAL;
> @@ -670,14 +653,42 @@ int eventfs_add_file(const char *name, u
>        if (IS_ERR(ef))
>                return -ENOMEM;
>
> -       eventfs_rwsem = (struct rw_semaphore *) ef_parent->data;
> -       eventfs_down_write(eventfs_rwsem);
> +       mutex_lock(&eventfs_mutex);
>        list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
>        ef->d_parent = ef_parent->dentry;
> -       eventfs_up_write(eventfs_rwsem);
> +       mutex_unlock(&eventfs_mutex);
>        return 0;
> }
>
> +static LLIST_HEAD(free_list);
> +
> +static void eventfs_workfn(struct work_struct *work)
> +{
> +       struct eventfs_file *ef, *tmp;
> +       struct llist_node *llnode;
> +
> +       llnode = llist_del_all(&free_list);
> +       llist_for_each_entry_safe(ef, tmp, llnode, llist) {
> +               if (ef->created && ef->dentry)
> +                       dput(ef->dentry);
> +               kfree(ef->name);
> +               kfree(ef->ei);
> +               kfree(ef);
> +       }
> +}
> +
> +DECLARE_WORK(eventfs_work, eventfs_workfn);
> +
> +static void free_ef(struct rcu_head *head)
> +{
> +       struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
> +
> +       if (!llist_add(&ef->llist, &free_list))
> +               return;
> +
> +       queue_work(system_unbound_wq, &eventfs_work);
> +}
> +
> /**
>  * eventfs_remove_rec - remove eventfs dir or file from list
>  * @ef: a pointer to eventfs_file to be removed.
> @@ -685,51 +696,51 @@ int eventfs_add_file(const char *name, u
>  * This function recursively remove eventfs_file which
>  * contains info of file or dir.
>  */
> -static void eventfs_remove_rec(struct eventfs_file *ef)
> +static void eventfs_remove_rec(struct eventfs_file *ef, int level)
> {
> -       struct eventfs_file *ef_child, *n;
> +       struct eventfs_file *ef_child;
>
>        if (!ef)
>                return;
> +       /*
> +        * Check recursion depth. It should never be greater than 3:
> +        * 0 - events/
> +        * 1 - events/group/
> +        * 2 - events/group/event/
> +        * 3 - events/group/event/file
> +        */
> +       if (WARN_ON_ONCE(level > 3))
> +               return;
>
>        if (ef->ei) {
>                /* search for nested folders or files */
> -               list_for_each_entry_safe(ef_child, n, &ef->ei->e_top_files, list) {
> -                       eventfs_remove_rec(ef_child);
> +               list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
> +                                        lockdep_is_held(&eventfs_mutex)) {
> +                       eventfs_remove_rec(ef_child, level + 1);
>                }
> -               kfree(ef->ei);
>        }
>
> -       if (ef->created && ef->dentry) {
> +       if (ef->created && ef->dentry)
>                d_invalidate(ef->dentry);
> -               dput(ef->dentry);
> -       }
> -       list_del(&ef->list);
> -       kfree(ef->name);
> -       kfree(ef);
> +
> +       list_del_rcu(&ef->list);
> +       call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
> }
>
> /**
>  * eventfs_remove - remove eventfs dir or file from list
>  * @ef: a pointer to eventfs_file to be removed.
>  *
> - * This function acquire the eventfs_rwsem lock and call eventfs_remove_rec()
> + * This function acquire the eventfs_mutex lock and calls eventfs_remove_rec()
>  */
> void eventfs_remove(struct eventfs_file *ef)
> {
> -       struct rw_semaphore *eventfs_rwsem;
> -
>        if (!ef)
>                return;
>
> -       if (ef->ei)
> -               eventfs_rwsem = (struct rw_semaphore *) ef->data;
> -       else
> -               eventfs_rwsem = (struct rw_semaphore *) ef->d_parent->d_inode->i_private;
> -
> -       eventfs_down_write(eventfs_rwsem);
> -       eventfs_remove_rec(ef);
> -       eventfs_up_write(eventfs_rwsem);
> +       mutex_lock(&eventfs_mutex);
> +       eventfs_remove_rec(ef, 0);
> +       mutex_unlock(&eventfs_mutex);
> }
>
> /**
> Index: linux-trace.git/include/linux/tracefs.h
> ===================================================================
> --- linux-trace.git.orig/include/linux/tracefs.h        2023-07-07 22:04:44.490812310 -0400
> +++ linux-trace.git/include/linux/tracefs.h     2023-07-07 22:04:44.486812271 -0400
> @@ -21,22 +21,7 @@ struct file_operations;
>
> #ifdef CONFIG_TRACING
>
> -struct eventfs_inode {
> -       struct list_head                e_top_files;
> -};
> -
> -struct eventfs_file {
> -       const char                      *name;
> -       struct dentry                   *d_parent;
> -       struct dentry                   *dentry;
> -       struct list_head                list;
> -       struct eventfs_inode            *ei;
> -       const struct file_operations    *fop;
> -       const struct inode_operations   *iop;
> -       void                            *data;
> -       umode_t                         mode;
> -       bool                            created;
> -};
> +struct eventfs_file;
>
> struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
>
> @@ -45,16 +30,13 @@ struct dentry *eventfs_failed_creating(s
> struct dentry *eventfs_end_creating(struct dentry *dentry);
>
> struct dentry *eventfs_create_events_dir(const char *name,
> -                                        struct dentry *parent,
> -                                        struct rw_semaphore *eventfs_rwsem);
> +                                        struct dentry *parent);
>
> struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
> -                                              struct dentry *parent,
> -                                              struct rw_semaphore *eventfs_rwsem);
> +                                              struct dentry *parent);
>
> struct eventfs_file *eventfs_add_dir(const char *name,
> -                                    struct eventfs_file *ef_parent,
> -                                    struct rw_semaphore *eventfs_rwsem);
> +                                    struct eventfs_file *ef_parent);
>
> int eventfs_add_file(const char *name, umode_t mode,
>                     struct eventfs_file *ef_parent, void *data,
> Index: linux-trace.git/kernel/trace/trace.h
> ===================================================================
> --- linux-trace.git.orig/kernel/trace/trace.h   2023-07-07 22:04:44.490812310 -0400
> +++ linux-trace.git/kernel/trace/trace.h        2023-07-07 22:04:44.486812271 -0400
> @@ -359,7 +359,6 @@ struct trace_array {
>        struct dentry           *options;
>        struct dentry           *percpu_dir;
>        struct dentry           *event_dir;
> -       struct rw_semaphore     eventfs_rwsem;
>        struct trace_options    *topts;
>        struct list_head        systems;
>        struct list_head        events;
> Index: linux-trace.git/kernel/trace/trace_events.c
> ===================================================================
> --- linux-trace.git.orig/kernel/trace/trace_events.c    2023-07-07 22:04:44.490812310 -0400
> +++ linux-trace.git/kernel/trace/trace_events.c 2023-07-07 22:04:44.486812271 -0400
> @@ -2337,7 +2337,7 @@ event_subsystem_dir(struct trace_array *
>        } else
>                __get_system(system);
>
> -       dir->ef = eventfs_add_subsystem_dir(name, parent, &tr->eventfs_rwsem);
> +       dir->ef = eventfs_add_subsystem_dir(name, parent);
>        if (IS_ERR(dir->ef)) {
>                pr_warn("Failed to create system directory %s\n", name);
>                __put_system(system);
> @@ -2439,7 +2439,7 @@ event_create_dir(struct dentry *parent,
>                return -ENOMEM;
>
>        name = trace_event_name(call);
> -       file->ef = eventfs_add_dir(name, ef_subsystem, &tr->eventfs_rwsem);
> +       file->ef = eventfs_add_dir(name, ef_subsystem);
>        if (IS_ERR(file->ef)) {
>                pr_warn("Could not create tracefs '%s' directory\n", name);
>                return -1;
> @@ -3647,7 +3647,7 @@ create_event_toplevel_files(struct dentr
>        if (!entry)
>                return -ENOMEM;
>
> -       d_events = eventfs_create_events_dir("events", parent, &tr->eventfs_rwsem);
> +       d_events = eventfs_create_events_dir("events", parent);
>        if (IS_ERR(d_events)) {
>                pr_warn("Could not create tracefs 'events' directory\n");
>                return -ENOMEM;
>
> !! External Email: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender.
Steven Rostedt July 10, 2023, 7:06 p.m. UTC | #9
On Mon, 10 Jul 2023 18:53:53 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> > On 10-Jul-2023, at 7:24 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > !! External Email
> >
> > On Mon, 3 Jul 2023 15:52:26 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >  
> >> On Mon, 3 Jul 2023 18:51:22 +0000
> >> Ajay Kaher <akaher@vmware.com> wrote:
> >>  
> >>>>
> >>>> We can also look to see if we can implement this with RCU. What exactly
> >>>> is this rwsem protecting?
> >>>>  
> >>>
> >>> - struct eventfs_file holds the meta-data for file or dir.
> >>> https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28
> >>> - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file
> >>> ' and elements of struct eventfs_file.  
> >>
> >> RCU is usually the perfect solution for protecting link lists though. I'll
> >> take a look at this when I get back to work.
> >>  
> >
> > So I did the below patch on top of this series. If you could fold this
> > into the appropriate patches, it should get us closer to an acceptable
> > solution.
> >
> > What I did was:
> >
> > 1. Moved the struct eventfs_file and eventfs_inode into event_inode.c as it
> >   really should not be exposed to all users.
> >
> > 2. Added a recursion check to eventfs_remove_rec() as it is really
> >   dangerous to have unchecked recursion in the kernel (we do have a fixed
> >   size stack).
> >
> > 3. Removed all the eventfs_rwsem code and replaced it with an srcu lock for
> >   the readers, and a mutex to synchronize the writers of the list.
> >
> > 4. Added a eventfs_mutex that is used for the modifications of the
> >   dentry itself (as well as modifying the list from 3 above).
> >
> > 5. Have the free use srcu callbacks. After the srcu grace periods are done,
> >   it adds the eventfs_file onto a llist (lockless link list) and wakes up a
> >   work queue. Then the work queue does the freeing (this needs to be done in
> >   task/workqueue context, as srcu callbacks are done in softirq context).
> >
> > This appears to pass through some of my instance stress tests as well as
> > the in tree ftrace selftests.
> >  
> 
> Awesome :)
> 
> I have manually applied the patches and ftracetest results are same as v3.
> No more complains from lockdep.
> 
> I will merge this into appropriate patches of v3 and soon send v4.
> 
> You have renamed eventfs_create_dir() to create_dir(), and kept  eventfs_create_dir()
> just a wrapper with lock, same for eventfs_create_file(). However these wrapper no where
> used, I will drop these wrappers.

Ah, I thought that because they started with "eventfs_" that they were used
for some fops pointer. Note, I try to avoid using the "eventfs_" naming for
static functions that are not exported elsewhere.

> 
> I was trying to have independent lock for each instance of events. As common lock
> for every instance of events is not must.

We can find a way to make the lock for the root later. Let's get it working
first before we optimize it. I do not want to expose any locking to the
users of this interface.

> 
> Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or
> ignored by lkml. I just wanted to track the auto test results from linux-kselftest.

Yeah, claws-mail has an issue with some emails with quotes in it (sometimes
drops the second quote). Sad part is, it happens after I hit send, and it
is not part of the email. I'll send this reply now, but I bet it's going to happen again.

Let's see :-/  I checked the To and Cc's and they all have the proper
quotes. Let's see what ends up in my "Sent" folder.

-- Steve
Steven Rostedt July 10, 2023, 7:10 p.m. UTC | #10
On Mon, 10 Jul 2023 15:07:30 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 10 Jul 2023 15:06:06 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > > Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or
> > > ignored by lkml. I just wanted to track the auto test results from linux-kselftest.    
> > 
> > Yeah, claws-mail has an issue with some emails with quotes in it (sometimes
> > drops the second quote). Sad part is, it happens after I hit send, and it
> > is not part of the email. I'll send this reply now, but I bet it's going to happen again.
> > 
> > Let's see :-/  I checked the To and Cc's and they all have the proper
> > quotes. Let's see what ends up in my "Sent" folder.  
> 
> This time it worked!
> 

But this reply did not :-p

It was fine before I sent, but the email in my Sent folder shows:

Cc: "mhiramat@kernel.org" <mhiramat@kernel.org>, "shuah@kernel.org"  <shuah@kernel.org>, "linux-kernel@vger.kernel.org"  <linux-kernel@vger.kernel.org>, "linux-trace-kernel@vger.kernel.org\"          <linux-trace-kernel@vger.kernel.org>, "linux-kselftest@vger.kernel.org"  <linux-kselftest@vger.kernel.org>, Ching-lin Yu <chinglinyu@google.com>,  Nadav Amit <namit@vmware.com>, "srivatsa@csail.mit.edu"  <srivatsa@csail.mit.edu>, Alexey Makhalov <amakhalov@vmware.com>, Vasavi  Sirnapalli <vsirnapalli@vmware.com>, Tapas Kundu <tkundu@vmware.com>,  "er.ajay.kaher@gmail.com" <er.ajay.kaher@gmail.com>

Claws injected a backslash into:  "linux-trace-kernel@vger.kernel.org\"          <linux-trace-kernel@vger.kernel.org>

I have my own build of claws-mail, let me update it and perhaps this will
go away.

-- Steve
Steven Rostedt July 10, 2023, 8:15 p.m. UTC | #11
On Mon, 10 Jul 2023 15:06:06 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > 
> > Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or
> > ignored by lkml. I just wanted to track the auto test results from linux-kselftest.  
> 
> Yeah, claws-mail has an issue with some emails with quotes in it (sometimes
> drops the second quote). Sad part is, it happens after I hit send, and it
> is not part of the email. I'll send this reply now, but I bet it's going to happen again.
> 
> Let's see :-/  I checked the To and Cc's and they all have the proper
> quotes. Let's see what ends up in my "Sent" folder.

Sorry for the spam, but I just upgraded my claws-mail from 3.19.0 to 3.19.1
and I just want to see if it fails again.

-- Steve
Steven Rostedt July 10, 2023, 9:09 p.m. UTC | #12
On Mon, 10 Jul 2023 18:53:53 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or
> ignored by lkml. I just wanted to track the auto test results from linux-kselftest.

Anyway, I pushed your series plus this as a commit to:

  https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/log/?h=trace/rfc/eventfs

Which should trigger some of the zero-day bots.

I also ran all my testing with lockdep enabled and nothing triggered.

-- Steve
Steven Rostedt July 11, 2023, 2:24 p.m. UTC | #13
On Mon, 10 Jul 2023 18:53:53 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or
> ignored by lkml. I just wanted to track the auto test results from linux-kselftest.

Below is the report from the tree I pushed. I guess I forgot to remove an
"idx" variable, and it also caught the unused functions you mentioned.

-- Steve


tree:   git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace trace/rfc/eventfs
head:   1dc48374bb8ad8aec6d7244267f9b36e0512d3bb
commit: 1dc48374bb8ad8aec6d7244267f9b36e0512d3bb [28/28] tracefs: Add RCU and global mutex for eventfs
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20230711/202307111415.tc8g7M63-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce: (https://download.01.org/0day-ci/archive/20230711/202307111415.tc8g7M63-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202307111415.tc8g7M63-lkp@intel.com/

All warnings (new ones prefixed by >>):

   fs/tracefs/event_inode.c: In function 'eventfs_post_create_dir':
>> fs/tracefs/event_inode.c:236:13: warning: unused variable 'idx' [-Wunused-variable]  
     236 |         int idx;
         |             ^~~
   fs/tracefs/event_inode.c: At top level:
   fs/tracefs/event_inode.c:184:23: warning: 'eventfs_create_dir' defined but not used [-Wunused-function]
     184 | static struct dentry *eventfs_create_dir(const char *name, umode_t mode,
         |                       ^~~~~~~~~~~~~~~~~~
   fs/tracefs/event_inode.c:108:23: warning: 'eventfs_create_file' defined but not used [-Wunused-function]
     108 | static struct dentry *eventfs_create_file(const char *name, umode_t mode,
         |                       ^~~~~~~~~~~~~~~~~~~


vim +/idx +236 fs/tracefs/event_inode.c

   225	
   226	/**
   227	 * eventfs_post_create_dir - post create dir routine
   228	 * @ef: eventfs_file of recently created dir
   229	 *
   230	 * Files with-in eventfs dir should know dentry of parent dir
   231	 */
   232	static void eventfs_post_create_dir(struct eventfs_file *ef)
   233	{
   234		struct eventfs_file *ef_child;
   235		struct tracefs_inode *ti;
 > 236		int idx;  
   237	
   238		/* srcu lock already held */
   239		/* fill parent-child relation */
   240		list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
   241					 srcu_read_lock_held(&eventfs_srcu)) {
   242			ef_child->d_parent = ef->dentry;
   243		}
   244	
   245		ti = get_tracefs(ef->dentry->d_inode);
   246		ti->private = ef->ei;
   247	}
   248
diff mbox series

Patch

diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
index 7c35a282b..73c56da8e 100644
--- a/fs/tracefs/Makefile
+++ b/fs/tracefs/Makefile
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 tracefs-objs	:= inode.o
+tracefs-objs	+= event_inode.o
 
 obj-$(CONFIG_TRACING)	+= tracefs.o
 
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
new file mode 100644
index 000000000..a48ce23c0
--- /dev/null
+++ b/fs/tracefs/event_inode.c
@@ -0,0 +1,272 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  event_inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ *  Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ *  Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ *
+ *  eventfs is used to show trace events with one set of dentries
+ *
+ *  eventfs stores only the meta-data of files/dirs and skips creating the
+ *  inodes/dentries up front. As and when required, eventfs creates the
+ *  inodes/dentries for just the files/directories that are needed. eventfs
+ *  also deletes the inodes/dentries once they are no longer required, but
+ *  preserves the meta-data.
+ */
+#include <linux/fsnotify.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/tracefs.h>
+#include <linux/kref.h>
+#include <linux/delay.h>
+#include "internal.h"
+
+/**
+ * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem
+ * @dentry: a pointer to dentry
+ *
+ * Helper function to return the corresponding eventfs_rwsem for the given dentry.
+ */
+static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry)
+{
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		return (struct rw_semaphore *)dentry->d_inode->i_private;
+	else
+		return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private;
+}
+
+/**
+ * eventfs_down_read - acquire read lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * Helper function to take the read lock. Nested locking is required because
+ * lookup() and release() need the read lock, and they can be called directly
+ * or from open() and remove(), which already hold the read/write lock.
+ */
+static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem)
+{
+	down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING);
+}
+
+/**
+ * eventfs_up_read - release read lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * Helper function to release the read lock held on eventfs_rwsem.
+ */
+static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem)
+{
+	up_read(eventfs_rwsem);
+}
+
+/**
+ * eventfs_down_write - acquire write lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * helper function to perform write lock on eventfs_rwsem
+ */
+static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem)
+{
+	while (!down_write_trylock(eventfs_rwsem))
+		msleep(10);
+}
+
+/**
+ * eventfs_up_write - release write lock function
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * Helper function to release the write lock held on eventfs_rwsem.
+ */
+static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem)
+{
+	up_write(eventfs_rwsem);
+}
+
+static const struct file_operations eventfs_file_operations = {
+};
+
+static const struct inode_operations eventfs_root_dir_inode_operations = {
+};
+
+/**
+ * eventfs_prepare_ef - helper function to prepare eventfs_file
+ * @name: a pointer to a string containing the name of the file/directory
+ *        to create.
+ * @mode: the permission that the file should have.
+ * @fop: a pointer to a struct file_operations that should be used for
+ *        this file/directory.
+ * @iop: a pointer to a struct inode_operations that should be used for
+ *        this file/directory.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ *
+ * This function allocates and fills the eventfs_file structure.
+ */
+static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
+					const struct file_operations *fop,
+					const struct inode_operations *iop,
+					void *data)
+{
+	struct eventfs_file *ef;
+
+	ef = kzalloc(sizeof(*ef), GFP_KERNEL);
+	if (!ef)
+		return ERR_PTR(-ENOMEM);
+
+	ef->name = kstrdup(name, GFP_KERNEL);
+	if (!ef->name) {
+		kfree(ef);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (S_ISDIR(mode)) {
+		ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
+		if (!ef->ei) {
+			kfree(ef->name);
+			kfree(ef);
+			return ERR_PTR(-ENOMEM);
+		}
+		INIT_LIST_HEAD(&ef->ei->e_top_files);
+	} else {
+		ef->ei = NULL;
+	}
+
+	ef->iop = iop;
+	ef->fop = fop;
+	ef->mode = mode;
+	ef->data = data;
+	ef->dentry = NULL;
+	ef->d_parent = NULL;
+	ef->created = false;
+	return ef;
+}
+
+/**
+ * eventfs_create_events_dir - create the trace event structure
+ * @name: a pointer to a string containing the name of the directory to
+ *        create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          directory will be created in the root of the tracefs filesystem.
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * This function creates the top of the trace event directory.
+ */
+struct dentry *eventfs_create_events_dir(const char *name,
+					 struct dentry *parent,
+					 struct rw_semaphore *eventfs_rwsem)
+{
+	struct dentry *dentry = tracefs_start_creating(name, parent);
+	struct eventfs_inode *ei;
+	struct tracefs_inode *ti;
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return dentry;
+
+	/* Both allocation failures share one exit that also drops the dentry. */
+	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	inode = ei ? tracefs_get_inode(dentry->d_sb) : NULL;
+	if (unlikely(!inode)) {
+		/* ei is NULL if its own allocation failed; kfree(NULL) is fine */
+		kfree(ei);
+		tracefs_failed_creating(dentry);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	init_rwsem(eventfs_rwsem);
+	INIT_LIST_HEAD(&ei->e_top_files);
+
+	ti = get_tracefs(inode);
+	ti->flags |= TRACEFS_EVENT_INODE;
+	ti->private = ei;
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_op = &eventfs_root_dir_inode_operations;
+	inode->i_fop = &eventfs_file_operations;
+	inode->i_private = eventfs_rwsem;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	inc_nlink(dentry->d_parent->d_inode);
+	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+	return tracefs_end_creating(dentry);
+}
+
+/**
+ * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
+ * @name: a pointer to a string containing the name of the directory to create.
+ * @parent: a pointer to the parent dentry for this dir.
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * This function adds eventfs subsystem dir to list.
+ * And all these dirs are created on the fly when they are looked up,
+ * and the dentry and inodes will be removed when they are done.
+ */
+struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
+					       struct dentry *parent,
+					       struct rw_semaphore *eventfs_rwsem)
+{
+	struct tracefs_inode *ti_parent;
+	struct eventfs_inode *ei_parent;
+	struct eventfs_file *ef;
+
+	if (!parent)
+		return ERR_PTR(-EINVAL);
+
+	ti_parent = get_tracefs(parent->d_inode);
+	ei_parent = ti_parent->private;
+
+	ef = eventfs_prepare_ef(name,
+		S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+		&eventfs_file_operations,
+		&eventfs_root_dir_inode_operations,
+		(void *) eventfs_rwsem);
+
+	if (IS_ERR(ef))
+		return ef;
+
+	eventfs_down_write(eventfs_rwsem);
+	list_add_tail(&ef->list, &ei_parent->e_top_files);
+	ef->d_parent = parent;
+	eventfs_up_write(eventfs_rwsem);
+	return ef;
+}
+
+/**
+ * eventfs_add_dir - add eventfs dir to list to create later
+ * @name: a pointer to a string containing the name of the directory to create.
+ * @ef_parent: a pointer to the parent eventfs_file for this dir.
+ * @eventfs_rwsem: a pointer to rw_semaphore
+ *
+ * This function adds eventfs dir to list.
+ * And all these dirs are created on the fly when they are looked up,
+ * and the dentry and inodes will be removed when they are done.
+ */
+struct eventfs_file *eventfs_add_dir(const char *name,
+				     struct eventfs_file *ef_parent,
+				     struct rw_semaphore *eventfs_rwsem)
+{
+	struct eventfs_file *ef;
+
+	if (!ef_parent)
+		return ERR_PTR(-EINVAL);
+
+	ef = eventfs_prepare_ef(name,
+		S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+		&eventfs_file_operations,
+		&eventfs_root_dir_inode_operations,
+		(void *) eventfs_rwsem);
+
+	if (IS_ERR(ef))
+		return ef;
+
+	eventfs_down_write(eventfs_rwsem);
+	list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
+	ef->d_parent = ef_parent->dentry;
+	eventfs_up_write(eventfs_rwsem);
+	return ef;
+}
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 999124459..aeca6761f 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -21,6 +21,35 @@  struct file_operations;
 
 #ifdef CONFIG_TRACING
 
+struct eventfs_inode {
+	struct list_head		e_top_files;
+};
+
+struct eventfs_file {
+	const char                      *name;
+	struct dentry                   *d_parent;
+	struct dentry                   *dentry;
+	struct list_head                list;
+	struct eventfs_inode            *ei;
+	const struct file_operations    *fop;
+	const struct inode_operations   *iop;
+	void                            *data;
+	umode_t                         mode;
+	bool                            created;
+};
+
+struct dentry *eventfs_create_events_dir(const char *name,
+					 struct dentry *parent,
+					 struct rw_semaphore *eventfs_rwsem);
+
+struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
+					       struct dentry *parent,
+					       struct rw_semaphore *eventfs_rwsem);
+
+struct eventfs_file *eventfs_add_dir(const char *name,
+				     struct eventfs_file *ef_parent,
+				     struct rw_semaphore *eventfs_rwsem);
+
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79bdefe92..b895c3346 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -359,6 +359,7 @@  struct trace_array {
 	struct dentry		*options;
 	struct dentry		*percpu_dir;
 	struct dentry		*event_dir;
+	struct rw_semaphore     eventfs_rwsem;
 	struct trace_options	*topts;
 	struct list_head	systems;
 	struct list_head	events;