[RFC,bpf-next,fanotify,5/5] selftests/bpf: Add test for BPF based fanotify fastpath handler

Message ID: 20241029231244.2834368-6-song@kernel.org (mailing list archive)
State: RFC
Delegated to: BPF
Series: Fanotify fastpath handler

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next-0

Commit Message

Song Liu Oct. 29, 2024, 11:12 p.m. UTC
This test shows simplified logic that monitors a subtree. It is
simplified in that it doesn't handle all the scenarios, such as:

  1) moving a sub-subtree into/out of the monitored subtree;
  2) a mount point inside the monitored subtree.

Therefore, this is not meant to show how to reliably monitor a subtree.
Instead, it tests the functionality of the BPF based fastpath. Reliably
monitoring a subtree requires more complex logic.

Overview of the logic:
  1. fanotify is created for the whole file system (/tmp);
  2. A bpf map (inode_storage_map) is used to tag directories to
     monitor (starting from /tmp/fanotify_test);
  3. On fsnotify_mkdir, the tag is propagated to newly created
     subdirectories (/tmp/fanotify_test/subdir);
  4. The bpf fastpath checks whether the fanotify event happens in a
     directory with the tag. If yes, the event is sent to user space;
     otherwise, the event is dropped.
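
Concretely, the accesses performed by the test map to events as
follows:

  touch /tmp/fanotify_test/aa         -> event reported (dir is tagged)
  mkdir /tmp/fanotify_test/subdir     -> tag propagated to the new subdir
  touch /tmp/fanotify_test/subdir/bb  -> event reported (propagated tag)
  touch /tmp/cc                       -> event dropped (/tmp has no tag)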

Signed-off-by: Song Liu <song@kernel.org>
---
 tools/testing/selftests/bpf/bpf_kfuncs.h      |   4 +
 tools/testing/selftests/bpf/config            |   1 +
 .../testing/selftests/bpf/prog_tests/fan_fp.c | 245 ++++++++++++++++++
 tools/testing/selftests/bpf/progs/fan_fp.c    |  77 ++++++
 4 files changed, 327 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/fan_fp.c
 create mode 100644 tools/testing/selftests/bpf/progs/fan_fp.c

Comments

Amir Goldstein Nov. 7, 2024, 11:10 a.m. UTC | #1
On Wed, Oct 30, 2024 at 12:13 AM Song Liu <song@kernel.org> wrote:
>
> This test shows simplified logic that monitors a subtree. It is
> simplified in that it doesn't handle all the scenarios, such as:
>
>   1) moving a sub-subtree into/out of the monitored subtree;

There is a solution for that (see below)

>   2) a mount point inside the monitored subtree.

For that we will need to add the MOUNT/UNMOUNT/MOVE_MOUNT events,
but those have been requested by userspace anyway.

>
> Therefore, this is not meant to show how to reliably monitor a subtree.
> Instead, it tests the functionality of the BPF based fastpath. Reliably
> monitoring a subtree requires more complex logic.

Actually, this example is the foundation of my vision for efficient and
race-free subtree filtering:

1. The inode map is to be treated as a cache for the is_subdir() query
2. Cache entries can also have a "distance from root" (i.e. depth) value
3. Each unknown queried path can call is_subdir() and populate the cache
    entries for all ancestors
4. The cache/map size should be limited and when limit is reached,
    evicting entries by depth priority makes sense
5. A rename event for a directory whose inode is in the map and whose
   new parent is not in the map or has a different value than old parent
   needs to invalidate the entire map
6. fast_path also needs a hook from inode evict to clear cache entries

This is similar to, but more efficient and race-free than, what could
already be achieved in userspace using FAN_MARK_EVICTABLE.
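
To make points 1-4 concrete, below is a minimal sketch of the handler
side, in the style of the selftest program further down. bpf_is_subdir()
and the two dentry lookups are assumptions for illustration only; this
series provides no such kfunc:

/* Hypothetical kfunc wrapping is_subdir(); NOT part of this series. */
extern bool bpf_is_subdir(struct dentry *dentry,
			  struct dentry *root) __ksym __weak;

struct {
	__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u32);	/* point 2: depth below the monitored root */
} subdir_cache SEC(".maps");

SEC("struct_ops")
int BPF_PROG(fp_handler_cached,
	     struct fsnotify_group *group,
	     struct fanotify_fastpath_hook *fp_hook,
	     struct fanotify_fastpath_event *fp_event)
{
	struct dentry *dir_dentry = NULL;	/* lookup elided: assumption */
	struct dentry *root_dentry = NULL;	/* monitored root: assumption */
	__u32 *depth;

	/* point 1: the inode map is a cache for the is_subdir() query */
	depth = bpf_inode_storage_get(&subdir_cache, fp_event->dir, 0, 0);
	if (depth)
		return FAN_FP_RET_SEND_TO_USERSPACE;

	/* point 3: on a miss, ask is_subdir() and populate the cache so
	 * the next event in this directory hits the fast path
	 */
	if (bpf_is_subdir(dir_dentry, root_dentry)) {
		depth = bpf_inode_storage_get(&subdir_cache, fp_event->dir, 0,
					      BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (depth)
			*depth = 1;	/* a real handler computes the depth */
		return FAN_FP_RET_SEND_TO_USERSPACE;
	}

	return FAN_FP_RET_SKIP_EVENT;
}

Points 4 and 5 (bounding the cache size, invalidating on directory
rename) would build on the stored depth values; point 6 comes up later
in this thread.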

Food for thought.

Thanks,
Amir.

Song Liu Nov. 7, 2024, 7:53 p.m. UTC | #2
> On Nov 7, 2024, at 3:10 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> Actually, this example is the foundation of my vision for efficient and
> race-free subtree filtering:
> 
> 1. The inode map is to be treated as a cache for the is_subdir() query

Using is_subdir() as the source of truth and managing the cache in the
inode map seems promising to me.

> 2. Cache entries can also have a "distance from root" (i.e. depth) value
> 3. Each unknown queried path can call is_subdir() and populate the cache
>    entries for all ancestors
> 4. The cache/map size should be limited and when limit is reached,
>    evicting entries by depth priority makes sense
> 5. A rename event for a directory whose inode is in the map and whose
>   new parent is not in the map or has a different value than old parent
>   needs to invalidate the entire map
> 6. fast_path also needs a hook from inode evict to clear cache entries

The inode map is physically attached to the inode itself, so the evict
event is handled automatically. IOW, an inode's entry in the inode map
is automatically removed when the inode is freed. For the same reason,
we don't need to set a limit on the map size or add eviction logic. Of
course, this works based on the assumption that we don't use too much
memory for each inode. I think this assumption is true.

> 
> This is similar to, but more efficient and race-free than, what could
> already be achieved in userspace using FAN_MARK_EVICTABLE.
> 
> Food for thought.

Thanks,
Song
Amir Goldstein Nov. 7, 2024, 8:33 p.m. UTC | #3
On Thu, Nov 7, 2024 at 8:53 PM Song Liu <songliubraving@meta.com> wrote:
> The inode map is physically attached to the inode itself, so the evict
> event is handled automatically. IOW, an inode's entry in the inode map
> is automatically removed when the inode is freed. For the same reason,
> we don't need to set a limit on the map size or add eviction logic. Of
> course, this works based on the assumption that we don't use too much
> memory for each inode. I think this assumption is true.

Oh no, that is definitely wrong: each inode is around 1K, and this was
the main incentive to implement FAN_MARK_EVICTABLE and
FAN_MARK_FILESYSTEM in the first place, because recursive pinning of
all inodes is not scalable to large trees.

Thanks,
Amir.
Song Liu Nov. 7, 2024, 8:48 p.m. UTC | #4
On Thu, Nov 7, 2024 at 12:34 PM Amir Goldstein <amir73il@gmail.com> wrote:
> Oh no, that is definitely wrong: each inode is around 1K, and this was
> the main incentive to implement FAN_MARK_EVICTABLE and
> FAN_MARK_FILESYSTEM in the first place, because recursive pinning of
> all inodes is not scalable to large trees.

The inode map does not pin the inode in memory. The inode will
still be evicted as normal, and its entry in the map is freed
together with the inode. The overhead I mentioned was the extra few
bytes (for the flag, etc.) per inode. When an evicted inode is
loaded again, we will use is_subdir() to recreate the entry in the
inode map.
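
As a concrete trace of this lifecycle (with is_subdir() being the
hypothetical kfunc discussed earlier in the thread):

  event in dir        -> map miss -> is_subdir() -> cache entry created
  inode evicted       -> entry freed along with the inode (no pinning)
  inode loaded again  -> map miss -> is_subdir() -> entry recreated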

Does this make sense and address the concern?

Thanks,
Song
Amir Goldstein Nov. 8, 2024, 8:18 a.m. UTC | #5
On Thu, Nov 7, 2024 at 9:48 PM Song Liu <song@kernel.org> wrote:
> The inode map does not pin the inode in memory. The inode will
> still be evicted as normal, and its entry in the map is freed
> together with the inode. The overhead I mentioned was the extra few
> bytes (for the flag, etc.) per inode. When an evicted inode is
> loaded again, we will use is_subdir() to recreate the entry in the
> inode map.
>
> Does this make sense and address the concern?
>

Yes, that sounds good.

Thanks,
Amir.

Patch

diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 2eb3483f2fb0..44dcf4991244 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -87,4 +87,8 @@  struct dentry;
  */
 extern int bpf_get_dentry_xattr(struct dentry *dentry, const char *name,
 			      struct bpf_dynptr *value_ptr) __ksym __weak;
+
+struct fanotify_fastpath_event;
+extern struct inode *bpf_fanotify_data_inode(struct fanotify_fastpath_event *event) __ksym __weak;
+extern void bpf_iput(struct inode *inode) __ksym __weak;
 #endif
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 4ca84c8d9116..392cbcad8a8b 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -24,6 +24,7 @@  CONFIG_DEBUG_INFO_BTF=y
 CONFIG_DEBUG_INFO_DWARF4=y
 CONFIG_DUMMY=y
 CONFIG_DYNAMIC_FTRACE=y
+CONFIG_FANOTIFY=y
 CONFIG_FPROBE=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_FUNCTION_ERROR_INJECTION=y
diff --git a/tools/testing/selftests/bpf/prog_tests/fan_fp.c b/tools/testing/selftests/bpf/prog_tests/fan_fp.c
new file mode 100644
index 000000000000..ea57ed366647
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fan_fp.c
@@ -0,0 +1,245 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/fanotify.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+
+#include <test_progs.h>
+
+#include "fan_fp.skel.h"
+
+#define TEST_FS "/tmp/"
+#define TEST_DIR "/tmp/fanotify_test/"
+
+static int create_test_subtree(void)
+{
+	int err;
+
+	err = mkdir(TEST_DIR, 0777);
+	if (err && errno != EEXIST)
+		return err;
+
+	return open(TEST_DIR, O_RDONLY);
+}
+
+static int create_fanotify_fd(void)
+{
+	int fanotify_fd, err;
+
+	fanotify_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_NAME | FAN_REPORT_DIR_FID,
+				    O_RDONLY);
+
+	if (!ASSERT_OK_FD(fanotify_fd, "fanotify_init"))
+		return -1;
+
+	err = fanotify_mark(fanotify_fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
+			    FAN_CREATE | FAN_OPEN | FAN_ONDIR | FAN_EVENT_ON_CHILD,
+			    AT_FDCWD, TEST_FS);
+	if (!ASSERT_OK(err, "fanotify_mark")) {
+		close(fanotify_fd);
+		return -1;
+	}
+
+	return fanotify_fd;
+}
+
+static int attach_global_fastpath(int fanotify_fd)
+{
+	struct fanotify_fastpath_args args = {
+		.name = "_tmp_test_sub_tree",
+		.version = 1,
+		.flags = 0,
+	};
+
+	if (ioctl(fanotify_fd, FAN_IOC_ADD_FP, &args))
+		return -1;
+
+	return 0;
+}
+
+#define EVENT_BUFFER_SIZE 4096
+struct file_access_result {
+	char name_prefix[16];
+	bool accessed;
+} access_results[3] = {
+	{"aa", false},
+	{"bb", false},
+	{"cc", false},
+};
+
+static void update_access_results(char *name)
+{
+	int i;
+
+	for (i = 0; i < 3; i++) {
+		if (strstr(name, access_results[i].name_prefix))
+			access_results[i].accessed = true;
+	}
+}
+
+static void parse_event(char *buffer, int len)
+{
+	struct fanotify_event_metadata *event =
+		(struct fanotify_event_metadata *) buffer;
+	struct fanotify_event_info_header *info;
+	struct fanotify_event_info_fid *fid;
+	struct file_handle *handle;
+	char *name;
+	int off;
+
+	for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) {
+		for (off = sizeof(*event) ; off < event->event_len;
+		     off += info->len) {
+			info = (struct fanotify_event_info_header *)
+				((char *) event + off);
+			switch (info->info_type) {
+			case FAN_EVENT_INFO_TYPE_DFID_NAME:
+				fid = (struct fanotify_event_info_fid *) info;
+				handle = (struct file_handle *)&fid->handle;
+				name = (char *)handle + sizeof(*handle) + handle->handle_bytes;
+				update_access_results(name);
+				break;
+			default:
+				break;
+			}
+		}
+	}
+}
+
+static void touch_file(const char *path)
+{
+	int fd;
+
+	fd = open(path, O_WRONLY|O_CREAT|O_NOCTTY|O_NONBLOCK, 0666);
+	if (!ASSERT_OK_FD(fd, "open"))
+		goto cleanup;
+	close(fd);
+cleanup:
+	unlink(path);
+}
+
+static void generate_and_test_event(int fanotify_fd)
+{
+	char buffer[EVENT_BUFFER_SIZE];
+	int len, err;
+
+	/* access /tmp/fanotify_test/aa, this will generate an event */
+	touch_file(TEST_DIR "aa");
+
+	/* create /tmp/fanotify_test/subdir, this will get the tag from
+	 * the parent directory (added in the bpf program on fsnotify_mkdir)
+	 */
+	err = mkdir(TEST_DIR "subdir", 0777);
+	ASSERT_OK(err, "mkdir");
+
+	/* access /tmp/fanotify_test/subdir/bb, this will generate an event */
+	touch_file(TEST_DIR "subdir/bb");
+
+	/* access /tmp/cc, this will NOT generate an event, as the BPF
+	 * fastpath filters it out (because /tmp doesn't have the tag)
+	 */
+	 */
+	touch_file(TEST_FS "cc");
+
+	/* read and parse the events */
+	len = read(fanotify_fd, buffer, EVENT_BUFFER_SIZE);
+	if (!ASSERT_GE(len, 0, "read event"))
+		goto cleanup;
+	parse_event(buffer, len);
+
+	/* verify we generated events for aa and bb, but filtered out the
+	 * event for cc.
+	 */
+	ASSERT_TRUE(access_results[0].accessed, "access aa");
+	ASSERT_TRUE(access_results[1].accessed, "access bb");
+	ASSERT_FALSE(access_results[2].accessed, "access cc");
+
+cleanup:
+	rmdir(TEST_DIR "subdir");
+	rmdir(TEST_DIR);
+}
+
+/* This test shows simplified logic that monitors a subtree. It is
+ * simplified in that it doesn't handle all the scenarios, such as:
+ *
+ *  1) moving a sub-subtree into/out of the monitored subtree;
+ *  2) a mount point inside the monitored subtree.
+ *
+ * Therefore, this is not meant to show how to reliably monitor a
+ * subtree. Instead, it tests the functionality of the BPF based fastpath.
+ *
+ * Overview of the logic:
+ * 1. fanotify is created for the whole file system (/tmp);
+ * 2. A bpf map (inode_storage_map) is used to tag directories to
+ *    monitor (starting from /tmp/fanotify_test);
+ * 3. On fsnotify_mkdir, the tag is propagated to newly created
+ *    subdirectories (/tmp/fanotify_test/subdir);
+ * 4. The bpf fastpath checks whether the event happens in a directory
+ *    with the tag. If yes, the event is sent to user space; otherwise,
+ *    the event is dropped.
+ */
+static void test_monitor_subtree(void)
+{
+	struct bpf_link *link;
+	struct fan_fp *skel;
+	int test_root_fd;
+	__u32 one = 1;
+	int err, fanotify_fd;
+
+	test_root_fd = create_test_subtree();
+
+	if (!ASSERT_OK_FD(test_root_fd, "create_test_subtree"))
+		return;
+
+	skel = fan_fp__open_and_load();
+
+	if (!ASSERT_OK_PTR(skel, "fan_fp__open_and_load"))
+		goto close_test_root_fd;
+
+	/* Add tag to /tmp/fanotify_test/ */
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.inode_storage_map),
+				  &test_root_fd, &one, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto destroy_skel;
+	link = bpf_map__attach_struct_ops(skel->maps.bpf_fanotify_fastpath_ops);
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops"))
+		goto destroy_skel;
+
+
+	fanotify_fd = create_fanotify_fd();
+	if (!ASSERT_OK_FD(fanotify_fd, "create_fanotify_fd"))
+		goto destroy_link;
+
+	err = attach_global_fastpath(fanotify_fd);
+	if (!ASSERT_OK(err, "attach_global_fastpath"))
+		goto close_fanotify_fd;
+
+	generate_and_test_event(fanotify_fd);
+
+	ASSERT_EQ(skel->bss->added_inode_storage, 1, "added_inode_storage");
+
+close_fanotify_fd:
+	close(fanotify_fd);
+
+destroy_link:
+	bpf_link__destroy(link);
+destroy_skel:
+	fan_fp__destroy(skel);
+
+close_test_root_fd:
+	close(test_root_fd);
+	rmdir(TEST_DIR);
+}
+
+void test_bpf_fanotify_fastpath(void)
+{
+	if (test__start_subtest("subtree"))
+		test_monitor_subtree();
+}
diff --git a/tools/testing/selftests/bpf/progs/fan_fp.c b/tools/testing/selftests/bpf/progs/fan_fp.c
new file mode 100644
index 000000000000..ee86dc189e38
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fan_fp.c
@@ -0,0 +1,77 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define FS_CREATE		0x00000100	/* Subfile was created */
+#define FS_ISDIR		0x40000000	/* event occurred against dir */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, __u32);
+} inode_storage_map SEC(".maps");
+
+int added_inode_storage;
+
+SEC("struct_ops")
+int BPF_PROG(bpf_fp_handler,
+	     struct fsnotify_group *group,
+	     struct fanotify_fastpath_hook *fp_hook,
+	     struct fanotify_fastpath_event *fp_event)
+{
+	struct inode *dir;
+	__u32 *value;
+
+	dir = fp_event->dir;
+
+	value = bpf_inode_storage_get(&inode_storage_map, dir, 0, 0);
+
+	/* if dir doesn't have the tag, skip the event */
+	if (!value)
+		return FAN_FP_RET_SKIP_EVENT;
+
+	/* propagate tag to subdir on fsnotify_mkdir */
+	if (fp_event->mask == (FS_CREATE | FS_ISDIR) &&
+	    fp_event->data_type == FSNOTIFY_EVENT_DENTRY) {
+		struct inode *new_inode;
+
+		new_inode = bpf_fanotify_data_inode(fp_event);
+		if (!new_inode)
+			goto out;
+
+		value = bpf_inode_storage_get(&inode_storage_map, new_inode, 0,
+					      BPF_LOCAL_STORAGE_GET_F_CREATE);
+		if (value) {
+			*value = 1;
+			added_inode_storage++;
+		}
+		bpf_iput(new_inode);
+	}
+out:
+	return FAN_FP_RET_SEND_TO_USERSPACE;
+}
+
+SEC("struct_ops")
+int BPF_PROG(bpf_fp_init, struct fanotify_fastpath_hook *hook, const char *args)
+{
+	return 0;
+}
+
+SEC("struct_ops")
+void BPF_PROG(bpf_fp_free, struct fanotify_fastpath_hook *hook)
+{
+}
+
+SEC(".struct_ops.link")
+struct fanotify_fastpath_ops bpf_fanotify_fastpath_ops = {
+	.fp_handler = (void *)bpf_fp_handler,
+	.fp_init = (void *)bpf_fp_init,
+	.fp_free = (void *)bpf_fp_free,
+	.name = "_tmp_test_sub_tree",
+};
+
+char _license[] SEC("license") = "GPL";