Message ID | 20220329074904.2980320-15-amir73il@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Evictable fanotify marks | expand |
On Tue 29-03-22 10:49:02, Amir Goldstein wrote: > The ioctl can be used to request allocation of marks with large size > and attach them to an object, even if another mark already exists for > the group on the marked object. > > These large marks serve no function other than testing direct reclaim > in the context of mark allocation. > > Setting the value to 0 restores normal mark allocation. > > FAN_MARK_REMOVE refers to the first mark of the group on an object, so > the number of FAN_MARK_REMOVE calls need to match the number of large > marks on the object in order to remove all marks from the object or use > FAN_MARK_FLUSH to remove all marks of that object type. > > Signed-off-by: Amir Goldstein <amir73il@gmail.com> I understand this is useful as a debugging patch but I'm not sure we want this permanently in the kernel. I'm wondering if generally more useful approach would not be to improve error injection for page allocations to allow easier stressing of direct reclaim... Honza > --- > fs/notify/fanotify/fanotify.c | 5 +++- > fs/notify/fanotify/fanotify_user.c | 42 +++++++++++++++++++++++++++--- > include/linux/fsnotify_backend.h | 2 ++ > include/uapi/linux/fanotify.h | 4 +++ > 4 files changed, 49 insertions(+), 4 deletions(-) > > diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c > index 985e995d2a39..02990a6b1b65 100644 > --- a/fs/notify/fanotify/fanotify.c > +++ b/fs/notify/fanotify/fanotify.c > @@ -1075,7 +1075,10 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark, > > static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) > { > - kmem_cache_free(fanotify_mark_cache, fsn_mark); > + if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_KMALLOC) > + kfree(fsn_mark); > + else > + kmem_cache_free(fanotify_mark_cache, fsn_mark); > } > > const struct fsnotify_ops fanotify_fsnotify_ops = { > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c > index 2c65038da4ce..a3539bd8e443 100644 > --- a/fs/notify/fanotify/fanotify_user.c > +++ b/fs/notify/fanotify/fanotify_user.c > @@ -928,6 +928,16 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar > spin_unlock(&group->notification_lock); > ret = put_user(send_len, (int __user *) p); > break; > + case FAN_IOC_SET_MARK_PAGE_ORDER: > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + mutex_lock(&group->mark_mutex); > + group->fanotify_data.mark_page_order = (unsigned int)arg; > + pr_info("fanotify: set mark size page order to %u", > + group->fanotify_data.mark_page_order); > + ret = 0; > + mutex_unlock(&group->mark_mutex); > + break; > } > > return ret; > @@ -1150,6 +1160,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, > __kernel_fsid_t *fsid) > { > struct ucounts *ucounts = group->fanotify_data.ucounts; > + unsigned int order = group->fanotify_data.mark_page_order; > struct fsnotify_mark *mark; > int ret; > > @@ -1162,7 +1173,21 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, > !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) > return ERR_PTR(-ENOSPC); > > - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); > + /* > + * If requested to test direct reclaim in mark allocation context, > + * start by trying to allocate requested page order per mark and > + * fall back to allocation size that is likely to trigger direct > + * reclaim but not too large to trigger compaction. > + */ > + if (order) { > + mark = kmalloc(PAGE_SIZE << order, > + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); > + if (!mark && order > PAGE_ALLOC_COSTLY_ORDER) > + mark = kmalloc(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, > + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); > + } else { > + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); > + } > if (!mark) { > ret = -ENOMEM; > goto out_dec_ucounts; > @@ -1171,6 +1196,15 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, > fsnotify_init_mark(mark, group); > if (fan_flags & FAN_MARK_EVICTABLE) > mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; > + /* > + * Allow adding multiple large marks per object for testing. > + * FAN_MARK_REMOVE refers to the first mark of the group, so one > + * FAN_MARK_REMOVE is needed for every added large mark (or use > + * FAN_MARK_FLUSH to remove all marks). > + */ > + if (order) > + mark->flags |= FSNOTIFY_MARK_FLAG_KMALLOC | > + FSNOTIFY_MARK_FLAG_ALLOW_DUPS; > > ret = fsnotify_add_mark_locked(mark, connp, obj_type, fsid); > if (ret) { > @@ -1201,11 +1235,13 @@ static int fanotify_add_mark(struct fsnotify_group *group, > __u32 mask, unsigned int flags, > __kernel_fsid_t *fsid) > { > - struct fsnotify_mark *fsn_mark; > + struct fsnotify_mark *fsn_mark = NULL; > int ret = 0; > > mutex_lock(&group->mark_mutex); > - fsn_mark = fsnotify_find_mark(connp, group); > + /* Allow adding multiple large marks per object for testing */ > + if (!group->fanotify_data.mark_page_order) > + fsn_mark = fsnotify_find_mark(connp, group); > if (!fsn_mark) { > fsn_mark = fanotify_add_new_mark(group, connp, obj_type, flags, > fsid); > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index df58439a86fa..8220cf560c28 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -250,6 +250,7 @@ struct fsnotify_group { > int f_flags; /* event_f_flags from fanotify_init() */ > struct ucounts *ucounts; > mempool_t error_events_pool; > + unsigned int mark_page_order; /* for testing only */ > } fanotify_data; > #endif /* CONFIG_FANOTIFY */ > }; > @@ -528,6 +529,7 @@ struct fsnotify_mark { > #define FSNOTIFY_MARK_FLAG_ALLOW_DUPS 0x0040 > #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 > #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 > +#define FSNOTIFY_MARK_FLAG_KMALLOC 0x0400 > unsigned int flags; /* flags [mark->lock] */ > }; > > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h > index f1f89132d60e..49cdc9008bf2 100644 > --- a/include/uapi/linux/fanotify.h > +++ b/include/uapi/linux/fanotify.h > @@ -3,6 +3,7 @@ > #define _UAPI_LINUX_FANOTIFY_H > > #include <linux/types.h> > +#include <linux/ioctl.h> > > /* the following events that user-space can register for */ > #define FAN_ACCESS 0x00000001 /* File was accessed */ > @@ -206,4 +207,7 @@ struct fanotify_response { > (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \ > (long)(meta)->event_len <= (long)(len)) > > +/* Only for testing. Not useful otherwise */ > +#define FAN_IOC_SET_MARK_PAGE_ORDER _IOW(0xfa, 1, long) > + > #endif /* _UAPI_LINUX_FANOTIFY_H */ > -- > 2.25.1 >
On Mon, Apr 11, 2022 at 3:53 PM Jan Kara <jack@suse.cz> wrote: > > On Tue 29-03-22 10:49:02, Amir Goldstein wrote: > > The ioctl can be used to request allocation of marks with large size > > and attach them to an object, even if another mark already exists for > > the group on the marked object. > > > > These large marks serve no function other than testing direct reclaim > > in the context of mark allocation. > > > > Setting the value to 0 restores normal mark allocation. > > > > FAN_MARK_REMOVE refers to the first mark of the group on an object, so > > the number of FAN_MARK_REMOVE calls need to match the number of large > > marks on the object in order to remove all marks from the object or use > > FAN_MARK_FLUSH to remove all marks of that object type. > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com> > > I understand this is useful as a debugging patch but I'm not sure we want > this permanently in the kernel. I'm wondering if generally more useful > approach would not be to improve error injection for page allocations to > allow easier stressing of direct reclaim... I think you are probably right, but I had to stay within time budget to create a reproducer and it took me a lot of time even with this hack so I don't see myself investing more time on a reproducer with improved error injections. So for me, it is an adequate solution to carry this patch out-of-tree along with a matching out-of tree patch for LTP test: https://github.com/amir73il/ltp/commit/383db59959c44bb27dbec81e74d1d9caba45b0f2 For the community, we could rely on lockdep reports users now that we sorted out the lockdep annotations. BTW, before resorting to this ioctl I also started going down the path of running the test inside a restricted memcg, only to figure out that it is the inodes that need to be evicted not the marks and inodes are not accounted to memcg. I think there may have been some work in the direction of somehow accounting inode cache to memcg, but not sure where this stands. I am open for other suggestions. Thanks, Amir.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 985e995d2a39..02990a6b1b65 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1075,7 +1075,10 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark, static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { - kmem_cache_free(fanotify_mark_cache, fsn_mark); + if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_KMALLOC) + kfree(fsn_mark); + else + kmem_cache_free(fanotify_mark_cache, fsn_mark); } const struct fsnotify_ops fanotify_fsnotify_ops = { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2c65038da4ce..a3539bd8e443 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -928,6 +928,16 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; + case FAN_IOC_SET_MARK_PAGE_ORDER: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&group->mark_mutex); + group->fanotify_data.mark_page_order = (unsigned int)arg; + pr_info("fanotify: set mark size page order to %u", + group->fanotify_data.mark_page_order); + ret = 0; + mutex_unlock(&group->mark_mutex); + break; } return ret; @@ -1150,6 +1160,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; + unsigned int order = group->fanotify_data.mark_page_order; struct fsnotify_mark *mark; int ret; @@ -1162,7 +1173,21 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + /* + * If requested to test direct reclaim in mark allocation context, + * start by trying to allocate requested page order per mark and + * fall back to allocation size that is likely to trigger direct + * reclaim but not too large to trigger compaction. + */ + if (order) { + mark = kmalloc(PAGE_SIZE << order, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!mark && order > PAGE_ALLOC_COSTLY_ORDER) + mark = kmalloc(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + } else { + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + } if (!mark) { ret = -ENOMEM; goto out_dec_ucounts; @@ -1171,6 +1196,15 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; + /* + * Allow adding multiple large marks per object for testing. + * FAN_MARK_REMOVE refers to the first mark of the group, so one + * FAN_MARK_REMOVE is needed for every added large mark (or use + * FAN_MARK_FLUSH to remove all marks). + */ + if (order) + mark->flags |= FSNOTIFY_MARK_FLAG_KMALLOC | + FSNOTIFY_MARK_FLAG_ALLOW_DUPS; ret = fsnotify_add_mark_locked(mark, connp, obj_type, fsid); if (ret) { @@ -1201,11 +1235,13 @@ static int fanotify_add_mark(struct fsnotify_group *group, __u32 mask, unsigned int flags, __kernel_fsid_t *fsid) { - struct fsnotify_mark *fsn_mark; + struct fsnotify_mark *fsn_mark = NULL; int ret = 0; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_mark(connp, group); + /* Allow adding multiple large marks per object for testing */ + if (!group->fanotify_data.mark_page_order) + fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, connp, obj_type, flags, fsid); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index df58439a86fa..8220cf560c28 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -250,6 +250,7 @@ struct fsnotify_group { int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; + unsigned int mark_page_order; /* for testing only */ } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; @@ -528,6 +529,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_ALLOW_DUPS 0x0040 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 +#define FSNOTIFY_MARK_FLAG_KMALLOC 0x0400 unsigned int flags; /* flags [mark->lock] */ }; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index f1f89132d60e..49cdc9008bf2 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -3,6 +3,7 @@ #define _UAPI_LINUX_FANOTIFY_H #include <linux/types.h> +#include <linux/ioctl.h> /* the following events that user-space can register for */ #define FAN_ACCESS 0x00000001 /* File was accessed */ @@ -206,4 +207,7 @@ struct fanotify_response { (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \ (long)(meta)->event_len <= (long)(len)) +/* Only for testing. Not useful otherwise */ +#define FAN_IOC_SET_MARK_PAGE_ORDER _IOW(0xfa, 1, long) + #endif /* _UAPI_LINUX_FANOTIFY_H */
The ioctl can be used to request allocation of marks with large size and attach them to an object, even if another mark already exists for the group on the marked object. These large marks serve no function other than testing direct reclaim in the context of mark allocation. Setting the value to 0 restores normal mark allocation. FAN_MARK_REMOVE refers to the first mark of the group on an object, so the number of FAN_MARK_REMOVE calls need to match the number of large marks on the object in order to remove all marks from the object or use FAN_MARK_FLUSH to remove all marks of that object type. Signed-off-by: Amir Goldstein <amir73il@gmail.com> --- fs/notify/fanotify/fanotify.c | 5 +++- fs/notify/fanotify/fanotify_user.c | 42 +++++++++++++++++++++++++++--- include/linux/fsnotify_backend.h | 2 ++ include/uapi/linux/fanotify.h | 4 +++ 4 files changed, 49 insertions(+), 4 deletions(-)