diff mbox series

[v2,14/16] fanotify: add FAN_IOC_SET_MARK_PAGE_ORDER ioctl for testing

Message ID 20220329074904.2980320-15-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series Evictable fanotify marks | expand

Commit Message

Amir Goldstein March 29, 2022, 7:49 a.m. UTC
The ioctl can be used to request allocation of marks with large size
and attach them to an object, even if another mark already exists for
the group on the marked object.

These large marks serve no function other than testing direct reclaim
in the context of mark allocation.

Setting the value to 0 restores normal mark allocation.

FAN_MARK_REMOVE refers to the first mark of the group on an object, so
the number of FAN_MARK_REMOVE calls need to match the number of large
marks on the object in order to remove all marks from the object or use
FAN_MARK_FLUSH to remove all marks of that object type.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify.c      |  5 +++-
 fs/notify/fanotify/fanotify_user.c | 42 +++++++++++++++++++++++++++---
 include/linux/fsnotify_backend.h   |  2 ++
 include/uapi/linux/fanotify.h      |  4 +++
 4 files changed, 49 insertions(+), 4 deletions(-)

Comments

Jan Kara April 11, 2022, 12:53 p.m. UTC | #1
On Tue 29-03-22 10:49:02, Amir Goldstein wrote:
> The ioctl can be used to request allocation of marks with large size
> and attach them to an object, even if another mark already exists for
> the group on the marked object.
> 
> These large marks serve no function other than testing direct reclaim
> in the context of mark allocation.
> 
> Setting the value to 0 restores normal mark allocation.
> 
> FAN_MARK_REMOVE refers to the first mark of the group on an object, so
> the number of FAN_MARK_REMOVE calls need to match the number of large
> marks on the object in order to remove all marks from the object or use
> FAN_MARK_FLUSH to remove all marks of that object type.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

I understand this is useful as a debugging patch but I'm not sure we want
this permanently in the kernel. I'm wondering if generally more useful
approach would not be to improve error injection for page allocations to
allow easier stressing of direct reclaim...

								Honza

> ---
>  fs/notify/fanotify/fanotify.c      |  5 +++-
>  fs/notify/fanotify/fanotify_user.c | 42 +++++++++++++++++++++++++++---
>  include/linux/fsnotify_backend.h   |  2 ++
>  include/uapi/linux/fanotify.h      |  4 +++
>  4 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index 985e995d2a39..02990a6b1b65 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -1075,7 +1075,10 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark,
>  
>  static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
>  {
> -	kmem_cache_free(fanotify_mark_cache, fsn_mark);
> +	if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_KMALLOC)
> +		kfree(fsn_mark);
> +	else
> +		kmem_cache_free(fanotify_mark_cache, fsn_mark);
>  }
>  
>  const struct fsnotify_ops fanotify_fsnotify_ops = {
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 2c65038da4ce..a3539bd8e443 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -928,6 +928,16 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
>  		spin_unlock(&group->notification_lock);
>  		ret = put_user(send_len, (int __user *) p);
>  		break;
> +	case FAN_IOC_SET_MARK_PAGE_ORDER:
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +		mutex_lock(&group->mark_mutex);
> +		group->fanotify_data.mark_page_order = (unsigned int)arg;
> +		pr_info("fanotify: set mark size page order to %u",
> +			group->fanotify_data.mark_page_order);
> +		ret = 0;
> +		mutex_unlock(&group->mark_mutex);
> +		break;
>  	}
>  
>  	return ret;
> @@ -1150,6 +1160,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
>  						   __kernel_fsid_t *fsid)
>  {
>  	struct ucounts *ucounts = group->fanotify_data.ucounts;
> +	unsigned int order = group->fanotify_data.mark_page_order;
>  	struct fsnotify_mark *mark;
>  	int ret;
>  
> @@ -1162,7 +1173,21 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
>  	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
>  		return ERR_PTR(-ENOSPC);
>  
> -	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
> +	/*
> +	 * If requested to test direct reclaim in mark allocation context,
> +	 * start by trying to allocate requested page order per mark and
> +	 * fall back to allocation size that is likely to trigger direct
> +	 * reclaim but not too large to trigger compaction.
> +	 */
> +	if (order) {
> +		mark = kmalloc(PAGE_SIZE << order,
> +			       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
> +		if (!mark && order > PAGE_ALLOC_COSTLY_ORDER)
> +			mark = kmalloc(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER,
> +				       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
> +	} else {
> +		mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
> +	}
>  	if (!mark) {
>  		ret = -ENOMEM;
>  		goto out_dec_ucounts;
> @@ -1171,6 +1196,15 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
>  	fsnotify_init_mark(mark, group);
>  	if (fan_flags & FAN_MARK_EVICTABLE)
>  		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
> +	/*
> +	 * Allow adding multiple large marks per object for testing.
> +	 * FAN_MARK_REMOVE refers to the first mark of the group, so one
> +	 * FAN_MARK_REMOVE is needed for every added large mark (or use
> +	 * FAN_MARK_FLUSH to remove all marks).
> +	 */
> +	if (order)
> +		mark->flags |= FSNOTIFY_MARK_FLAG_KMALLOC |
> +			       FSNOTIFY_MARK_FLAG_ALLOW_DUPS;
>  
>  	ret = fsnotify_add_mark_locked(mark, connp, obj_type, fsid);
>  	if (ret) {
> @@ -1201,11 +1235,13 @@ static int fanotify_add_mark(struct fsnotify_group *group,
>  			     __u32 mask, unsigned int flags,
>  			     __kernel_fsid_t *fsid)
>  {
> -	struct fsnotify_mark *fsn_mark;
> +	struct fsnotify_mark *fsn_mark = NULL;
>  	int ret = 0;
>  
>  	mutex_lock(&group->mark_mutex);
> -	fsn_mark = fsnotify_find_mark(connp, group);
> +	/* Allow adding multiple large marks per object for testing */
> +	if (!group->fanotify_data.mark_page_order)
> +		fsn_mark = fsnotify_find_mark(connp, group);
>  	if (!fsn_mark) {
>  		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, flags,
>  						 fsid);
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index df58439a86fa..8220cf560c28 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -250,6 +250,7 @@ struct fsnotify_group {
>  			int f_flags; /* event_f_flags from fanotify_init() */
>  			struct ucounts *ucounts;
>  			mempool_t error_events_pool;
> +			unsigned int mark_page_order; /* for testing only */
>  		} fanotify_data;
>  #endif /* CONFIG_FANOTIFY */
>  	};
> @@ -528,6 +529,7 @@ struct fsnotify_mark {
>  #define FSNOTIFY_MARK_FLAG_ALLOW_DUPS		0x0040
>  #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x0100
>  #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
> +#define FSNOTIFY_MARK_FLAG_KMALLOC		0x0400
>  	unsigned int flags;		/* flags [mark->lock] */
>  };
>  
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index f1f89132d60e..49cdc9008bf2 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -3,6 +3,7 @@
>  #define _UAPI_LINUX_FANOTIFY_H
>  
>  #include <linux/types.h>
> +#include <linux/ioctl.h>
>  
>  /* the following events that user-space can register for */
>  #define FAN_ACCESS		0x00000001	/* File was accessed */
> @@ -206,4 +207,7 @@ struct fanotify_response {
>  				(long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
>  				(long)(meta)->event_len <= (long)(len))
>  
> +/* Only for testing. Not useful otherwise */
> +#define	FAN_IOC_SET_MARK_PAGE_ORDER	_IOW(0xfa, 1, long)
> +
>  #endif /* _UAPI_LINUX_FANOTIFY_H */
> -- 
> 2.25.1
>
Amir Goldstein April 11, 2022, 1:07 p.m. UTC | #2
On Mon, Apr 11, 2022 at 3:53 PM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 29-03-22 10:49:02, Amir Goldstein wrote:
> > The ioctl can be used to request allocation of marks with large size
> > and attach them to an object, even if another mark already exists for
> > the group on the marked object.
> >
> > These large marks serve no function other than testing direct reclaim
> > in the context of mark allocation.
> >
> > Setting the value to 0 restores normal mark allocation.
> >
> > FAN_MARK_REMOVE refers to the first mark of the group on an object, so
> > the number of FAN_MARK_REMOVE calls need to match the number of large
> > marks on the object in order to remove all marks from the object or use
> > FAN_MARK_FLUSH to remove all marks of that object type.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>
> I understand this is useful as a debugging patch but I'm not sure we want
> this permanently in the kernel. I'm wondering if generally more useful
> approach would not be to improve error injection for page allocations to
> allow easier stressing of direct reclaim...

I think you are probably right, but I had to stay within time budget
to create a reproducer and it took me a lot of time even with this hack
so I don't see myself investing more time on a reproducer with improved
error injections.

So for me, it is an adequate solution to carry this patch out-of-tree
along with a matching out-of tree patch for LTP test:

https://github.com/amir73il/ltp/commit/383db59959c44bb27dbec81e74d1d9caba45b0f2

For the community, we could rely on lockdep reports users now that
we sorted out the lockdep annotations.

BTW, before resorting to this ioctl I also started going down the path of
running the test inside a restricted memcg, only to figure out that it
is the inodes
that need to be evicted not the marks and inodes are not accounted to memcg.
I think there may have been some work in the direction of somehow accounting
inode cache to memcg, but not sure where this stands.

I am open for other suggestions.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 985e995d2a39..02990a6b1b65 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -1075,7 +1075,10 @@  static void fanotify_freeing_mark(struct fsnotify_mark *mark,
 
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+	if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_KMALLOC)
+		kfree(fsn_mark);
+	else
+		kmem_cache_free(fanotify_mark_cache, fsn_mark);
 }
 
 const struct fsnotify_ops fanotify_fsnotify_ops = {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 2c65038da4ce..a3539bd8e443 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -928,6 +928,16 @@  static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
 		break;
+	case FAN_IOC_SET_MARK_PAGE_ORDER:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		mutex_lock(&group->mark_mutex);
+		group->fanotify_data.mark_page_order = (unsigned int)arg;
+		pr_info("fanotify: set mark size page order to %u",
+			group->fanotify_data.mark_page_order);
+		ret = 0;
+		mutex_unlock(&group->mark_mutex);
+		break;
 	}
 
 	return ret;
@@ -1150,6 +1160,7 @@  static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 						   __kernel_fsid_t *fsid)
 {
 	struct ucounts *ucounts = group->fanotify_data.ucounts;
+	unsigned int order = group->fanotify_data.mark_page_order;
 	struct fsnotify_mark *mark;
 	int ret;
 
@@ -1162,7 +1173,21 @@  static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
 		return ERR_PTR(-ENOSPC);
 
-	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	/*
+	 * If requested to test direct reclaim in mark allocation context,
+	 * start by trying to allocate requested page order per mark and
+	 * fall back to allocation size that is likely to trigger direct
+	 * reclaim but not too large to trigger compaction.
+	 */
+	if (order) {
+		mark = kmalloc(PAGE_SIZE << order,
+			       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+		if (!mark && order > PAGE_ALLOC_COSTLY_ORDER)
+			mark = kmalloc(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER,
+				       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	} else {
+		mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	}
 	if (!mark) {
 		ret = -ENOMEM;
 		goto out_dec_ucounts;
@@ -1171,6 +1196,15 @@  static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 	fsnotify_init_mark(mark, group);
 	if (fan_flags & FAN_MARK_EVICTABLE)
 		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
+	/*
+	 * Allow adding multiple large marks per object for testing.
+	 * FAN_MARK_REMOVE refers to the first mark of the group, so one
+	 * FAN_MARK_REMOVE is needed for every added large mark (or use
+	 * FAN_MARK_FLUSH to remove all marks).
+	 */
+	if (order)
+		mark->flags |= FSNOTIFY_MARK_FLAG_KMALLOC |
+			       FSNOTIFY_MARK_FLAG_ALLOW_DUPS;
 
 	ret = fsnotify_add_mark_locked(mark, connp, obj_type, fsid);
 	if (ret) {
@@ -1201,11 +1235,13 @@  static int fanotify_add_mark(struct fsnotify_group *group,
 			     __u32 mask, unsigned int flags,
 			     __kernel_fsid_t *fsid)
 {
-	struct fsnotify_mark *fsn_mark;
+	struct fsnotify_mark *fsn_mark = NULL;
 	int ret = 0;
 
 	mutex_lock(&group->mark_mutex);
-	fsn_mark = fsnotify_find_mark(connp, group);
+	/* Allow adding multiple large marks per object for testing */
+	if (!group->fanotify_data.mark_page_order)
+		fsn_mark = fsnotify_find_mark(connp, group);
 	if (!fsn_mark) {
 		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, flags,
 						 fsid);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index df58439a86fa..8220cf560c28 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -250,6 +250,7 @@  struct fsnotify_group {
 			int f_flags; /* event_f_flags from fanotify_init() */
 			struct ucounts *ucounts;
 			mempool_t error_events_pool;
+			unsigned int mark_page_order; /* for testing only */
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
@@ -528,6 +529,7 @@  struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_ALLOW_DUPS		0x0040
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x0100
 #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
+#define FSNOTIFY_MARK_FLAG_KMALLOC		0x0400
 	unsigned int flags;		/* flags [mark->lock] */
 };
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index f1f89132d60e..49cdc9008bf2 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -3,6 +3,7 @@ 
 #define _UAPI_LINUX_FANOTIFY_H
 
 #include <linux/types.h>
+#include <linux/ioctl.h>
 
 /* the following events that user-space can register for */
 #define FAN_ACCESS		0x00000001	/* File was accessed */
@@ -206,4 +207,7 @@  struct fanotify_response {
 				(long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
 				(long)(meta)->event_len <= (long)(len))
 
+/* Only for testing. Not useful otherwise */
+#define	FAN_IOC_SET_MARK_PAGE_ORDER	_IOW(0xfa, 1, long)
+
 #endif /* _UAPI_LINUX_FANOTIFY_H */