diff mbox

[2/4,PoC,RFC] Add rlimit-events framework

Message ID 20171018203230.29871-3-k.opasiak@samsung.com (mailing list archive)
State New, archived
Headers show

Commit Message

Krzysztof Opasiak Oct. 18, 2017, 8:32 p.m. UTC
Add a framework which allows notifying userspace programs
about changes in resource usage (for the same resources as rlimits).

To monitor a process, a monitor FD has to be obtained from the
kernel using the rlimit-events netlink interface.
The monitor can then issue ioctl()s to subscribe to a particular
usage level of a given resource.
When the monitored process crosses the given usage level, the monitor
fd becomes readable and a resource change event can be read from it.

It's possible to monitor multiple processes, and a single
process can be monitored by multiple other processes.

Signed-off-by: Krzysztof Opasiak <k.opasiak@samsung.com>
---
 include/asm-generic/resource.h   |  37 +-
 include/linux/init_task.h        |   1 +
 include/linux/rlimit_noti_kern.h |  54 +++
 include/linux/sched/signal.h     |   6 +
 include/uapi/linux/netlink.h     |   1 +
 include/uapi/linux/rlimit_noti.h |  71 ++++
 init/Kconfig                     |   6 +
 kernel/Makefile                  |   1 +
 kernel/rlimit_noti.c             | 786 +++++++++++++++++++++++++++++++++++++++
 9 files changed, 962 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/rlimit_noti_kern.h
 create mode 100644 include/uapi/linux/rlimit_noti.h
 create mode 100644 kernel/rlimit_noti.c

Comments

Greg KH Oct. 19, 2017, 7:41 a.m. UTC | #1
Meta-comments on the code, I'm not commenting on the content, just
normal code review things that I always see in kernel code...

On Wed, Oct 18, 2017 at 10:32:28PM +0200, Krzysztof Opasiak wrote:
> diff --git a/include/linux/rlimit_noti_kern.h b/include/linux/rlimit_noti_kern.h
> new file mode 100644
> index 000000000000..e49fddaa21c0
> --- /dev/null
> +++ b/include/linux/rlimit_noti_kern.h
> @@ -0,0 +1,54 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.

I have to ask, do you really mean "any later version" for this, and the
other new files you created?

And, it is nice to use SPDX for new files to identify their license.
It's not that prevalent, but is getting there...

> --- a/include/uapi/linux/netlink.h
> +++ b/include/uapi/linux/netlink.h
> @@ -28,6 +28,7 @@
>  #define NETLINK_RDMA		20
>  #define NETLINK_CRYPTO		21	/* Crypto layer */
>  #define NETLINK_SMC		22	/* SMC monitoring */
> +#define NETLINK_RLIMIT_EVENTS   23      /* rlimit notification */

No tabs?

> --- /dev/null
> +++ b/include/uapi/linux/rlimit_noti.h
> @@ -0,0 +1,71 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.

GPLv2+ in a user api header file?  You are really brave :)

> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#ifndef _UAPI_LINUX_RLIMIT_NOTI_H_
> +#define _UAPI_LINUX_RLIMIT_NOTI_H_
> +
> +#ifdef __KERNEL__
> +#include <linux/types.h>
> +#include <linux/resource.h>
> +#else
> +#include <stdint.h>
> +#endif
> +
> +#define RLIMIT_GET_NOTI_FD 1000
> +
> +/* ioctl's */
> +#define RLIMIT_ADD_NOTI_LVL 1
> +#define RLIMIT_RM_NOTI_LVL 2
> +
> +#define RLIMIT_SET_NOTI_ALL 3
> +#define RLIMIT_CLEAR_NOTI_ALL 4

No tabs?

> +
> +/*
> + * For future (notify every 5, 10 units change):
> + * #define RLIMIT_SET_NOTI_STEP 5
> + */
> +
> +#define RLIMIT_GET_NOTI_LVLS 6
> +#define RLIMIT_GET_NOTI_LVL_COUNT 7
> +
> +/* Flags for ioctl's */
> +#define RLIMIT_FLAG_NO_INHERIT (1u << 0)
> +
> +/* Event types */
> +enum {
> +	RLIMIT_EVENT_TYPE_RES_CHANGED,
> +	RLIMIT_EVENT_TYPE_MAX
> +};
> +
> +/* TODO take care of padding (packed) */
> +struct rlimit_noti_subject {
> +	pid_t pid;
> +	uint32_t resource;
> +};

For structures that cross the user/kernel boundary, you have to use the
correct variable types.  And that is never "uint32_t" and such, use
"__u32" and the other "__" types.

And are you _sure_ that pid_t is able to be exported to userspace
correctly?

> +
> +struct rlimit_noti_level {
> +	struct rlimit_noti_subject subj;
> +	uint64_t value;

__u64

> +	uint32_t flags;

__u32

And so on for all others.

You don't seem to describe an ioctl here in the "normal" method, but
only use vague numbers up above, that's odd, why?

> diff --git a/init/Kconfig b/init/Kconfig
> index 1d3475fc9496..4bc44fa86640 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -332,6 +332,12 @@ config AUDIT_TREE
>  	depends on AUDITSYSCALL
>  	select FSNOTIFY
>  
> +config RLIMIT_NOTIFICATION
> +       bool "Support fd notifications on given resource usage"
> +       depends on NET
> +       help
> +	Enable this to monitor process resource changes usage via fd.

Mix of tab and spaces :(

thanks,

greg k-h
Krzysztof Opasiak Oct. 19, 2017, 6:17 p.m. UTC | #2
Hi,

On 10/19/2017 09:41 AM, Greg KH wrote:
> Meta-comments on the code, I'm not commenting on the content, just
> normal code review things that I always see in kernel code...
> 
> On Wed, Oct 18, 2017 at 10:32:28PM +0200, Krzysztof Opasiak wrote:
>> diff --git a/include/linux/rlimit_noti_kern.h b/include/linux/rlimit_noti_kern.h
>> new file mode 100644
>> index 000000000000..e49fddaa21c0
>> --- /dev/null
>> +++ b/include/linux/rlimit_noti_kern.h
>> @@ -0,0 +1,54 @@
>> +/*
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
> 
> I have to ask, do you really mean "any later version" for this, and the
> other new files you created?
> 

If it's about me then I have no problems with "any later version" of 
GPL, but it's not only me but also my company;)

To be honest, I copied this from a file created some time ago by one of 
my coworkers assuming that he followed the company procedures, but maybe 
he didn't as it's causing so much interest;)

I'll double check the company procedure and update this before sending 
v2. Thanks.

> And, it is nice to use SPDX for new files to identify their license.
> It's not that prevalent, but is getting there...

Ok I'll fix this using SPDX.

> 
>> --- a/include/uapi/linux/netlink.h
>> +++ b/include/uapi/linux/netlink.h
>> @@ -28,6 +28,7 @@
>>   #define NETLINK_RDMA		20
>>   #define NETLINK_CRYPTO		21	/* Crypto layer */
>>   #define NETLINK_SMC		22	/* SMC monitoring */
>> +#define NETLINK_RLIMIT_EVENTS   23      /* rlimit notification */
> 
> No tabs?

ahhh, my emacs is getting crazy after my last customization experiments. 
I'll fix this. It's weird that checkpatch didn't complain about this one.

> 
>> --- /dev/null
>> +++ b/include/uapi/linux/rlimit_noti.h
>> @@ -0,0 +1,71 @@
>> +/*
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
> 
> GPLv2+ in a user api header file?  You are really brave :)

Like above

> 
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + */
>> +
>> +#ifndef _UAPI_LINUX_RLIMIT_NOTI_H_
>> +#define _UAPI_LINUX_RLIMIT_NOTI_H_
>> +
>> +#ifdef __KERNEL__
>> +#include <linux/types.h>
>> +#include <linux/resource.h>
>> +#else
>> +#include <stdint.h>
>> +#endif
>> +
>> +#define RLIMIT_GET_NOTI_FD 1000
>> +
>> +/* ioctl's */
>> +#define RLIMIT_ADD_NOTI_LVL 1
>> +#define RLIMIT_RM_NOTI_LVL 2
>> +
>> +#define RLIMIT_SET_NOTI_ALL 3
>> +#define RLIMIT_CLEAR_NOTI_ALL 4
> 
> No tabs?
> 
>> +
>> +/*
>> + * For future (notify every 5, 10 units change):
>> + * #define RLIMIT_SET_NOTI_STEP 5
>> + */
>> +
>> +#define RLIMIT_GET_NOTI_LVLS 6
>> +#define RLIMIT_GET_NOTI_LVL_COUNT 7
>> +
>> +/* Flags for ioctl's */
>> +#define RLIMIT_FLAG_NO_INHERIT (1u << 0)
>> +
>> +/* Event types */
>> +enum {
>> +	RLIMIT_EVENT_TYPE_RES_CHANGED,
>> +	RLIMIT_EVENT_TYPE_MAX
>> +};
>> +
>> +/* TODO take care of padding (packed) */
>> +struct rlimit_noti_subject {
>> +	pid_t pid;
>> +	uint32_t resource;
>> +};
> 
> For structures that cross the user/kernel boundary, you have to use the
> correct variable types.  And that is never "uint32_t" and such, use
> "__u32" and the other "__" types.
> 
> And are you _sure_ that pid_t is able to be exported to userspace
> correctly?

Hmmm it's used in kernel headers alongside __kernel_pid_t, but the 
latter one is just a typedef from include/linux/types.h:

typedef __kernel_pid_t            pid_t;

but if you think I should use __kernel_pid_t then I'll fix this.

> 
>> +
>> +struct rlimit_noti_level {
>> +	struct rlimit_noti_subject subj;
>> +	uint64_t value;
> 
> __u64
> 
>> +	uint32_t flags;
> 
> __u32
> 
> And so on for all others.

I'll fix this for v2.

> 
> You don't seem to describe an ioctl here in the "normal" method, but
> only use vague numbers up above, that's odd, why?

Sorry, there is no real reason.

Just started with numbers to prepare some working prototype to show the 
concept before doing whole implementation and forgot to fix this.

> 
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 1d3475fc9496..4bc44fa86640 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -332,6 +332,12 @@ config AUDIT_TREE
>>   	depends on AUDITSYSCALL
>>   	select FSNOTIFY
>>   
>> +config RLIMIT_NOTIFICATION
>> +       bool "Support fd notifications on given resource usage"
>> +       depends on NET
>> +       help
>> +	Enable this to monitor process resource changes usage via fd.
> 
> Mix of tab and spaces :(
> 

Sorry, I'll fix this. I'm curious why checkpatch didn't catch this. It 
reported some whitespace errors and I fixed all of them but they are 
still in there:(

Best regards,
diff mbox

Patch

diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
index 5e752b959054..338f20ba7e56 100644
--- a/include/asm-generic/resource.h
+++ b/include/asm-generic/resource.h
@@ -2,7 +2,7 @@ 
 #define _ASM_GENERIC_RESOURCE_H
 
 #include <uapi/asm-generic/resource.h>
-
+#include <linux/spinlock.h>
 
 /*
  * boot-time rlimit defaults for the init task:
@@ -27,4 +27,39 @@ 
 	[RLIMIT_RTTIME]		= {  RLIM_INFINITY,  RLIM_INFINITY },	\
 }
 
+#ifdef CONFIG_RLIMIT_NOTIFICATION
+
+/* Designated initializer making watchers[limit] an empty list head. */
+#define INIT_RLIMIT_WATCHER(watchers, limit)	\
+	[limit] = LIST_HEAD_INIT(watchers[limit])
+
+/* Static initializer for the whole per-resource watcher list array. */
+#define INIT_RLIMIT_WATCHERS(watchers)				\
+{								\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_CPU),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_FSIZE),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_DATA),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_STACK),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_CORE),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_RSS),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_NPROC),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_NOFILE),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_MEMLOCK),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_AS),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_LOCKS),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_SIGPENDING),	\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_MSGQUEUE),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_NICE),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_RTPRIO),		\
+	INIT_RLIMIT_WATCHER(watchers, RLIMIT_RTTIME),		\
+}
+
+/*
+ * Static initializer for the rlimit_events_ctx member of the signal struct
+ * named by @sig: unlocked spinlock, empty watcher lists, process not dead.
+ * Expands to nothing when CONFIG_RLIMIT_NOTIFICATION is disabled.
+ */
+#define INIT_RLIMIT_EVENTS_CTX(sig)					\
+.rlimit_events_ctx = {						\
+	.lock = __SPIN_LOCK_UNLOCKED(sig.rlimit_events_ctx.lock),	\
+	.watchers = INIT_RLIMIT_WATCHERS(sig.rlimit_events_ctx.watchers),\
+	.process_dead = 0,						\
+},
+#else
+#define INIT_RLIMIT_EVENTS_CTX(sig)
+#endif /* CONFIG_RLIMIT_NOTIFICATION */
+
 #endif
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc188..65400b376b92 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -76,6 +76,7 @@  extern struct fs_struct init_fs;
 	INIT_POSIX_TIMERS(sig)						\
 	INIT_CPU_TIMERS(sig)						\
 	.rlim		= INIT_RLIMITS,					\
+	INIT_RLIMIT_EVENTS_CTX(sig)					\
 	INIT_CPUTIMER(sig)						\
 	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
diff --git a/include/linux/rlimit_noti_kern.h b/include/linux/rlimit_noti_kern.h
new file mode 100644
index 000000000000..e49fddaa21c0
--- /dev/null
+++ b/include/linux/rlimit_noti_kern.h
@@ -0,0 +1,54 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_RLIMIT_NOTI_H_
+#define _LINUX_RLIMIT_NOTI_H_
+
+#include <uapi/linux/rlimit_noti.h>
+
+/* Per-process (signal_struct) state of the rlimit notification framework. */
+struct rlimit_noti_ctx {
+	/* for modification protection */
+	spinlock_t lock;
+	/* protected by RCU; one list of watchers per RLIMIT_* resource */
+	struct list_head watchers[RLIM_NLIMITS];
+
+	/* set once the process starts tearing down its watchers */
+	unsigned process_dead:1;
+};
+
+#ifdef CONFIG_RLIMIT_NOTIFICATION
+
+/* Duplicate @parent's watchers for a newly forked @child; 0 or -errno. */
+int rlimit_noti_task_fork(struct task_struct *parent,
+			  struct task_struct *child);
+
+/* Detach and free all watchers of an exiting thread-group leader. */
+void rlimit_noti_task_exit(struct task_struct *tsk);
+
+/* Non-zero when at least one watcher is registered for @res on @tsk. */
+int rlimit_noti_watch_active(struct task_struct *tsk, unsigned int res);
+
+/* Report that usage of @res by @tsk changed from @old to @new. */
+void rlimit_noti_res_changed(struct task_struct *tsk, unsigned int res,
+			     uint64_t old, uint64_t new);
+
+#else
+
+/*
+ * No-op stubs so that callers do not need their own #ifdefs when the
+ * framework is compiled out.
+ */
+static inline int rlimit_noti_task_fork(struct task_struct *parent,
+					struct task_struct *child)
+{
+	return 0;
+}
+
+static inline void rlimit_noti_task_exit(struct task_struct *tsk)
+{
+}
+
+static inline int rlimit_noti_watch_active(struct task_struct *tsk,
+					   unsigned int res)
+{
+	return 0;
+}
+
+static inline void rlimit_noti_res_changed(struct task_struct *tsk,
+					   unsigned int res,
+					   uint64_t old, uint64_t new)
+{
+}
+
+#endif /* CONFIG_RLIMIT_NOTIFICATION */
+#endif /* _LINUX_RLIMIT_NOTI_H_ */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 05cef037fbf2..36849df51c5b 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -7,7 +7,9 @@ 
 #include <linux/sched/jobctl.h>
 #include <linux/sched/task.h>
 #include <linux/cred.h>
+#include <linux/list.h>
 
+#include <linux/rlimit_noti_kern.h>
 /*
  * Types defining task->signal and task->sighand and APIs using them:
  */
@@ -197,6 +199,10 @@  struct signal_struct {
 	 */
 	struct rlimit rlim[RLIM_NLIMITS];
 
+#ifdef CONFIG_RLIMIT_NOTIFICATION
+	struct rlimit_noti_ctx rlimit_events_ctx;
+#endif
+
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct pacct_struct pacct;	/* per-process accounting information */
 #endif
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f86127a46cfc..24b55805d607 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -28,6 +28,7 @@ 
 #define NETLINK_RDMA		20
 #define NETLINK_CRYPTO		21	/* Crypto layer */
 #define NETLINK_SMC		22	/* SMC monitoring */
+#define NETLINK_RLIMIT_EVENTS	23	/* rlimit notification */
 
 #define NETLINK_INET_DIAG	NETLINK_SOCK_DIAG
 
diff --git a/include/uapi/linux/rlimit_noti.h b/include/uapi/linux/rlimit_noti.h
new file mode 100644
index 000000000000..a15a2826bce9
--- /dev/null
+++ b/include/uapi/linux/rlimit_noti.h
@@ -0,0 +1,71 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_RLIMIT_NOTI_H_
+#define _UAPI_LINUX_RLIMIT_NOTI_H_
+
+#include <linux/types.h>
+#ifdef __KERNEL__
+#include <linux/resource.h>
+#else
+#include <stdint.h>
+#endif
+
+#define RLIMIT_GET_NOTI_FD 1000
+
+/* ioctl's */
+#define RLIMIT_ADD_NOTI_LVL 1
+#define RLIMIT_RM_NOTI_LVL 2
+
+#define RLIMIT_SET_NOTI_ALL 3
+#define RLIMIT_CLEAR_NOTI_ALL 4
+
+/*
+ * For future (notify every 5, 10 units change):
+ * #define RLIMIT_SET_NOTI_STEP 5
+ */
+
+#define RLIMIT_GET_NOTI_LVLS 6
+#define RLIMIT_GET_NOTI_LVL_COUNT 7
+
+/* Flags for ioctl's */
+#define RLIMIT_FLAG_NO_INHERIT (1u << 0)
+
+/* Event types */
+enum {
+	RLIMIT_EVENT_TYPE_RES_CHANGED,
+	RLIMIT_EVENT_TYPE_MAX
+};
+
+/*
+ * Structures shared between kernel and userspace. They use the exported
+ * __u32/__u64/__kernel_* types so that the header does not depend on libc
+ * typedefs; the memory layout is the same as with the libc types.
+ *
+ * TODO take care of padding (packed)
+ */
+
+/* What is being watched: a process and one of its RLIMIT_* resources. */
+struct rlimit_noti_subject {
+	__kernel_pid_t pid;
+	__u32 resource;
+};
+
+/* Argument of RLIMIT_ADD_NOTI_LVL: notify when usage crosses @value. */
+struct rlimit_noti_level {
+	struct rlimit_noti_subject subj;
+	__u64 value;
+	/* presumably a mask of RLIMIT_FLAG_* bits - TODO confirm */
+	__u32 flags;
+};
+
+/* Header preceding every event read from the notification fd. */
+struct rlimit_event {
+	/* one of RLIMIT_EVENT_TYPE_* */
+	__u32 ev_type;
+	/*
+	 * Size in bytes of this header plus the payload that follows.
+	 * TODO: width differs between 32-bit and 64-bit userspace.
+	 */
+	__kernel_size_t size;
+};
+
+/* Payload of an RLIMIT_EVENT_TYPE_RES_CHANGED event. */
+struct rlimit_event_res_changed {
+	struct rlimit_noti_subject subj;
+	__u64 new_value;
+};
+
+#endif /* _UAPI_LINUX_RLIMIT_NOTI_H_ */
diff --git a/init/Kconfig b/init/Kconfig
index 1d3475fc9496..4bc44fa86640 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -332,6 +332,12 @@  config AUDIT_TREE
 	depends on AUDITSYSCALL
 	select FSNOTIFY
 
+config RLIMIT_NOTIFICATION
+	bool "Support fd notifications on given resource usage"
+	depends on NET
+	help
+	  Enable this to monitor changes in process resource usage via fd.
+
 source "kernel/irq/Kconfig"
 source "kernel/time/Kconfig"
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..d927d41c35f5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -76,6 +76,7 @@  obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_RLIMIT_NOTIFICATION) += rlimit_noti.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
diff --git a/kernel/rlimit_noti.c b/kernel/rlimit_noti.c
new file mode 100644
index 000000000000..a4fe5b9dd02b
--- /dev/null
+++ b/kernel/rlimit_noti.c
@@ -0,0 +1,786 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Netlink communication strongly based on audit.c.
+ */
+
+#include <linux/rlimit_noti.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <net/netns/generic.h>
+
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
+#include <linux/spinlock.h>
+
+/* Array of per-resource watcher lists of the signal struct @sig. */
+#define sig_watchers(sig) ((sig)->rlimit_events_ctx.watchers)
+
+/*
+ * Iterate @wlist (a struct list_head *) over every per-resource watcher
+ * list head of @sig. Arguments are parenthesized so that arbitrary
+ * expressions can be passed safely.
+ */
+#define sig_for_each_res(wlist, sig)					\
+	for ((wlist) = &sig_watchers(sig)[0];				\
+	     (wlist) - &sig_watchers(sig)[0] < ARRAY_SIZE(sig_watchers(sig)); \
+	     ++(wlist))
+
+/*
+ * One queued event waiting to be read from a notification fd.
+ * The read path copies ev.size bytes starting at &ev, i.e. the header
+ * followed by the payload held in event_data.
+ */
+struct rlimit_event_list {
+	struct rlimit_event ev;
+	union {
+		struct rlimit_event_res_changed rchanged;
+	} event_data;
+	struct list_head node;
+};
+
+/*
+ * Size of the largest event payload (the event_data union). Note that this
+ * does not include the struct rlimit_event header.
+ */
+#define MAX_RLIMIT_EVENT_SIZE ({					\
+			struct rlimit_event_list *_rl = NULL;	\
+			sizeof(_rl->event_data);		\
+})
+
+/* State behind one notification file descriptor. */
+struct rlimit_watch_fd_ctx {
+	/* one reference for the open file plus one per watcher */
+	struct kref kref;
+
+	/* protects watchers and fd_invalid */
+	spinlock_t noti_ctx_lock;
+	/* all watchers created through this fd (linked via ctx_node) */
+	struct list_head watchers;
+	/* set on release; no watcher may be linked to this ctx afterwards */
+	unsigned fd_invalid:1;
+
+	/* protects events */
+	spinlock_t events_lock;
+	/* readers sleep here until an event is queued */
+	wait_queue_head_t events_queue;
+	/* queue of struct rlimit_event_list not yet read */
+	struct list_head events;
+};
+
+/* One subscription: a notification fd watching a resource of a process. */
+struct rlimit_watcher {
+	/* used to defer freeing via call_rcu() */
+	struct rcu_head rcu;
+	/* owning notification fd context (reference held) */
+	struct rlimit_watch_fd_ctx *ctx;
+	/* watched process (reference held) */
+	struct signal_struct *signal;
+
+	/* link in the watched process' per-resource list (RCU) */
+	struct list_head tsk_node;
+	/* link in ctx->watchers */
+	struct list_head ctx_node;
+
+	/* usage level whose crossing triggers an event */
+	uint64_t value;
+	/* when set, every change triggers an event regardless of value */
+	unsigned noti_all_changes:1;
+};
+
+/******************************************************************************
+ * Public API
+ ******************************************************************************/
+
+/* kref release callback: frees the fd context embedding @kref. */
+static void release_ctx(struct kref *kref)
+{
+	kfree(container_of(kref, struct rlimit_watch_fd_ctx, kref));
+}
+
+/*
+ * Allocate a watcher bound to @ctx and @signal.
+ *
+ * Takes a reference on both @ctx and @signal; they are dropped again by
+ * free_rlimit_watcher(). The watcher is not linked into any list yet.
+ * GFP_ATOMIC is used because callers invoke this under spinlocks.
+ *
+ * Returns the new watcher or ERR_PTR(-ENOMEM).
+ */
+static struct rlimit_watcher *alloc_rlimit_watcher(
+	struct rlimit_watch_fd_ctx *ctx, struct signal_struct *signal,
+	uint64_t value, bool noti_all)
+{
+	struct rlimit_watcher *w;
+
+	w = kzalloc(sizeof(*w), GFP_ATOMIC);
+	if (!w)
+		/* errno must be negative, otherwise IS_ERR() misses it */
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&w->tsk_node);
+	INIT_LIST_HEAD(&w->ctx_node);
+
+	w->ctx = ctx;
+	kref_get(&ctx->kref);
+	w->signal = signal;
+	get_signal_struct(signal);
+	w->value = value;
+	w->noti_all_changes = noti_all;
+
+	return w;
+}
+
+/* Drop the references held by @w and free it; a NULL @w is ignored. */
+static void free_rlimit_watcher(struct rlimit_watcher *w)
+{
+	if (w) {
+		kref_put(&w->ctx->kref, release_ctx);
+		put_signal_struct(w->signal);
+		kfree(w);
+	}
+}
+
+/* call_rcu() callback: frees the watcher that embeds @head. */
+static void free_rlimit_watcher_rcu(struct rcu_head *head)
+{
+	struct rlimit_watcher *w;
+
+	w = container_of(head, struct rlimit_watcher, rcu);
+	free_rlimit_watcher(w);
+}
+
+/*
+ * Create a copy of @org (same fd context, level and mode) that watches
+ * the process of @new_owner instead.
+ */
+static inline struct rlimit_watcher *rlimit_watcher_dup(
+	struct rlimit_watcher *org, struct task_struct *new_owner)
+{
+	struct signal_struct *target = new_owner->signal;
+
+	return alloc_rlimit_watcher(org->ctx, target, org->value,
+				    org->noti_all_changes);
+}
+
+/*
+ * Make a newly forked process inherit the watchers of its parent.
+ *
+ * Phase 1 (under the parent's lock): duplicate every watcher of @parent
+ * onto the matching per-resource list of @child.
+ * Phase 2 (parent's lock dropped): link each duplicate into its fd
+ * context, or discard it when the fd has been closed in the meantime.
+ *
+ * Returns 0 on success or a negative errno; on error all duplicates
+ * created so far are released.
+ *
+ * This is not called for threads
+ */
+int rlimit_noti_task_fork(struct task_struct *parent, struct task_struct *child)
+{
+	struct rlimit_watcher *w, *nw;
+	struct signal_struct *sig = child->signal;
+	struct signal_struct *psig = parent->signal;
+	unsigned long flags;
+	struct list_head *iter, *piter;
+	int ret;
+
+	/*
+	 * init all list to avoid leaving uninitialized lists
+	 * in case of error
+	 */
+	sig_for_each_res(iter, sig)
+		INIT_LIST_HEAD(iter);
+
+	spin_lock_init(&sig->rlimit_events_ctx.lock);
+	sig->rlimit_events_ctx.process_dead = 0;
+
+	/* Lock the list to be safe against modification */
+	spin_lock_irqsave(&psig->rlimit_events_ctx.lock, flags);
+
+	sig_for_each_res(iter, sig) {
+		/*
+		 * iter walks the child's (still empty) lists; the watchers
+		 * to copy live on the parent's list for the same resource.
+		 */
+		piter = &sig_watchers(psig)[iter - &sig_watchers(sig)[0]];
+
+		list_for_each_entry(w, piter, tsk_node) {
+			nw = rlimit_watcher_dup(w, child);
+			/* the allocator reports failure via ERR_PTR() */
+			if (IS_ERR(nw)) {
+				spin_unlock_irqrestore(
+					&psig->rlimit_events_ctx.lock,
+					flags);
+				ret = PTR_ERR(nw);
+				goto cleanup;
+			}
+
+			/*
+			 * For now we put this only on task side list
+			 * to avoid deadlock (ABBA)
+			 *
+			 * We assume that no one can access this new task
+			 * for now so we don't use any locking here
+			 */
+			list_add_tail_rcu(&nw->tsk_node, iter);
+		}
+	}
+
+	/*
+	 * now we got all watchers on our brand new list so we can release
+	 * parent lock and allow modification of its list
+	 */
+	spin_unlock_irqrestore(&psig->rlimit_events_ctx.lock, flags);
+
+	sig_for_each_res(iter, sig) {
+start_again:
+		rcu_read_lock();
+		list_for_each_entry_rcu(w, iter, tsk_node) {
+			spin_lock_irqsave(&w->ctx->noti_ctx_lock, flags);
+			if (list_empty(&w->ctx_node)) {
+				if (!w->ctx->fd_invalid) {
+					list_add_tail(&w->ctx_node,
+						      &w->ctx->watchers);
+				} else {
+					/*
+					 * fd already closed: drop the
+					 * duplicate and rescan the list.
+					 */
+					spin_lock(&sig->rlimit_events_ctx.lock);
+					list_del_rcu(&w->tsk_node);
+					call_rcu(&w->rcu,
+						 free_rlimit_watcher_rcu);
+					spin_unlock(
+						&sig->rlimit_events_ctx.lock);
+					/*
+					 * Release the ctx lock before
+					 * restarting; w stays valid until
+					 * rcu_read_unlock().
+					 */
+					spin_unlock_irqrestore(
+						&w->ctx->noti_ctx_lock, flags);
+					rcu_read_unlock();
+					goto start_again;
+				}
+			}
+			spin_unlock_irqrestore(&w->ctx->noti_ctx_lock, flags);
+		}
+		rcu_read_unlock();
+	}
+
+	return 0;
+cleanup:
+	/* duplicates are linked only through tsk_node at this point */
+	sig_for_each_res(iter, sig) {
+		while (!list_empty(iter)) {
+			w = list_first_entry(iter,
+					     struct rlimit_watcher, tsk_node);
+			list_del_init(&w->tsk_node);
+			call_rcu(&w->rcu, free_rlimit_watcher_rcu);
+		}
+	}
+	return ret;
+}
+
+/*
+ * Tear down all watchers attached to an exiting process.
+ *
+ * Only acts when @tsk is the thread-group leader. Marks the process as dead
+ * so that no new watchers are added, unlinks every watcher from its fd
+ * context and then frees the watchers after an RCU grace period.
+ */
+void rlimit_noti_task_exit(struct task_struct *tsk)
+{
+	struct rlimit_watcher *w;
+	struct rlimit_noti_ctx *n_ctx = &tsk->signal->rlimit_events_ctx;
+	unsigned long flags;
+	struct list_head *head;
+
+	if (tsk != tsk->group_leader)
+		return;
+
+	/*
+	 * Let's mark that we are in the middle of cleaning up
+	 * to prevent new watchers from being added to the list
+	 */
+	spin_lock_irqsave(&n_ctx->lock, flags);
+	WARN_ON(n_ctx->process_dead);
+	n_ctx->process_dead = true;
+	spin_unlock_irqrestore(&n_ctx->lock, flags);
+
+	sig_for_each_res(head, tsk->signal) {
+		/*
+		 * Let's go through the list and remove watchers from their
+		 * respective fd contexts.
+		 */
+		rcu_read_lock();
+		list_for_each_entry_rcu(w, head, tsk_node) {
+			spin_lock_irqsave(&w->ctx->noti_ctx_lock, flags);
+			/*
+			 * List empty means that between iteration and acquiring
+			 * lock this watcher has been already removed and
+			 * it's just hanging due to grace period
+			 */
+			if (!list_empty(&w->ctx_node)
+			    && !list_empty(&w->tsk_node))
+				list_del_init(&w->ctx_node);
+
+			spin_unlock_irqrestore(&w->ctx->noti_ctx_lock, flags);
+		}
+		rcu_read_unlock();
+
+		/* Now let's cleanup our list */
+		spin_lock_irqsave(&n_ctx->lock, flags);
+		while (!list_empty(head)) {
+			w = list_first_entry(head,
+					     struct rlimit_watcher, tsk_node);
+			list_del_rcu(&w->tsk_node);
+			call_rcu(&w->rcu, free_rlimit_watcher_rcu);
+		}
+		spin_unlock_irqrestore(&n_ctx->lock, flags);
+	}
+}
+
+/*
+ * Queue a RLIMIT_EVENT_TYPE_RES_CHANGED event on @ctx describing that
+ * @resource of @tsk now has usage @new, and wake up sleeping readers.
+ * @mflags are the allocation flags. Returns 0 or -ENOMEM.
+ */
+static int rlimit_generate_res_changed_event(struct rlimit_watch_fd_ctx *ctx,
+					     struct task_struct *tsk,
+					     unsigned int resource,
+					     uint64_t new, int mflags)
+{
+	struct rlimit_event_list *entry = kzalloc(sizeof(*entry), mflags);
+	struct rlimit_event_res_changed *payload;
+	unsigned long irq_flags;
+
+	if (!entry)
+		return -ENOMEM;
+
+	/* TODO add here support for PID namespace */
+	payload = &entry->event_data.rchanged;
+	payload->subj.pid = tsk->pid;
+	payload->subj.resource = resource;
+	payload->new_value = new;
+
+	/* header: type plus total size of header and payload */
+	entry->ev.ev_type = RLIMIT_EVENT_TYPE_RES_CHANGED;
+	entry->ev.size = sizeof(struct rlimit_event)
+		+ sizeof(struct rlimit_event_res_changed);
+
+	INIT_LIST_HEAD(&entry->node);
+
+	spin_lock_irqsave(&ctx->events_lock, irq_flags);
+	list_add_tail(&entry->node, &ctx->events);
+	wake_up_interruptible(&ctx->events_queue);
+	spin_unlock_irqrestore(&ctx->events_lock, irq_flags);
+
+	return 0;
+}
+
+/* Returns non-zero when @tsk's process has any watcher on resource @res. */
+int rlimit_noti_watch_active(struct task_struct *tsk, unsigned int res)
+{
+	struct list_head *head = &tsk->signal->rlimit_events_ctx.watchers[res];
+
+	return !list_empty(head);
+}
+
+/*
+ * Notify watchers that usage of @res by @tsk changed from @old to @new.
+ *
+ * An event is generated for every watcher that asked for all changes or
+ * whose level lies in (old, new] (usage rose) or (new, old] (usage fell).
+ */
+void rlimit_noti_res_changed(struct task_struct *tsk, unsigned int res,
+			     uint64_t old, uint64_t new)
+{
+	struct list_head *head = &tsk->signal->rlimit_events_ctx.watchers[res];
+	struct rlimit_watcher *w;
+
+	rcu_read_lock();
+	/* TODO this should be replaced with sth faster */
+	list_for_each_entry_rcu(w, head, tsk_node) {
+		bool rose_past = w->value > old && w->value <= new;
+		bool fell_past = w->value > new && w->value <= old;
+
+		if (!w->noti_all_changes && !rose_past && !fell_past)
+			continue;
+
+		/* ignore error as there is nothing we can do */
+		rlimit_generate_res_changed_event(w->ctx, tsk,
+						  res, new, GFP_ATOMIC);
+	}
+	rcu_read_unlock();
+}
+
+/******************************************************************************
+ * FD part
+ ******************************************************************************/
+
+/*
+ * Register a new watcher of @ctx on @resource of the process @tsk
+ * belongs to.
+ *
+ * @value is the usage level to watch; @noti_all requests an event on every
+ * change instead. The watcher is linked into both the fd context list and
+ * the process' per-resource list.
+ *
+ * Returns 0 on success, -EINVAL for an invalid resource, -ESRCH when the
+ * task has no sighand, -ENOENT when the process is already cleaning up its
+ * watchers, or the allocation error.
+ */
+static int add_new_watcher(struct rlimit_watch_fd_ctx *ctx,
+			   struct task_struct *tsk,
+			   int resource, uint64_t value, bool noti_all)
+{
+	struct rlimit_watcher *w;
+	struct signal_struct *signal;
+	unsigned long flags;
+	int ret = 0;
+
+	/*
+	 * resource comes from an unsigned userspace field, so large values
+	 * show up negative here and must be rejected as well.
+	 */
+	if (resource < 0 || resource >= RLIM_NLIMITS)
+		return -EINVAL;
+
+	read_lock(&tasklist_lock);
+	if (!tsk->sighand) {
+		ret = -ESRCH;
+		goto unlock_read;
+	}
+
+	task_lock(tsk->group_leader);
+	signal = tsk->signal;
+
+	w = alloc_rlimit_watcher(ctx, signal, value, noti_all);
+	if (IS_ERR(w)) {
+		ret = PTR_ERR(w);
+		goto unlock_group_leader;
+	}
+
+	spin_lock_irqsave(&ctx->noti_ctx_lock, flags);
+	/*
+	 * First add it to ctx list as we are holding its lock
+	 * and no one is going to modify or iterate it
+	 */
+	list_add_tail(&w->ctx_node, &ctx->watchers);
+	/* Now take the process side lock and add it to the RCU protected list */
+	spin_lock(&signal->rlimit_events_ctx.lock);
+
+	/* If process is in the middle of cleanup let's rollback everything */
+	if (!signal->rlimit_events_ctx.process_dead) {
+		/* list_add_tail_rcu(new, head): new entry first, list second */
+		list_add_tail_rcu(&w->tsk_node,
+				  &signal->rlimit_events_ctx.watchers[resource]);
+		ret = 0;
+	} else {
+		list_del(&w->ctx_node);
+		free_rlimit_watcher(w);
+		ret = -ENOENT;
+	}
+
+	spin_unlock(&signal->rlimit_events_ctx.lock);
+	spin_unlock_irqrestore(&ctx->noti_ctx_lock, flags);
+unlock_group_leader:
+	task_unlock(tsk->group_leader);
+unlock_read:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * read() handler of the notification fd: return one queued event.
+ *
+ * Sleeps (interruptibly) until an event is available. The user buffer must
+ * be able to hold the event header plus the largest payload.
+ *
+ * Returns the number of bytes copied, -EINVAL when the buffer is too small,
+ * -ERESTARTSYS when interrupted by a signal, or -EFAULT when the copy to
+ * userspace fails (the event then stays queued).
+ */
+ssize_t rlimit_noti_read_event(struct file *file, char __user *buf,
+			       size_t size, loff_t *ptr)
+{
+	struct rlimit_watch_fd_ctx *ctx = file->private_data;
+	struct rlimit_event_list *ev_list;
+	unsigned long flags;
+	ssize_t ret;
+
+	/*
+	 * TODO allow to read only part of event
+	 *
+	 * ev.size bytes are copied below (header + payload), so checking
+	 * against the payload size alone would overrun the user buffer.
+	 */
+	if (size < sizeof(struct rlimit_event) + MAX_RLIMIT_EVENT_SIZE)
+		return -EINVAL;
+
+	spin_lock_irqsave(&ctx->events_lock, flags);
+#define READ_COND (!list_empty(&ctx->events))
+	while (!READ_COND) {
+		spin_unlock_irqrestore(&ctx->events_lock, flags);
+
+		if (wait_event_interruptible(ctx->events_queue, READ_COND))
+			return -ERESTARTSYS;
+		spin_lock_irqsave(&ctx->events_lock, flags);
+	}
+#undef READ_COND
+
+	ev_list = list_first_entry(&ctx->events,
+				   struct rlimit_event_list, node);
+	list_del(&ev_list->node);
+	spin_unlock_irqrestore(&ctx->events_lock, flags);
+
+	/* copy_to_user() returns the number of bytes NOT copied */
+	if (copy_to_user(buf, &ev_list->ev, ev_list->ev.size)) {
+		/* put the event back at the head so that it is not lost */
+		spin_lock_irqsave(&ctx->events_lock, flags);
+		list_add(&ev_list->node, &ctx->events);
+		spin_unlock_irqrestore(&ctx->events_lock, flags);
+		return -EFAULT;
+	}
+
+	ret = ev_list->ev.size;
+	kfree(ev_list);
+
+	return ret;
+}
+
+
+/*
+ * poll() handler of the notification fd. Always reports POLLWRNORM and
+ * additionally POLLIN when at least one event is queued.
+ */
+unsigned int rlimit_noti_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct rlimit_watch_fd_ctx *ctx = file->private_data;
+	unsigned long flags;
+	bool has_events;
+
+	poll_wait(file, &ctx->events_queue, wait);
+
+	spin_lock_irqsave(&ctx->events_lock, flags);
+	has_events = !list_empty(&ctx->events);
+	/* TODO add notification when last process exited */
+	spin_unlock_irqrestore(&ctx->events_lock, flags);
+
+	if (has_events)
+		return POLLWRNORM | POLLIN;
+
+	return POLLWRNORM;
+}
+
+
+/*
+ * ioctl() handler of the notification fd.
+ *
+ * RLIMIT_ADD_NOTI_LVL takes a struct rlimit_noti_level and subscribes to
+ * the given usage level; RLIMIT_SET_NOTI_ALL takes a
+ * struct rlimit_noti_subject and subscribes to every change of the
+ * resource. The remaining commands are not implemented yet.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long rlimit_noti_ioctl(struct file *file,
+			      unsigned int cmd, unsigned long arg)
+{
+	struct rlimit_watch_fd_ctx *ctx = file->private_data;
+	struct task_struct *tsk;
+	struct rlimit_noti_level nlvl;
+	bool noti_all = false;
+	int ret;
+
+	switch (cmd) {
+	case RLIMIT_SET_NOTI_ALL:
+		if (copy_from_user(&nlvl.subj,
+				   (void __user *)arg, sizeof(nlvl.subj)))
+			return -EFAULT;
+
+		nlvl.value = 0;
+		noti_all = true;
+		goto set_watch;
+
+	case RLIMIT_ADD_NOTI_LVL:
+		if (copy_from_user(&nlvl, (void __user *)arg, sizeof(nlvl)))
+			return -EFAULT;
+set_watch:
+		rcu_read_lock();
+		tsk = find_task_by_vpid(nlvl.subj.pid);
+		if (!tsk) {
+			rcu_read_unlock();
+			printk(KERN_DEBUG "No PID in current NS\n");
+			return -EINVAL;
+		}
+
+		/* pin the task so it stays valid outside the RCU section */
+		get_task_struct(tsk);
+		rcu_read_unlock();
+
+		/* TODO check for duplicates before adding */
+		/* pass noti_all through, otherwise SET_NOTI_ALL has no effect */
+		ret = add_new_watcher(ctx, tsk, nlvl.subj.resource,
+				      nlvl.value, noti_all);
+		put_task_struct(tsk);
+		break;
+
+	case RLIMIT_CLEAR_NOTI_ALL:
+	case RLIMIT_RM_NOTI_LVL:
+	case RLIMIT_GET_NOTI_LVLS:
+	case RLIMIT_GET_NOTI_LVL_COUNT:
+		/*
+		 * TODO: Implement me
+		 *
+		 * ENOTSUPP is kernel internal and unknown to userspace,
+		 * so report EOPNOTSUPP instead.
+		 */
+		ret = -EOPNOTSUPP;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/*
+ * release() handler of the notification fd.
+ *
+ * Marks the context invalid, unlinks all of its watchers from the watched
+ * processes and from the context, waits for an RCU grace period, discards
+ * the events that were never read and drops the file's reference on the
+ * context. Always returns 0.
+ */
+static int rlimit_noti_release(struct inode *inode, struct file *file)
+{
+	struct rlimit_watch_fd_ctx *ctx = file->private_data;
+	struct rlimit_watcher *w;
+	struct rlimit_event_list *ev_list;
+	unsigned long flags;
+
+	/* Clean up watchers */
+	spin_lock_irqsave(&ctx->noti_ctx_lock, flags);
+	ctx->fd_invalid = 1;
+	/* first unlink every watcher from the process it watches */
+	list_for_each_entry(w, &ctx->watchers, ctx_node) {
+		spin_lock(&w->signal->rlimit_events_ctx.lock);
+		list_del_rcu(&w->tsk_node);
+		spin_unlock(&w->signal->rlimit_events_ctx.lock);
+	}
+
+	/* then empty the context list and free watchers after a grace period */
+	while (!list_empty(&ctx->watchers)) {
+		w = list_first_entry(&ctx->watchers,
+				     struct rlimit_watcher, ctx_node);
+		list_del_init(&w->ctx_node);
+		call_rcu(&w->rcu, free_rlimit_watcher_rcu);
+	}
+
+	spin_unlock_irqrestore(&ctx->noti_ctx_lock, flags);
+
+	/* to ensure that no more events will be generated */
+	synchronize_rcu();
+
+	/* drop events that nobody read */
+	spin_lock_irqsave(&ctx->events_lock, flags);
+	while (!list_empty(&ctx->events)) {
+		ev_list = list_first_entry(&ctx->events,
+					   struct rlimit_event_list, node);
+		list_del(&ev_list->node);
+		kfree(ev_list);
+	}
+	spin_unlock_irqrestore(&ctx->events_lock, flags);
+
+	kref_put(&ctx->kref, release_ctx);
+
+	return 0;
+}
+
+/* File operations backing the anonymous "rlimit_noti" monitor fd. */
+static const struct file_operations rlimit_noti_fops = {
+	.read		= rlimit_noti_read_event,
+	.poll		= rlimit_noti_poll,
+	.unlocked_ioctl	= rlimit_noti_ioctl,
+	.release	= rlimit_noti_release,
+};
+
+/*
+ * Allocate a fresh watch context and expose it through a new anonymous
+ * inode file descriptor.
+ *
+ * Returns the new fd on success, -ENOMEM when the context cannot be
+ * allocated, or the negative error reported by anon_inode_getfd().
+ */
+static int rlimit_noti_create_fd(void)
+{
+	struct rlimit_watch_fd_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	int fd;
+
+	if (!ctx)
+		return -ENOMEM;
+
+	kref_init(&ctx->kref);
+
+	/* Watcher bookkeeping. */
+	spin_lock_init(&ctx->noti_ctx_lock);
+	INIT_LIST_HEAD(&ctx->watchers);
+
+	/* Pending event queue and its waiters. */
+	spin_lock_init(&ctx->events_lock);
+	INIT_LIST_HEAD(&ctx->events);
+	init_waitqueue_head(&ctx->events_queue);
+
+	fd = anon_inode_getfd("rlimit_noti", &rlimit_noti_fops, ctx, 0);
+	if (fd < 0)
+		/* No file owns the context, so drop the reference here. */
+		kref_put(&ctx->kref, release_ctx);
+
+	return fd;
+}
+
+
+
+/******************************************************************************
+ * netlink part
+ ******************************************************************************/
+
+
+/*
+ * Private rlimit_noti network namespace index: the id passed to
+ * net_generic() to look up a namespace's struct rlimit_noti_net.
+ */
+static unsigned int rlimit_noti_net_id;
+
+/* Per-network-namespace private data of the rlimit-events netlink code. */
+struct rlimit_noti_net {
+	/* kernel-side netlink socket of this namespace */
+	struct sock *sk;
+};
+
+/* A netlink reply waiting to be sent by rlimit_noti_send_reply_thread(). */
+struct rlimit_noti_reply {
+	/* netlink port id of the requester the reply is unicast to */
+	__u32 portid;
+	/* namespace whose socket is used; a reference is held via get_net() */
+	struct net *net;
+	/* the fully built reply message */
+	struct sk_buff *skb;
+};
+
+/*
+ * Look up the rlimit-events netlink socket that belongs to @net.
+ *
+ * Returns NULL when @net is NULL, otherwise the socket stored in the
+ * namespace's struct rlimit_noti_net.
+ */
+static struct sock *rlimit_noti_get_socket(const struct net *net)
+{
+	struct rlimit_noti_net *priv;
+
+	if (net) {
+		priv = net_generic(net, rlimit_noti_net_id);
+		return priv->sk;
+	}
+
+	return NULL;
+}
+
+/*
+ * Build a netlink message of the given @type and sequence number @seq that
+ * carries a copy of @size bytes taken from @payload.
+ *
+ * Returns the new skb, or NULL when allocating the skb or adding the
+ * netlink header fails.
+ */
+static struct sk_buff *rlimit_noti_make_reply(int seq, int type,
+					      void *payload, int size)
+{
+	struct sk_buff *reply_skb = nlmsg_new(size, GFP_KERNEL);
+	struct nlmsghdr *hdr;
+
+	if (!reply_skb)
+		return NULL;
+
+	hdr = nlmsg_put(reply_skb, 0, seq, type, size, 0);
+	if (!hdr) {
+		kfree_skb(reply_skb);
+		return NULL;
+	}
+
+	memcpy(nlmsg_data(hdr), payload, size);
+
+	return reply_skb;
+}
+
+/*
+ * Kernel thread body: deliver one prepared reply and clean up after it.
+ *
+ * @arg is the struct rlimit_noti_reply prepared by rlimit_noti_send_reply().
+ * The thread drops the namespace reference and frees the reply descriptor.
+ * Always returns 0.
+ */
+static int rlimit_noti_send_reply_thread(void *arg)
+{
+	struct rlimit_noti_reply *reply = arg;
+	struct net *net = reply->net;
+
+	/*
+	 * The result is deliberately ignored: the socket's send timeout is
+	 * infinite, so a failure only means that the requester went away.
+	 */
+	netlink_unicast(rlimit_noti_get_socket(net), reply->skb,
+			reply->portid, 0);
+
+	put_net(net);
+	kfree(reply);
+
+	return 0;
+}
+
+/*
+ * Send a reply for @request_skb from a separate kernel thread.
+ *
+ * A message of the given @type and sequence number @seq carrying @size
+ * bytes of @payload is built and handed to rlimit_noti_send_reply_thread(),
+ * which unicasts it to the port id of the requester.
+ *
+ * On success ownership of the reply descriptor, the skb and the namespace
+ * reference passes to the thread.  Failures (allocation or thread creation)
+ * are not reported to the caller; everything acquired here is released.
+ */
+static void rlimit_noti_send_reply(struct sk_buff *request_skb, int seq,
+				   int type, void *payload, int size)
+{
+	struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+	struct sk_buff *skb;
+	struct task_struct *tsk;
+	struct rlimit_noti_reply *reply;
+
+	reply = kmalloc(sizeof(*reply), GFP_KERNEL);
+	if (!reply)
+		return;
+
+	skb = rlimit_noti_make_reply(seq, type, payload, size);
+	if (!skb)
+		goto out;
+
+	/* Keep the namespace alive until the thread has sent the reply. */
+	reply->net = get_net(net);
+	reply->portid = NETLINK_CB(request_skb).portid;
+	reply->skb = skb;
+
+	tsk = kthread_run(rlimit_noti_send_reply_thread, reply,
+			  "rlimit_noti_send_reply");
+	if (!IS_ERR(tsk))
+		return;
+
+	/*
+	 * The thread never ran, so nobody else will drop the namespace
+	 * reference taken above or free the skb.
+	 */
+	put_net(reply->net);
+	kfree_skb(skb);
+out:
+	kfree(reply);
+}
+
+/*
+ * Permission check for an incoming netlink request.
+ *
+ * Currently a stub: both arguments are ignored and 0 is always returned.
+ * The caller treats any non-zero return value as an error and aborts the
+ * request with it.
+ */
+static int rlimit_noti_netlink_ok(struct sk_buff *skb, u16 msg_type)
+{
+	/* TODO: put here some security and namespace checks */
+	return 0;
+}
+
+/*
+ * Handle a single netlink request.
+ *
+ * Only RLIMIT_GET_NOTI_FD is understood: a new monitor fd is created and
+ * its number is sent back to the requester.
+ *
+ * Returns 0 on success, the error from rlimit_noti_netlink_ok() or
+ * rlimit_noti_create_fd(), or -EINVAL for an unknown message type.
+ */
+static int rlimit_noti_receive_msg(struct sk_buff *skb,
+				   struct nlmsghdr *nl_header)
+{
+	u16 msg_type = nl_header->nlmsg_type;
+	u32 seq_nb = nl_header->nlmsg_seq;
+	int err;
+	int fd;
+
+	err = rlimit_noti_netlink_ok(skb, msg_type);
+	if (err)
+		return err;
+
+	if (msg_type != RLIMIT_GET_NOTI_FD)
+		return -EINVAL;
+
+	fd = rlimit_noti_create_fd();
+	if (fd < 0)
+		return fd;
+
+	rlimit_noti_send_reply(skb, seq_nb, RLIMIT_GET_NOTI_FD,
+			       &fd, sizeof(fd));
+
+	return 0;
+}
+
+/*
+ * Input callback of the kernel netlink socket: walk all netlink messages
+ * contained in @skb and process them one by one.
+ *
+ * An ack is sent for a message when handling it failed or when the sender
+ * asked for one with NLM_F_ACK.
+ */
+static void rlimit_noti_netlink_receive(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh;
+	int remaining = skb->len;
+	int err;
+
+	for (nlh = nlmsg_hdr(skb);
+	     nlmsg_ok(nlh, remaining);
+	     nlh = nlmsg_next(nlh, &remaining)) {
+		err = rlimit_noti_receive_msg(skb, nlh);
+
+		/* Successful and no ack requested: nothing to send back. */
+		if (!err && !(nlh->nlmsg_flags & NLM_F_ACK))
+			continue;
+
+		netlink_ack(skb, nlh, err, NULL);
+	}
+}
+
+/*
+ * Bind callback of the kernel netlink socket.
+ *
+ * Currently a stub: @net and @group are ignored and 0 is always returned.
+ */
+static int rlimit_noti_netlink_bind(struct net *net, int group)
+{
+	/* For now we allow everyone but maybe this should be limited? */
+	return 0;
+}
+
+/*
+ * Per-network-namespace init: create the kernel-side NETLINK_RLIMIT_EVENTS
+ * socket for @net and store it in the namespace's struct rlimit_noti_net.
+ *
+ * Returns 0 on success or -ENOMEM when the socket cannot be created.
+ */
+static int __net_init rlimit_noti_net_init(struct net *net)
+{
+	struct netlink_kernel_cfg cfg = {
+		.input	= rlimit_noti_netlink_receive,
+		.bind	= rlimit_noti_netlink_bind,
+		.flags	= NL_CFG_F_NONROOT_RECV,
+		.groups	= 1, /* Just one, the default */
+	};
+	struct rlimit_noti_net *rn_net = net_generic(net, rlimit_noti_net_id);
+
+	rn_net->sk = netlink_kernel_create(net, NETLINK_RLIMIT_EVENTS, &cfg);
+	if (!rn_net->sk) {
+		/* Terminate the message so it is not merged with later output */
+		printk(KERN_ERR
+		       "cannot initialize netlink socket in namespace\n");
+		return -ENOMEM;
+	}
+
+	/* Never time out on send; the reply thread relies on this. */
+	rn_net->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	return 0;
+}
+
+/*
+ * Per-network-namespace exit: release the netlink socket stored in the
+ * namespace's struct rlimit_noti_net.
+ */
+static void __net_exit rlimit_noti_net_exit(struct net *net)
+{
+	struct rlimit_noti_net *priv = net_generic(net, rlimit_noti_net_id);
+	struct sock *sk = priv->sk;
+
+	netlink_kernel_release(sk);
+}
+
+/*
+ * Per-network-namespace hooks: every namespace gets its own
+ * struct rlimit_noti_net, reachable through rlimit_noti_net_id.
+ */
+static struct pernet_operations rlimit_noti_net_ops __net_initdata = {
+	.id	= &rlimit_noti_net_id,
+	.size	= sizeof(struct rlimit_noti_net),
+	.init	= rlimit_noti_net_init,
+	.exit	= rlimit_noti_net_exit,
+};
+
+/*
+ * Boot-time setup: register the per-network-namespace operations.
+ * Returns the result of register_pernet_subsys().
+ */
+static int __init rlimit_noti_init(void)
+{
+	int err = register_pernet_subsys(&rlimit_noti_net_ops);
+
+	return err;
+}
+late_initcall(rlimit_noti_init);
+
+/*
+ * Unregister the per-network-namespace operations registered by
+ * rlimit_noti_init().
+ *
+ * NOTE(review): no module_exit()/exitcall reference to this function is
+ * visible in this part of the file - confirm that it is actually wired up.
+ */
+static void __exit rlimit_noti_exit(void)
+{
+	unregister_pernet_subsys(&rlimit_noti_net_ops);
+}