diff mbox

[net-next,RFC,v2,4/9] bpf, security: Add Checmate security LSM and BPF program type

Message ID 20160829114705.GA20877@ircssh.c.rugged-nimbus-611.internal (mailing list archive)
State New, archived
Headers show

Commit Message

Sargun Dhillon Aug. 29, 2016, 11:47 a.m. UTC
This patch adds a minor LSM, Checmate. Checmate is a flexible programmable,
extensible minor LSM that's coupled with cgroups and BPF. It is designed to
enforce container-specific policies. It is also a cgroupv2 controller. By
itself, it doesn't do very much, but in conjunction with an orchestrator
complex policies can be installed on the cgroup hierarchy.

These cgroup programs are tied to the kernel ABI version. If one tries
to load a BPF program compiled against a different kernel version,
an error will be thrown.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
 include/linux/cgroup_subsys.h    |   4 +
 include/linux/checmate.h         | 108 +++++++
 include/uapi/linux/bpf.h         |   1 +
 kernel/bpf/syscall.c             |   2 +-
 security/Kconfig                 |   1 +
 security/Makefile                |   2 +
 security/checmate/Kconfig        |  11 +
 security/checmate/Makefile       |   3 +
 security/checmate/checmate_bpf.c |  68 +++++
 security/checmate/checmate_lsm.c | 610 +++++++++++++++++++++++++++++++++++++++
 10 files changed, 809 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/checmate.h
 create mode 100644 security/checmate/Kconfig
 create mode 100644 security/checmate/Makefile
 create mode 100644 security/checmate/checmate_bpf.c
 create mode 100644 security/checmate/checmate_lsm.c

Comments

Tejun Heo Aug. 29, 2016, 5:01 p.m. UTC | #1
Hello,

On Mon, Aug 29, 2016 at 04:47:07AM -0700, Sargun Dhillon wrote:
> This patch adds a minor LSM, Checmate. Checmate is a flexible programmable,
> extensible minor LSM that's coupled with cgroups and BPF. It is designed to
> enforce container-specific policies. It is also a cgroupv2 controller. By
> itself, it doesn't do very much, but in conjunction with a orchestrator
> complex policies can be installed on the cgroup hierarchy.
> 
> These cgroup programs are tied to the kernel ABI version. If one tries
> to load a BPF program compiled against a different kernel version,
> an error will be thrown.

First of all, please talk with people working on network cgroup bpf
and landlock.  I don't think it's a good idea to have N different ways
to implement cgroup-aware bpf mechanism.  There can be multiple
consumers but there gotta be a common mechanism instead of several
independent controllers.

> diff --git a/include/linux/checmate.h b/include/linux/checmate.h
> new file mode 100644
> index 0000000..4c4db4a
> --- /dev/null
> +++ b/include/linux/checmate.h
> @@ -0,0 +1,108 @@
> +#ifndef _LINUX_CHECMATE_H_
> +#define _LINUX_CHECMATE_H_ 1
> +#include <linux/security.h>
> +
> +enum checmate_hook_num {
> +	/* CONFIG_SECURITY_NET hooks */
> +	CHECMATE_HOOK_UNIX_STREAM_CONNECT,
> +	CHECMATE_HOOK_UNIX_MAY_SEND,
> +	CHECMATE_HOOK_SOCKET_CREATE,
> +	CHECMATE_HOOK_SOCKET_POST_CREATE,
> +	CHECMATE_HOOK_SOCKET_BIND,
> +	CHECMATE_HOOK_SOCKET_CONNECT,
> +	CHECMATE_HOOK_SOCKET_LISTEN,
> +	CHECMATE_HOOK_SOCKET_ACCEPT,
> +	CHECMATE_HOOK_SOCKET_SENDMSG,
> +	CHECMATE_HOOK_SOCKET_RECVMSG,
> +	CHECMATE_HOOK_SOCKET_GETSOCKNAME,
> +	CHECMATE_HOOK_SOCKET_GETPEERNAME,
> +	CHECMATE_HOOK_SOCKET_GETSOCKOPT,
> +	CHECMATE_HOOK_SOCKET_SETSOCKOPT,
> +	CHECMATE_HOOK_SOCKET_SHUTDOWN,
> +	CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
> +	CHECMATE_HOOK_SK_FREE_SECURITY,
> +	__CHECMATE_HOOK_MAX,
> +};

Do we really want a separate hook for each call?  A logical extension
of this would be having a separate hook per syscall which feels kinda
horrible.

> +/* CONFIG_SECURITY_NET contexts */
> +struct checmate_unix_stream_connect_ctx {
> +	struct sock *sock;
> +	struct sock *other;
> +	struct sock *newsk;
> +};
...
> +struct checmate_sk_free_security_ctx {
> +	struct sock *sk;
> +};
...
> +struct checmate_ctx {
> +	int hook;
> +	union {
> +/* CONFIG_SECURITY_NET contexts */
> +		struct checmate_unix_stream_connect_ctx	unix_stream_connect;
> +		struct checmate_unix_may_send_ctx	unix_may_send;
> +		struct checmate_socket_create_ctx	socket_create;
> +		struct checmate_socket_bind_ctx		socket_bind;
> +		struct checmate_socket_connect_ctx	socket_connect;
> +		struct checmate_socket_listen_ctx	socket_listen;
> +		struct checmate_socket_accept_ctx	socket_accept;
> +		struct checmate_socket_sendmsg_ctx	socket_sendmsg;
> +		struct checmate_socket_recvmsg_ctx	socket_recvmsg;
> +		struct checmate_socket_sock_rcv_skb_ctx	socket_sock_rcv_skb;
> +		struct checmate_sk_free_security_ctx	sk_free_security;
> +	};
> +};

I'm not convinced about the approach.  It's an approach which pretty
much requires future extensions while being rigid.  Not a good
combination.

> +/*

Please use /** for function comments.

> + * checmate_instance_cleanup_rcu - Cleans up a Checmate program instance
> + * @rp: rcu_head pointer to a Checmate instance
> + */
> +static void checmate_instance_cleanup_rcu(struct rcu_head *rp)
> +{
> +	struct checmate_instance *instance;
> +
> +	instance = container_of(rp, struct checmate_instance, rcu);
> +	bpf_prog_put(instance->prog);
> +	kfree(instance);
> +}
> +static struct cftype checmate_files[] = {
> +#ifdef CONFIG_SECURITY_NETWORK
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_UNIX_STREAM_CONNECT,
> +			"unix_stream_connect"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_UNIX_MAY_SEND,
> +			"unix_may_send"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_CREATE, "socket_create"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_BIND, "socket_bind"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_CONNECT, "socket_connect"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_LISTEN, "socket_listen"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_ACCEPT, "socket_accept"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SENDMSG, "socket_sendmsg"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_RECVMSG, "socket_recvmsg"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SHUTDOWN, "socket_shutdown"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
> +			"socket_sock_rcv_skb"),
> +	CHECMATE_CFTYPE(CHECMATE_HOOK_SK_FREE_SECURITY, "sk_free_security"),
> +#endif /* CONFIG_SECURITY_NETWORK */
> +	{}
> +};

I don't think this is a good interface approach.

> +struct cgroup_subsys checmate_cgrp_subsys = {
> +	.css_alloc	= checmate_css_alloc,
> +	.css_free	= checmate_css_free,
> +	.dfl_cftypes	= checmate_files,
> +};

Unless this is properly delegatable, IOW, it's safe to fully delegate
to a lesser security domain for all operations including program
loading and assignment (I can't see how that'd be the case), making it
an explicit controller doesn't work in terms of userland interface.
It's fine for bpf / lsm / whatever to attach to cgroups by extending
struct cgroup itself or implementing an implicit controller but to be
visible as an explicit controller it must be able to follow cgroup
interface rules including delegation.  If not, it's best to come
through the interface which enforces the required permission checks
and then talk to cgroup from there.  This was also an issue with
network cgroup bpf programs that Daniel Mack is working on.  Please
chat with him.

> +static struct cgroup_subsys_state *css_from_sk(struct sock *sk)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct cgroup *cgrp;
> +
> +	if (!sk_fullsock(sk))
> +		return ERR_PTR(-EINVAL);
> +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> +
> +	rcu_read_lock();
> +	do {
> +		css = rcu_dereference(cgrp->subsys[checmate_cgrp_id]);
> +		if (css)
> +			goto out;
> +		cgrp = cgroup_parent(cgrp);
> +	} while (cgrp);
> +
> +out:
> +	rcu_read_unlock();
> +
> +	return css;

This pattern cannot be right.  What protects the returned @css?

> +static struct cgroup_subsys_state *css_from_current(void)
> +{
> +	struct cgroup_subsys_state *css;
> +
> +	if (unlikely(in_interrupt()))
> +		return ERR_PTR(-ENOENT);
> +
> +	rcu_read_lock();
> +	css = task_css(current, checmate_cgrp_id);
> +	rcu_read_unlock();
> +
> +	return css;

Ditto here.  RCU readlock is what was protecting @css.  Returning @css
after releasing RCU readlock makes no sense.

Thanks.
Sargun Dhillon Aug. 29, 2016, 6:49 p.m. UTC | #2
On Mon, Aug 29, 2016 at 01:01:18PM -0400, Tejun Heo wrote:
> Hello,
> 
> On Mon, Aug 29, 2016 at 04:47:07AM -0700, Sargun Dhillon wrote:
> > This patch adds a minor LSM, Checmate. Checmate is a flexible programmable,
> > extensible minor LSM that's coupled with cgroups and BPF. It is designed to
> > enforce container-specific policies. It is also a cgroupv2 controller. By
> > itself, it doesn't do very much, but in conjunction with a orchestrator
> > complex policies can be installed on the cgroup hierarchy.
> > 
> > These cgroup programs are tied to the kernel ABI version. If one tries
> > to load a BPF program compiled against a different kernel version,
> > an error will be thrown.
> 
> First of all, please talk with people working on network cgroup bpf
> and landlock.  I don't think it's a good idea to have N different ways
> to implement cgroup-aware bpf mechanism.  There can be multiple
> consumers but there gotta be a common mechanism instead of several
> independent controllers.
>
I've talked to Daniel Mack, and Alexei. I agree with you that it makes sense not 
to have an infinite number of these cgroup + bpf + lsm subsystems in the kernel. 
I think that making sure we don't sacrifice capability is important.

> > diff --git a/include/linux/checmate.h b/include/linux/checmate.h
> > new file mode 100644
> > index 0000000..4c4db4a
> > --- /dev/null
> > +++ b/include/linux/checmate.h
> > @@ -0,0 +1,108 @@
> > +#ifndef _LINUX_CHECMATE_H_
> > +#define _LINUX_CHECMATE_H_ 1
> > +#include <linux/security.h>
> > +
> > +enum checmate_hook_num {
> > +	/* CONFIG_SECURITY_NET hooks */
> > +	CHECMATE_HOOK_UNIX_STREAM_CONNECT,
> > +	CHECMATE_HOOK_UNIX_MAY_SEND,
> > +	CHECMATE_HOOK_SOCKET_CREATE,
> > +	CHECMATE_HOOK_SOCKET_POST_CREATE,
> > +	CHECMATE_HOOK_SOCKET_BIND,
> > +	CHECMATE_HOOK_SOCKET_CONNECT,
> > +	CHECMATE_HOOK_SOCKET_LISTEN,
> > +	CHECMATE_HOOK_SOCKET_ACCEPT,
> > +	CHECMATE_HOOK_SOCKET_SENDMSG,
> > +	CHECMATE_HOOK_SOCKET_RECVMSG,
> > +	CHECMATE_HOOK_SOCKET_GETSOCKNAME,
> > +	CHECMATE_HOOK_SOCKET_GETPEERNAME,
> > +	CHECMATE_HOOK_SOCKET_GETSOCKOPT,
> > +	CHECMATE_HOOK_SOCKET_SETSOCKOPT,
> > +	CHECMATE_HOOK_SOCKET_SHUTDOWN,
> > +	CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
> > +	CHECMATE_HOOK_SK_FREE_SECURITY,
> > +	__CHECMATE_HOOK_MAX,
> > +};
> 
> Do we really want a separate hook for each call?  A logical extension
> of this would be having a separate hook per syscall which feels kinda
> horrible.
> 
It would be a separate hook per LSM hook. Why wouldn't we want a separate bpf 
hook per lsm hook? I think if one program has to handle them all, the first 
program would be looking up the hook program in a bpf prog array. If you think 
it's better to have this logic in the BPF program, that makes sense. 

I had a version of this patch that allowed you to attach a prog array instead, 
but I think that it's cleaner attaching a program per lsm hook. In addition, 
there's a performance impact that comes from these hooks, so I wouldn't want to 
execute unnecessary code if it's avoidable.

The prog array approach also makes stacking filters difficult. If people want 
multiple filters per hook, the orchestrator would have to rewrite the existing 
filters to be cooperative.

> > +/* CONFIG_SECURITY_NET contexts */
> > +struct checmate_unix_stream_connect_ctx {
> > +	struct sock *sock;
> > +	struct sock *other;
> > +	struct sock *newsk;
> > +};
> ...
> > +struct checmate_sk_free_security_ctx {
> > +	struct sock *sk;
> > +};
> ...
> > +struct checmate_ctx {
> > +	int hook;
> > +	union {
> > +/* CONFIG_SECURITY_NET contexts */
> > +		struct checmate_unix_stream_connect_ctx	unix_stream_connect;
> > +		struct checmate_unix_may_send_ctx	unix_may_send;
> > +		struct checmate_socket_create_ctx	socket_create;
> > +		struct checmate_socket_bind_ctx		socket_bind;
> > +		struct checmate_socket_connect_ctx	socket_connect;
> > +		struct checmate_socket_listen_ctx	socket_listen;
> > +		struct checmate_socket_accept_ctx	socket_accept;
> > +		struct checmate_socket_sendmsg_ctx	socket_sendmsg;
> > +		struct checmate_socket_recvmsg_ctx	socket_recvmsg;
> > +		struct checmate_socket_sock_rcv_skb_ctx	socket_sock_rcv_skb;
> > +		struct checmate_sk_free_security_ctx	sk_free_security;
> > +	};
> > +};
> 
> I'm not convinced about the approach.  It's an approach which pretty
> much requires future extensions while being rigid.  Not a good
> combination.
> 
Do you have an alternative recommendation? Maybe just a set of 5 u64s
as the context object along with the hook ID?

> > +/*
> 
> Please use /** for function comments.
> 
> > + * checmate_instance_cleanup_rcu - Cleans up a Checmate program instance
> > + * @rp: rcu_head pointer to a Checmate instance
> > + */
> > +static void checmate_instance_cleanup_rcu(struct rcu_head *rp)
> > +{
> > +	struct checmate_instance *instance;
> > +
> > +	instance = container_of(rp, struct checmate_instance, rcu);
> > +	bpf_prog_put(instance->prog);
> > +	kfree(instance);
> > +}
> > +static struct cftype checmate_files[] = {
> > +#ifdef CONFIG_SECURITY_NETWORK
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_UNIX_STREAM_CONNECT,
> > +			"unix_stream_connect"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_UNIX_MAY_SEND,
> > +			"unix_may_send"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_CREATE, "socket_create"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_BIND, "socket_bind"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_CONNECT, "socket_connect"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_LISTEN, "socket_listen"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_ACCEPT, "socket_accept"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SENDMSG, "socket_sendmsg"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_RECVMSG, "socket_recvmsg"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SHUTDOWN, "socket_shutdown"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
> > +			"socket_sock_rcv_skb"),
> > +	CHECMATE_CFTYPE(CHECMATE_HOOK_SK_FREE_SECURITY, "sk_free_security"),
> > +#endif /* CONFIG_SECURITY_NETWORK */
> > +	{}
> > +};
> 
> I don't think this is a good interface approach.
> 
Any other recommendations?
> > +struct cgroup_subsys checmate_cgrp_subsys = {
> > +	.css_alloc	= checmate_css_alloc,
> > +	.css_free	= checmate_css_free,
> > +	.dfl_cftypes	= checmate_files,
> > +};
> 
> Unless this is properly delegatable, IOW, it's safe to fully delegate
> to a lesser security domain for all operations including program
> loading and assignment (I can't see how that'd be the case), making it
> an explicit controller doens't work in terms of userland interface.
> It's fine for bpf / lsm / whatever to attach to cgroups by extending
> struct cgroup itself or implementing an implicit controller but to be
> visible as an explicit controller it must be able to follow cgroup
> interface rules including delegation.  If not, it's best to come
> through the interface which enforces the required permission checks
> and then talk to cgroup from there.  This was also an issue with
> network cgroup bpf programs that Daniel Mack is working on.  Please
> chat with him.
>
Program assignment is possible by lesser security domains. Program loading is 
limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to 
CAP_SYS_ADMIN in any userns, but the reasoning behind this is that Checmate
BPF programs can leak kernel pointers. 

Could we potentially restrict it to only CAP_MAC_OVERRIDE, while still meeting 
cgroup delegation requirements?

Filters which are higher up in the hierarchy will still be enforced during
delegation. This was an explicit design, as the "Orchestrator in Orchestrator" 
use case needs to be supported.

> > +static struct cgroup_subsys_state *css_from_sk(struct sock *sk)
> > +{
> > +	struct cgroup_subsys_state *css;
> > +	struct cgroup *cgrp;
> > +
> > +	if (!sk_fullsock(sk))
> > +		return ERR_PTR(-EINVAL);
> > +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> > +
> > +	rcu_read_lock();
> > +	do {
> > +		css = rcu_dereference(cgrp->subsys[checmate_cgrp_id]);
> > +		if (css)
> > +			goto out;
> > +		cgrp = cgroup_parent(cgrp);
> > +	} while (cgrp);
> > +
> > +out:
> > +	rcu_read_unlock();
> > +
> > +	return css;
> 
> This pattern cannot be right.  What protects the returned @css?
>
Thanks, I'll fix this and keep a reference. 
> > +static struct cgroup_subsys_state *css_from_current(void)
> > +{
> > +	struct cgroup_subsys_state *css;
> > +
> > +	if (unlikely(in_interrupt()))
> > +		return ERR_PTR(-ENOENT);
> > +
> > +	rcu_read_lock();
> > +	css = task_css(current, checmate_cgrp_id);
> > +	rcu_read_unlock();
> > +
> > +	return css;
> 
> Ditto here.  RCU readlock is what was protecting @css.  Returning @css
> after releasing RCU readlock makes no sense.
> 
> Thanks.
> 
> -- 
> tejun
Thanks for your feedback Tejun. I'll look at making fixes to the above, and 
touch base with Daniel, and Alexei.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo Aug. 29, 2016, 7:24 p.m. UTC | #3
Hello, Sargun.

On Mon, Aug 29, 2016 at 11:49:07AM -0700, Sargun Dhillon wrote:
> It would be a separate hook per LSM hook. Why wouldn't we want a separate bpf 
> hook per lsm hook? I think if one program has to handle them all, the first 
> program would be looking up the hook program in a bpf prog array. If you think 
> it's better to have this logic in the BPF program, that makes sense. 
> 
> I had a version of this patch that allowed you to attach a prog array instead, 
> but I think that it's cleaner attaching a program per lsm hook. In addition, 
> there's a performance impact that comes from these hooks, so I wouldn't want to 
> execute unneccessary code if it's avoidable.

Hmm... it doesn't really matter how the backend part looks like and if
we need to implement per-call hooks to lower runtime overhead, sure.
I was mostly worried about the approach propagating through the
userland visible interface.

> The prog array approach also makes stacking filters difficult. If people want 
> multiple filters per hook, the orchestrator would have to rewrite the existing 
> filters to be cooperative.

I'm not really sure "stacking" in the kernel side is a good idea.
Please see below.

> > I'm not convinced about the approach.  It's an approach which pretty
> > much requires future extensions while being rigid.  Not a good
> > combination.
>
> Do you have an alternative recommendation? Maybe just a set of 5 u64s
> as the context object along with the hook ID?

cgroup fs doesn't seem like the right interface for this but if it
were I'd go for named hook IDs instead of opaque numbers.

> > Unless this is properly delegatable, IOW, it's safe to fully delegate
> > to a lesser security domain for all operations including program
> > loading and assignment (I can't see how that'd be the case), making it
> > an explicit controller doens't work in terms of userland interface.
> > It's fine for bpf / lsm / whatever to attach to cgroups by extending
> > struct cgroup itself or implementing an implicit controller but to be
> > visible as an explicit controller it must be able to follow cgroup
> > interface rules including delegation.  If not, it's best to come
> > through the interface which enforces the required permission checks
> > and then talk to cgroup from there.  This was also an issue with
> > network cgroup bpf programs that Daniel Mack is working on.  Please
> > chat with him.
>
> Program assignment is possible by lesser security domains. Program loading is 
> limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to 
> CAP_SYS_ADMIN in any userns, but it the reasoning behind this is that Checmate
> BPF programs can leak kernel pointers. 

That doesn't make much sense to me.  Delegation doesn't mean much if a
delegatee can't load its own program (and I don't see how one can
delegate kernel pointer access to !root).  Also, unless there's
per-program fine control on who can load it, it seems pretty dangerous
to let anyone load any program.

> Could we potentially restrict it to only CAP_MAC_OVERRIDE, while still meeting 
> cgroup delegation requirements?

Wouldn't it make far more sense to pass cgroup fd to bpf syscall so
that "load this program" and "attach this program to the cgroup
identified by this fd" go through the same interface and permission
checks?  cgroup participating in bpf operations is all fine but
splitting the userland interface across two domains seems like a bad
idea.

> Filters which are higher up in the heirarchy will still be enforced during 
> delegation. This was an explicit design, as the "Orchestrator in Orchestrator" 
> use case needs to be supported.

Given that program loading is restricted to root, wouldn't it be a
much more efficient approach to let userland multiplex multiple
programs?  Walking the tree executing bpf programs each time one of
these operations runs can be pretty expensive.  Imagine a tree like
the following.

	A - B - C
	      \ D

Let's say program is currently loaded on D.  If someone wants to add a
program on B, userland can load the program on B, combine B's and D's
program and compile them into a single program and load it on D.  The
only thing kernel would need to do in terms of hierarchy is finding
what's the closest program to execute.  In the above example, C would
need to use B's program and that can be determined on program
assignment time rather than on each operation.

Thanks.
Alexei Starovoitov Aug. 29, 2016, 9:49 p.m. UTC | #4
On 8/29/16 12:24 PM, Tejun Heo wrote:
> Hello, Sargun.
>
> On Mon, Aug 29, 2016 at 11:49:07AM -0700, Sargun Dhillon wrote:
>> It would be a separate hook per LSM hook. Why wouldn't we want a separate bpf
>> hook per lsm hook? I think if one program has to handle them all, the first
>> program would be looking up the hook program in a bpf prog array. If you think
>> it's better to have this logic in the BPF program, that makes sense.
>>
>> I had a version of this patch that allowed you to attach a prog array instead,
>> but I think that it's cleaner attaching a program per lsm hook. In addition,
>> there's a performance impact that comes from these hooks, so I wouldn't want to
>> execute unneccessary code if it's avoidable.
>
> Hmm... it doesn't really matter how the backend part looks like and if
> we need to implement per-call hooks to lower runtime overhead, sure.
> I was mostly worried about the approach propagating through the
> userland visible interface.
>
>> The prog array approach also makes stacking filters difficult. If people want
>> multiple filters per hook, the orchestrator would have to rewrite the existing
>> filters to be cooperative.
>
> I'm not really sure "stacking" in the kernel side is a good idea.
> Please see below.
>
>>> I'm not convinced about the approach.  It's an approach which pretty
>>> much requires future extensions while being rigid.  Not a good
>>> combination.
>>
>> Do you have an alternative recommendation? Maybe just a set of 5 u64s
>> as the context object along with the hook ID?
>
> cgroup fs doesn't seem like the right interface for this but if it
> were I'd go for named hook IDs instead of opaque numbers.
>
>>> Unless this is properly delegatable, IOW, it's safe to fully delegate
>>> to a lesser security domain for all operations including program
>>> loading and assignment (I can't see how that'd be the case), making it
>>> an explicit controller doens't work in terms of userland interface.
>>> It's fine for bpf / lsm / whatever to attach to cgroups by extending
>>> struct cgroup itself or implementing an implicit controller but to be
>>> visible as an explicit controller it must be able to follow cgroup
>>> interface rules including delegation.  If not, it's best to come
>>> through the interface which enforces the required permission checks
>>> and then talk to cgroup from there.  This was also an issue with
>>> network cgroup bpf programs that Daniel Mack is working on.  Please
>>> chat with him.
>>
>> Program assignment is possible by lesser security domains. Program loading is
>> limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to
>> CAP_SYS_ADMIN in any userns, but it the reasoning behind this is that Checmate
>> BPF programs can leak kernel pointers.
>
> That doesn't make much sense to me.  Delegation doesn't mean much if a
> delegatee can't load its own program (and I don't see how one can
> delegate kernel pointer access to !root).  Also, unless there's
> per-program fine control on who can load it, it seems pretty dangerous
> to let anyone load any program.
>
>> Could we potentially restrict it to only CAP_MAC_OVERRIDE, while still meeting
>> cgroup delegation requirements?
>
> Wouldn't it make far more sense to pass cgroup fd to bpf syscall so
> that "load this program" and "attach this program to the cgroup
> identified by this fd" through the same interface and permission
> checks?  cgroup participating in bpf operations is all fine but
> splitting the userland interface across two domains seems like a bad
> idea.
>
>> Filters which are higher up in the heirarchy will still be enforced during
>> delegation. This was an explicit design, as the "Orchestrator in Orchestrator"
>> use case needs to be supported.
>
> Given that program loading is restricted to root, wouldn't it be an a
> lot more efficient approach to let userland multiplex multiple
> programs?  Walking the tree executing bpf programs each time one of
> these operations runs can be pretty expensive.  Imagine a tree like
> the following.
>
> 	A - B - C
> 	      \ D
>
> Let's say program is currently loaded on D.  If someone wants to add a
> program on B, userland can load the program on B, combine B's and D's
> program and compile them into a single program and load it on D.  The
> only thing kernel would need to do in terms of hierarchy is finding
> what's the closest program to execute.  In the above example, C would
> need to use B's program and that can be determined on program
> assignment time rather than on each operation.

I think that's exactly what Daniel's patches are doing and imo
it makes sense to keep this style for lsm as well
and also apply the concept of hook_id.
Daniel adds two commands to bpf syscall to attach/detach from cgroup
with hook_id.
Initially two hooks will be for socket rx and tx.
Then all interesting lsm hooks can be added one by one.
Daniel's prog type will be bpf_prog_type_cgroup_socket_filter.
LSM's prog type will be bpf_prog_type_lsm.
And verifier can check type safety since the lsm hook_id will be
passed at the program load time.
See another thread we had with Mickael.

landlock and checmate are very similar and should really be
single lsm as long as we agree that both are cgroup based.
The main difference between the two:
- landlock is proposing unprivileged mode
- checmate is proposing writing into arguments from the program
These differences can be flags/options to one lsm.
Implementations of course are different so far, but
instead of arguing landlock vs checmate, I'd like us
to focus on how we can make one lsm that solves all use cases.

--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mickaël Salaün Aug. 29, 2016, 10:19 p.m. UTC | #5
On 29/08/2016 23:49, Alexei Starovoitov wrote:
> On 8/29/16 12:24 PM, Tejun Heo wrote:
>> Hello, Sargun.
>>
>> On Mon, Aug 29, 2016 at 11:49:07AM -0700, Sargun Dhillon wrote:
>>> It would be a separate hook per LSM hook. Why wouldn't we want a
>>> separate bpf
>>> hook per lsm hook? I think if one program has to handle them all, the
>>> first
>>> program would be looking up the hook program in a bpf prog array. If
>>> you think
>>> it's better to have this logic in the BPF program, that makes sense.
>>>
>>> I had a version of this patch that allowed you to attach a prog array
>>> instead,
>>> but I think that it's cleaner attaching a program per lsm hook. In
>>> addition,
>>> there's a performance impact that comes from these hooks, so I
>>> wouldn't want to
>>> execute unneccessary code if it's avoidable.
>>
>> Hmm... it doesn't really matter how the backend part looks like and if
>> we need to implement per-call hooks to lower runtime overhead, sure.
>> I was mostly worried about the approach propagating through the
>> userland visible interface.
>>
>>> The prog array approach also makes stacking filters difficult. If
>>> people want
>>> multiple filters per hook, the orchestrator would have to rewrite the
>>> existing
>>> filters to be cooperative.
>>
>> I'm not really sure "stacking" in the kernel side is a good idea.
>> Please see below.
>>
>>>> I'm not convinced about the approach.  It's an approach which pretty
>>>> much requires future extensions while being rigid.  Not a good
>>>> combination.
>>>
>>> Do you have an alternative recommendation? Maybe just a set of 5 u64s
>>> as the context object along with the hook ID?
>>
>> cgroup fs doesn't seem like the right interface for this but if it
>> were I'd go for named hook IDs instead of opaque numbers.
>>
>>>> Unless this is properly delegatable, IOW, it's safe to fully delegate
>>>> to a lesser security domain for all operations including program
>>>> loading and assignment (I can't see how that'd be the case), making it
>>>> an explicit controller doens't work in terms of userland interface.
>>>> It's fine for bpf / lsm / whatever to attach to cgroups by extending
>>>> struct cgroup itself or implementing an implicit controller but to be
>>>> visible as an explicit controller it must be able to follow cgroup
>>>> interface rules including delegation.  If not, it's best to come
>>>> through the interface which enforces the required permission checks
>>>> and then talk to cgroup from there.  This was also an issue with
>>>> network cgroup bpf programs that Daniel Mack is working on.  Please
>>>> chat with him.
>>>
>>> Program assignment is possible by lesser security domains. Program
>>> loading is
>>> limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to
>>> CAP_SYS_ADMIN in any userns, but it the reasoning behind this is that
>>> Checmate
>>> BPF programs can leak kernel pointers.
>>
>> That doesn't make much sense to me.  Delegation doesn't mean much if a
>> delegatee can't load its own program (and I don't see how one can
>> delegate kernel pointer access to !root).  Also, unless there's
>> per-program fine control on who can load it, it seems pretty dangerous
>> to let anyone load any program.
>>
>>> Could we potentially restrict it to only CAP_MAC_OVERRIDE, while
>>> still meeting
>>> cgroup delegation requirements?
>>
>> Wouldn't it make far more sense to pass cgroup fd to bpf syscall so
>> that "load this program" and "attach this program to the cgroup
>> identified by this fd" through the same interface and permission
>> checks?  cgroup participating in bpf operations is all fine but
>> splitting the userland interface across two domains seems like a bad
>> idea.
>>
>>> Filters which are higher up in the heirarchy will still be enforced
>>> during
>>> delegation. This was an explicit design, as the "Orchestrator in
>>> Orchestrator"
>>> use case needs to be supported.
>>
>> Given that program loading is restricted to root, wouldn't it be an a
>> lot more efficient approach to let userland multiplex multiple
>> programs?  Walking the tree executing bpf programs each time one of
>> these operations runs can be pretty expensive.  Imagine a tree like
>> the following.
>>
>>     A - B - C
>>           \ D
>>
>> Let's say program is currently loaded on D.  If someone wants to add a
>> program on B, userland can load the program on B, combine B's and D's
>> program and compile them into a single program and load it on D.  The
>> only thing kernel would need to do in terms of hierarchy is finding
>> what's the closest program to execute.  In the above example, C would
>> need to use B's program and that can be determined on program
>> assignment time rather than on each operation.
> 
> I think that's exactly what Daniel's patches are doing and imo
> it makes sense to keep this style for lsm as well
> and also apply the concept of hook_id.
> Daniel adds two commands to bpf syscall to attach/detach from cgroup
> with hook_id.
> Initially two hooks will be for socket rx and tx.
> Then all interesting lsm hooks can be added one by one.
> Daniel's prog type will be bpf_prog_type_cgroup_socket_filter.
> LSM's prog type will be bpf_prog_type_lsm.
> And verifier can check type safety since the lsm hook_id will be
> passed at the program load time.
> See another thread we had with Mickael.
> 
> landlock and checmate are very similar and should really be
> single lsm as long as we agree that both are cgroup based.
> The main difference between the two:
> - landlock is proposing unpriveleged mode
> - checmate is proposing writing into arguments from the program
> These differences can be flags/options to one lsm.
> Implementations of course are different so far, but
> instead of arguing landlock vs checmate, I'd like us
> to focus on how we can make one lsm that solves all use cases.

Thanks for putting me in the loop. I agree that both approaches can
be combined and I'm working on a new RFC for Landlock in which it would
be possible to manage unprivileged and privileged eBPF programs
according to extra flags. Sargun's network manipulation and checks (from
Checmate) could then sit on top of it.

However, for this to work, I'm keeping the main Landlock design to be
able to manage unprivileged rules, which is a touchy part. The next RFC
will also contain cgroup handling thanks to Daniel Mack's
BPF_PROG_ATTACH feature.

Basically, the main constraints for an unprivileged LSM are:
* must use and check no_new_priv for all impacted processes, including
moving from and to a cgroup (which gets more complicated when dealing
with different privileged eBPF programs);
* must stack/append rules and prohibit removal (need to deal with
multiple processes and their different privileges/user).

Regards,
 Mickaël
Andy Lutomirski Aug. 29, 2016, 11:16 p.m. UTC | #6
On Aug 29, 2016 3:19 PM, "Mickaël Salaün" <mic@digikod.net> wrote:
>
>
> On 29/08/2016 23:49, Alexei Starovoitov wrote:
> > On 8/29/16 12:24 PM, Tejun Heo wrote:
> >> Hello, Sargun.
> >>
> >> On Mon, Aug 29, 2016 at 11:49:07AM -0700, Sargun Dhillon wrote:
> >>> It would be a separate hook per LSM hook. Why wouldn't we want a
> >>> separate bpf
> >>> hook per lsm hook? I think if one program has to handle them all, the
> >>> first
> >>> program would be looking up the hook program in a bpf prog array. If
> >>> you think
> >>> it's better to have this logic in the BPF program, that makes sense.
> >>>
> >>> I had a version of this patch that allowed you to attach a prog array
> >>> instead,
> >>> but I think that it's cleaner attaching a program per lsm hook. In
> >>> addition,
> >>> there's a performance impact that comes from these hooks, so I
> >>> wouldn't want to
> >>> execute unneccessary code if it's avoidable.
> >>
> >> Hmm... it doesn't really matter how the backend part looks like and if
> >> we need to implement per-call hooks to lower runtime overhead, sure.
> >> I was mostly worried about the approach propagating through the
> >> userland visible interface.
> >>
> >>> The prog array approach also makes stacking filters difficult. If
> >>> people want
> >>> multiple filters per hook, the orchestrator would have to rewrite the
> >>> existing
> >>> filters to be cooperative.
> >>
> >> I'm not really sure "stacking" in the kernel side is a good idea.
> >> Please see below.
> >>
> >>>> I'm not convinced about the approach.  It's an approach which pretty
> >>>> much requires future extensions while being rigid.  Not a good
> >>>> combination.
> >>>
> >>> Do you have an alternative recommendation? Maybe just a set of 5 u64s
> >>> as the context object along with the hook ID?
> >>
> >> cgroup fs doesn't seem like the right interface for this but if it
> >> were I'd go for named hook IDs instead of opaque numbers.
> >>
> >>>> Unless this is properly delegatable, IOW, it's safe to fully delegate
> >>>> to a lesser security domain for all operations including program
> >>>> loading and assignment (I can't see how that'd be the case), making it
> >>>> an explicit controller doens't work in terms of userland interface.
> >>>> It's fine for bpf / lsm / whatever to attach to cgroups by extending
> >>>> struct cgroup itself or implementing an implicit controller but to be
> >>>> visible as an explicit controller it must be able to follow cgroup
> >>>> interface rules including delegation.  If not, it's best to come
> >>>> through the interface which enforces the required permission checks
> >>>> and then talk to cgroup from there.  This was also an issue with
> >>>> network cgroup bpf programs that Daniel Mack is working on.  Please
> >>>> chat with him.
> >>>
> >>> Program assignment is possible by lesser security domains. Program
> >>> loading is
> >>> limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to
> >>> CAP_SYS_ADMIN in any userns, but it the reasoning behind this is that
> >>> Checmate
> >>> BPF programs can leak kernel pointers.
> >>
> >> That doesn't make much sense to me.  Delegation doesn't mean much if a
> >> delegatee can't load its own program (and I don't see how one can
> >> delegate kernel pointer access to !root).  Also, unless there's
> >> per-program fine control on who can load it, it seems pretty dangerous
> >> to let anyone load any program.
> >>
> >>> Could we potentially restrict it to only CAP_MAC_OVERRIDE, while
> >>> still meeting
> >>> cgroup delegation requirements?
> >>
> >> Wouldn't it make far more sense to pass cgroup fd to bpf syscall so
> >> that "load this program" and "attach this program to the cgroup
> >> identified by this fd" through the same interface and permission
> >> checks?  cgroup participating in bpf operations is all fine but
> >> splitting the userland interface across two domains seems like a bad
> >> idea.
> >>
> >>> Filters which are higher up in the heirarchy will still be enforced
> >>> during
> >>> delegation. This was an explicit design, as the "Orchestrator in
> >>> Orchestrator"
> >>> use case needs to be supported.
> >>
> >> Given that program loading is restricted to root, wouldn't it be an a
> >> lot more efficient approach to let userland multiplex multiple
> >> programs?  Walking the tree executing bpf programs each time one of
> >> these operations runs can be pretty expensive.  Imagine a tree like
> >> the following.
> >>
> >>     A - B - C
> >>           \ D
> >>
> >> Let's say program is currently loaded on D.  If someone wants to add a
> >> program on B, userland can load the program on B, combine B's and D's
> >> program and compile them into a single program and load it on D.  The
> >> only thing kernel would need to do in terms of hierarchy is finding
> >> what's the closest program to execute.  In the above example, C would
> >> need to use B's program and that can be determined on program
> >> assignment time rather than on each operation.
> >
> > I think that's exactly what Daniel's patches are doing and imo
> > it makes sense to keep this style for lsm as well
> > and also apply the concept of hook_id.
> > Daniel adds two commands to bpf syscall to attach/detach from cgroup
> > with hook_id.
> > Initially two hooks will be for socket rx and tx.
> > Then all interesting lsm hooks can be added one by one.
> > Daniel's prog type will be bpf_prog_type_cgroup_socket_filter.
> > LSM's prog type will be bpf_prog_type_lsm.
> > And verifier can check type safety since the lsm hook_id will be
> > passed at the program load time.
> > See another thread we had with Mickael.
> >
> > landlock and checmate are very similar and should really be
> > single lsm as long as we agree that both are cgroup based.
> > The main difference between the two:
> > - landlock is proposing unpriveleged mode
> > - checmate is proposing writing into arguments from the program
> > These differences can be flags/options to one lsm.
> > Implementations of course are different so far, but
> > instead of arguing landlock vs checmate, I'd like us
> > to focus on how we can make one lsm that solves all use cases.
>
> Thanks for putting me in the loop. I am agree that both approaches can
> be combined and I'm working on a new RFC for Landlock in which it would
> be possible to manage unprivileged and privileged eBPF programs
> according to extra flags. Sargun's network manipulation and checks (from
> Checmate) could then sit on top of it.
>
> However, for this to work, I'm keeping the main Landlock design to be
> able to manage unprivileged rules, which is a touchy part. The next RFC
> will also contains cgroup handling thanks to Daniel Mack's
> BPF_PROG_ATTACH feature.
>
> Basically, the main constraints for an unprivileged LSM are:
> * must use and check no_new_priv for all impacted processes, including
> moving from and to a cgroup (which get more complicated when dealing
> with different privileged eBPF programs);

...which is vastly simplified if you just don't let the unprivileged
stuff interact with cgroups.  If the only use case so far is to add
restrictions to unsuspecting processes, I would suggest addressing
that differently (LD_PRELOAD, new syscall, or simply don't support
it).  I still feel like this is a lot of effort to go through to try
to get cgroups and unprivileged sandboxing to play nice together, and
I'm not yet seeing the point.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sargun Dhillon Aug. 30, 2016, 4:25 a.m. UTC | #7
On Mon, Aug 29, 2016 at 02:49:17PM -0700, Alexei Starovoitov wrote:
> On 8/29/16 12:24 PM, Tejun Heo wrote:
> >Hello, Sargun.
> >
> >On Mon, Aug 29, 2016 at 11:49:07AM -0700, Sargun Dhillon wrote:
> >>It would be a separate hook per LSM hook. Why wouldn't we want a separate bpf
> >>hook per lsm hook? I think if one program has to handle them all, the first
> >>program would be looking up the hook program in a bpf prog array. If you think
> >>it's better to have this logic in the BPF program, that makes sense.
> >>
> >>I had a version of this patch that allowed you to attach a prog array instead,
> >>but I think that it's cleaner attaching a program per lsm hook. In addition,
> >>there's a performance impact that comes from these hooks, so I wouldn't want to
> >>execute unneccessary code if it's avoidable.
> >
> >Hmm... it doesn't really matter how the backend part looks like and if
> >we need to implement per-call hooks to lower runtime overhead, sure.
> >I was mostly worried about the approach propagating through the
> >userland visible interface.
> >
> >>The prog array approach also makes stacking filters difficult. If people want
> >>multiple filters per hook, the orchestrator would have to rewrite the existing
> >>filters to be cooperative.
> >
> >I'm not really sure "stacking" in the kernel side is a good idea.
> >Please see below.
> >
> >>>I'm not convinced about the approach.  It's an approach which pretty
> >>>much requires future extensions while being rigid.  Not a good
> >>>combination.
> >>
> >>Do you have an alternative recommendation? Maybe just a set of 5 u64s
> >>as the context object along with the hook ID?
> >
> >cgroup fs doesn't seem like the right interface for this but if it
> >were I'd go for named hook IDs instead of opaque numbers.
> >
> >>>Unless this is properly delegatable, IOW, it's safe to fully delegate
> >>>to a lesser security domain for all operations including program
> >>>loading and assignment (I can't see how that'd be the case), making it
> >>>an explicit controller doens't work in terms of userland interface.
> >>>It's fine for bpf / lsm / whatever to attach to cgroups by extending
> >>>struct cgroup itself or implementing an implicit controller but to be
> >>>visible as an explicit controller it must be able to follow cgroup
> >>>interface rules including delegation.  If not, it's best to come
> >>>through the interface which enforces the required permission checks
> >>>and then talk to cgroup from there.  This was also an issue with
> >>>network cgroup bpf programs that Daniel Mack is working on.  Please
> >>>chat with him.
> >>
> >>Program assignment is possible by lesser security domains. Program loading is
> >>limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to
> >>CAP_SYS_ADMIN in any userns, but it the reasoning behind this is that Checmate
> >>BPF programs can leak kernel pointers.
> >
> >That doesn't make much sense to me.  Delegation doesn't mean much if a
> >delegatee can't load its own program (and I don't see how one can
> >delegate kernel pointer access to !root).  Also, unless there's
> >per-program fine control on who can load it, it seems pretty dangerous
> >to let anyone load any program.
> >
> >>Could we potentially restrict it to only CAP_MAC_OVERRIDE, while still meeting
> >>cgroup delegation requirements?
> >
> >Wouldn't it make far more sense to pass cgroup fd to bpf syscall so
> >that "load this program" and "attach this program to the cgroup
> >identified by this fd" through the same interface and permission
> >checks?  cgroup participating in bpf operations is all fine but
> >splitting the userland interface across two domains seems like a bad
> >idea.
> >
> >>Filters which are higher up in the heirarchy will still be enforced during
> >>delegation. This was an explicit design, as the "Orchestrator in Orchestrator"
> >>use case needs to be supported.
> >
> >Given that program loading is restricted to root, wouldn't it be an a
> >lot more efficient approach to let userland multiplex multiple
> >programs?  Walking the tree executing bpf programs each time one of
> >these operations runs can be pretty expensive.  Imagine a tree like
> >the following.
> >
> >	A - B - C
> >	      \ D
> >
> >Let's say program is currently loaded on D.  If someone wants to add a
> >program on B, userland can load the program on B, combine B's and D's
> >program and compile them into a single program and load it on D.  The
> >only thing kernel would need to do in terms of hierarchy is finding
> >what's the closest program to execute.  In the above example, C would
> >need to use B's program and that can be determined on program
> >assignment time rather than on each operation.
> 
> I think that's exactly what Daniel's patches are doing and imo
> it makes sense to keep this style for lsm as well
> and also apply the concept of hook_id.
> Daniel adds two commands to bpf syscall to attach/detach from cgroup
> with hook_id.
I have a couple of outstanding questions about Daniel's patches, but overall, I 
think the approach works just as well. I've asked him these questions on that 
thread.

> Initially two hooks will be for socket rx and tx.
> Then all interesting lsm hooks can be added one by one.
> Daniel's prog type will be bpf_prog_type_cgroup_socket_filter.
> LSM's prog type will be bpf_prog_type_lsm.
> And verifier can check type safety since the lsm hook_id will be
> passed at the program load time.
> See another thread we had with Mickael.
I read that thread, but I'm not entirely sure as to the reasoning behind the 
verifier needing further analysis of the programs, at least in MVP. 
bpf_probe_read / bpf_probe_(kernel_)write seem to be reasonable as a starter, 
and then we can modify the verifier to allow for direct access to those fields.

I also realize that the verifier can be used to prevent pointer leakage, but 
with my work, since it's meant to only be accessible to CAP_SYS_ADMIN, I'm not 
really worried about that.
> 
> landlock and checmate are very similar and should really be
> single lsm as long as we agree that both are cgroup based.
> The main difference between the two:
> - landlock is proposing unpriveleged mode
> - checmate is proposing writing into arguments from the program
> These differences can be flags/options to one lsm.
> Implementations of course are different so far, but
> instead of arguing landlock vs checmate, I'd like us
> to focus on how we can make one lsm that solves all use cases.
> 
I think it probably makes sense to be able to mark a specific program as 
unprivileged, and then the verifier can make sure it's safe. My fear with this, 
and the approach that Daniel's patches have is that the unprivileged isolators 
could interfere with privileged isolators.

What's the permission model by which unprivileged programs will be able to load 
programs, and attach them to cgroups?
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..fbb7aa7 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@  SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_SECURITY_CHECMATE)
+SUBSYS(checmate)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/checmate.h b/include/linux/checmate.h
new file mode 100644
index 0000000..4c4db4a
--- /dev/null
+++ b/include/linux/checmate.h
@@ -0,0 +1,108 @@ 
+#ifndef _LINUX_CHECMATE_H_
+#define _LINUX_CHECMATE_H_ 1
+#include <linux/security.h>
+
+enum checmate_hook_num {
+	/* CONFIG_SECURITY_NET hooks */
+	CHECMATE_HOOK_UNIX_STREAM_CONNECT,
+	CHECMATE_HOOK_UNIX_MAY_SEND,
+	CHECMATE_HOOK_SOCKET_CREATE,
+	CHECMATE_HOOK_SOCKET_POST_CREATE,
+	CHECMATE_HOOK_SOCKET_BIND,
+	CHECMATE_HOOK_SOCKET_CONNECT,
+	CHECMATE_HOOK_SOCKET_LISTEN,
+	CHECMATE_HOOK_SOCKET_ACCEPT,
+	CHECMATE_HOOK_SOCKET_SENDMSG,
+	CHECMATE_HOOK_SOCKET_RECVMSG,
+	CHECMATE_HOOK_SOCKET_GETSOCKNAME,
+	CHECMATE_HOOK_SOCKET_GETPEERNAME,
+	CHECMATE_HOOK_SOCKET_GETSOCKOPT,
+	CHECMATE_HOOK_SOCKET_SETSOCKOPT,
+	CHECMATE_HOOK_SOCKET_SHUTDOWN,
+	CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
+	CHECMATE_HOOK_SK_FREE_SECURITY,
+	__CHECMATE_HOOK_MAX,
+};
+
+/* CONFIG_SECURITY_NET contexts */
+struct checmate_unix_stream_connect_ctx {
+	struct sock *sock;
+	struct sock *other;
+	struct sock *newsk;
+};
+
+struct checmate_unix_may_send_ctx {
+	struct socket *sock;
+	struct socket *other;
+};
+
+struct checmate_socket_create_ctx {
+	int family;
+	int type;
+	int protocol;
+	int kern;
+};
+
+struct checmate_socket_bind_ctx {
+	struct socket *sock;
+	struct sockaddr *address;
+	int addrlen;
+};
+
+struct checmate_socket_connect_ctx {
+	struct socket *sock;
+	struct sockaddr *address;
+	int addrlen;
+};
+
+struct checmate_socket_listen_ctx {
+	struct socket *sock;
+	int backlog;
+};
+
+struct checmate_socket_accept_ctx {
+	struct socket *sock;
+	struct socket *newsock;
+};
+
+struct checmate_socket_sendmsg_ctx {
+	struct socket *sock;
+	struct msghdr *msg;
+	int size;
+};
+
+struct checmate_socket_recvmsg_ctx {
+	struct socket *sock;
+	struct msghdr *msg;
+	int size;
+	int flags;
+};
+
+struct checmate_socket_sock_rcv_skb_ctx {
+	struct sock *sk;
+	struct sk_buff *skb;
+};
+
+struct checmate_sk_free_security_ctx {
+	struct sock *sk;
+};
+
+struct checmate_ctx {
+	int hook;
+	union {
+/* CONFIG_SECURITY_NET contexts */
+		struct checmate_unix_stream_connect_ctx	unix_stream_connect;
+		struct checmate_unix_may_send_ctx	unix_may_send;
+		struct checmate_socket_create_ctx	socket_create;
+		struct checmate_socket_bind_ctx		socket_bind;
+		struct checmate_socket_connect_ctx	socket_connect;
+		struct checmate_socket_listen_ctx	socket_listen;
+		struct checmate_socket_accept_ctx	socket_accept;
+		struct checmate_socket_sendmsg_ctx	socket_sendmsg;
+		struct checmate_socket_recvmsg_ctx	socket_recvmsg;
+		struct checmate_socket_sock_rcv_skb_ctx	socket_sock_rcv_skb;
+		struct checmate_sk_free_security_ctx	sk_free_security;
+	};
+};
+
+#endif /* _LINUX_CHECMATE_H_ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e4c5a1b..91bc92f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_SCHED_ACT,
 	BPF_PROG_TYPE_TRACEPOINT,
 	BPF_PROG_TYPE_XDP,
+	BPF_PROG_TYPE_CHECMATE,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..6f4f7b0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -741,7 +741,7 @@  static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
-	if (type == BPF_PROG_TYPE_KPROBE &&
+	if ((type == BPF_PROG_TYPE_KPROBE || type == BPF_PROG_TYPE_CHECMATE) &&
 	    attr->kern_version != LINUX_VERSION_CODE)
 		return -EINVAL;
 
diff --git a/security/Kconfig b/security/Kconfig
index df28f2b..c819539 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -152,6 +152,7 @@  source security/tomoyo/Kconfig
 source security/apparmor/Kconfig
 source security/loadpin/Kconfig
 source security/yama/Kconfig
+source security/checmate/Kconfig
 
 source security/integrity/Kconfig
 
diff --git a/security/Makefile b/security/Makefile
index f2d71cd..6cc3342 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -8,6 +8,7 @@  subdir-$(CONFIG_SECURITY_SMACK)		+= smack
 subdir-$(CONFIG_SECURITY_TOMOYO)        += tomoyo
 subdir-$(CONFIG_SECURITY_APPARMOR)	+= apparmor
 subdir-$(CONFIG_SECURITY_YAMA)		+= yama
+subdir-$(CONFIG_SECURITY_CHECMATE)	+= checmate
 subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
 
 # always enable default capabilities
@@ -25,6 +26,7 @@  obj-$(CONFIG_SECURITY_APPARMOR)		+= apparmor/
 obj-$(CONFIG_SECURITY_YAMA)		+= yama/
 obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
+obj-$(CONFIG_SECURITY_CHECMATE)		+= checmate/
 
 # Object integrity file lists
 subdir-$(CONFIG_INTEGRITY)		+= integrity
diff --git a/security/checmate/Kconfig b/security/checmate/Kconfig
new file mode 100644
index 0000000..9dc76d1
--- /dev/null
+++ b/security/checmate/Kconfig
@@ -0,0 +1,11 @@ 
+config SECURITY_CHECMATE
+	bool "Checmate support"
+	depends on SECURITY && SOCK_CGROUP_DATA
+	default n
+	help
+	  This selects Checmate, which is an LSM that works in conjunction with
+	  cgroups and BPF in order to provide programmable, flexible, and
+	  extensible security policies. Further information can be found in
+	  Documentation/security/Checmate.txt
+
+	  If you are unsure how to answer this question, answer N.
diff --git a/security/checmate/Makefile b/security/checmate/Makefile
new file mode 100644
index 0000000..c676773
--- /dev/null
+++ b/security/checmate/Makefile
@@ -0,0 +1,3 @@ 
+obj-$(CONFIG_SECURITY_CHECMATE) := checmate.o
+
+checmate-y := checmate_bpf.o checmate_lsm.o
diff --git a/security/checmate/checmate_bpf.c b/security/checmate/checmate_bpf.c
new file mode 100644
index 0000000..001225c
--- /dev/null
+++ b/security/checmate/checmate_bpf.c
@@ -0,0 +1,68 @@ 
+/*
+ * Checmate Linux Security Module
+ *
+ * Copyright (C) 2016 Sargun Dhillon <sargun@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bpf.h>
+#include <linux/checmate.h>
+
+static const struct bpf_func_proto *
+checmate_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_probe_read:
+		return &bpf_probe_read_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_current_pid_tgid:
+		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_task:
+		return &bpf_get_current_task_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_current_comm:
+		return &bpf_get_current_comm_proto;
+	case BPF_FUNC_trace_printk:
+		return bpf_get_trace_printk_proto();
+	default:
+		return NULL;
+	}
+}
+
+/* Allow only aligned reads lying entirely inside struct checmate_ctx */
+static bool checmate_prog_is_valid_access(int off, int size,
+					  enum bpf_access_type type,
+					  enum bpf_reg_type *reg_type)
+{
+	if (type != BPF_READ || off < 0 || size <= 0 || off % size != 0)
+		return false;
+	/* off alone in range is not enough: off + size must fit too */
+	return (size_t)off + size <= sizeof(struct checmate_ctx);
+}
+
+static const struct bpf_verifier_ops checmate_prog_ops = {
+	.get_func_proto		= checmate_prog_func_proto,
+	.is_valid_access	= checmate_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list checmate_tl = {
+	.ops	= &checmate_prog_ops,
+	.type	= BPF_PROG_TYPE_CHECMATE,
+};
+
+void register_checmate_prog_ops(void)
+{
+	bpf_register_prog_type(&checmate_tl);
+}
diff --git a/security/checmate/checmate_lsm.c b/security/checmate/checmate_lsm.c
new file mode 100644
index 0000000..ef8514d
--- /dev/null
+++ b/security/checmate/checmate_lsm.c
@@ -0,0 +1,610 @@ 
+/*
+ * Checmate Linux Security Module
+ *
+ * Copyright (C) 2016 Sargun Dhillon <sargun@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/prctl.h>
+#include <linux/checmate.h>
+#include <linux/lsm_hooks.h>
+#include <linux/mutex.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/cgroup.h>
+#include <net/sock.h>
+#include <net/request_sock.h>
+
+#define MAX_CHECMATE_INSTANCES 32
+
+/* Global mutex for any Checmate hook manipulation operations */
+DEFINE_MUTEX(checmate_mutex);
+
+#define CHECMATE_CFTYPE(HOOK, NAME)			\
+	{						\
+		.name		= NAME,			\
+		.private	= HOOK,			\
+		.read_u64	= checmate_read_u64,	\
+		.write_s64	= checmate_write_s64,	\
+		.flags		= CFTYPE_NOT_ON_ROOT	\
+	}
+
+extern void register_checmate_prog_ops(void);
+
+struct checmate_instance {
+	struct list_head	list;
+	struct rcu_head		rcu;
+	struct bpf_prog		*prog;
+};
+
+struct checmate_hook {
+	struct list_head	instances;
+	int			count;
+};
+
+struct checmate_css {
+	struct cgroup_subsys_state	css;
+	struct checmate_hook		hooks[__CHECMATE_HOOK_MAX];
+};
+
+static struct checmate_css *css_checmate(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct checmate_css, css);
+}
+
+static struct checmate_css *parent_checmate(struct checmate_css *checmate)
+{
+	return css_checmate(checmate->css.parent);
+}
+
+static struct cgroup_subsys_state *
+checmate_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct checmate_css *checmate;
+	int i;
+
+	checmate = kzalloc(sizeof(*checmate), GFP_KERNEL);
+	if (!checmate)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < ARRAY_SIZE(checmate->hooks); i++)
+		INIT_LIST_HEAD(&checmate->hooks[i].instances);
+
+	return &checmate->css;
+}
+
+/*
+ * checmate_hook_free - Deallocate, and release resources for a given hook
+ * @hook: The hook
+ *
+ * Always succeeds. Only to be used when hook is out of use, and therefore
+ * doesn't use the RCU mechanism to clean up the hook. Only use it for
+ * retirement of a hook.
+ */
+static void checmate_hook_free(struct checmate_hook *hook)
+{
+	struct checmate_instance *instance, *next;
+
+	list_for_each_entry_safe(instance, next, &hook->instances, list) {
+		list_del(&instance->list);
+		bpf_prog_put(instance->prog);
+		kfree(instance);
+	}
+}
+
+/*
+ * checmate_css_free - Callback for css_free
+ * @css: The cgroup_subsys_state to be freed
+ */
+static void checmate_css_free(struct cgroup_subsys_state *css)
+{
+	struct checmate_css *checmate = css_checmate(css);
+	int i;
+
+	mutex_lock(&checmate_mutex);
+	for (i = 0; i < ARRAY_SIZE(checmate->hooks); i++)
+		checmate_hook_free(&checmate->hooks[i]);
+
+	kfree(checmate);
+	mutex_unlock(&checmate_mutex);
+}
+
+/*
+ * checmate_instance_add - Add BPF program instance to a Checmate hook
+ * @hook: The hook
+ * @prog: A checmate BPF program; the caller's reference is consumed
+ *
+ * Only adds programs that are not already part of the hook. On success the
+ * new instance owns @prog's reference; on failure it is dropped exactly once.
+ *
+ * Returns 0 on success, -ENOSPC, -EEXIST or -ENOMEM on failure.
+ *
+ * Requires that the Checmate mutex is held during the operation.
+ */
+static int checmate_instance_add(struct checmate_hook *hook,
+				 struct bpf_prog *prog)
+{
+	struct checmate_instance *instance;
+	int rc = -ENOSPC;
+
+	if (hook->count >= MAX_CHECMATE_INSTANCES)
+		goto err;
+
+	list_for_each_entry(instance, &hook->instances, list) {
+		if (instance->prog == prog) {
+			/* err drops the reference; don't put it here too */
+			rc = -EEXIST;
+			goto err;
+		}
+	}
+
+	instance = kmalloc(sizeof(*instance), GFP_KERNEL);
+	if (!instance) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	instance->prog = prog;
+	list_add_tail_rcu(&instance->list, &hook->instances);
+	hook->count++;
+	return 0;
+
+err:
+	bpf_prog_put(prog);
+	return rc;
+}
+
+/*
+ * checmate_instance_cleanup_rcu - Cleans up a Checmate program instance
+ * @rp: rcu_head pointer to a Checmate instance
+ */
+static void checmate_instance_cleanup_rcu(struct rcu_head *rp)
+{
+	struct checmate_instance *instance;
+
+	instance = container_of(rp, struct checmate_instance, rcu);
+	bpf_prog_put(instance->prog);
+	kfree(instance);
+}
+
+/*
+ * checmate_instance_remove - Remove Checmate program instance from a hook
+ * @hook: The hook
+ * @prog: A Checmate BPF program referred to by the instance.
+ *
+ * Returns 0 on success. -errno on failure.
+ *
+ * Requires that the Checmate mutex is held during the operation.
+ */
+static int checmate_instance_remove(struct checmate_hook *hook,
+				    struct bpf_prog *prog)
+{
+	struct checmate_instance *instance, *next;
+	int rc = -ENOENT;
+
+	list_for_each_entry_safe(instance, next, &hook->instances, list) {
+		if (instance->prog == prog) {
+			list_del_rcu(&instance->list);
+			call_rcu(&instance->rcu, checmate_instance_cleanup_rcu);
+			rc = 0;
+			hook->count--;
+			break;
+		}
+	}
+	bpf_prog_put(prog);
+
+	return rc;
+}
+
+/*
+ * checmate_hook_reset - Remove all program instances from a Checmate hook
+ * @hook: The hook
+ *
+ * Always succeeds.
+ *
+ * Requires that the Checmate mutex is held during the operation.
+ */
+static void checmate_hook_reset(struct checmate_hook *hook)
+{
+	struct checmate_instance *instance, *next;
+
+	list_for_each_entry_safe(instance, next, &hook->instances, list) {
+		list_del_rcu(&instance->list);
+		call_rcu(&instance->rcu, checmate_instance_cleanup_rcu);
+	}
+	hook->count = 0;
+}
+
+/*
+ * checmate_write_s64 - Handle a write to the checmate cgroup control file
+ * @css: The given cgroup state that owns the hook
+ * @cft: The given cftype that is being referenced, used to get the hook id.
+ * @val: The bpf program fd that is involved in the operation, or 0.
+ *
+ * val == 0: Reset all programs in hook.
+ * val > 0: Add the given program.
+ * val < 0: Remove the given program.
+ *
+ * Returns 0 on success. -errno on failure. The Checmate mutex is taken
+ * here and released on every exit path, including program lookup failure.
+ */
+static int checmate_write_s64(struct cgroup_subsys_state *css,
+			      struct cftype *cft, s64 val)
+{
+	struct checmate_css *checmate = css_checmate(css);
+	struct checmate_hook *hook;
+	struct bpf_prog *prog;
+	int rc = 0;
+
+	hook = &checmate->hooks[cft->private];
+	mutex_lock(&checmate_mutex);
+	if (val == 0) {
+		checmate_hook_reset(hook);
+		goto out;
+	}
+
+	/* If we're not resetting, we have to load, and check the program */
+	prog = bpf_prog_get_type(abs(val), BPF_PROG_TYPE_CHECMATE);
+	if (IS_ERR(prog))
+		rc = PTR_ERR(prog);
+	else if (val > 0)
+		rc = checmate_instance_add(hook, prog);
+	else
+		rc = checmate_instance_remove(hook, prog);
+
+out:
+	mutex_unlock(&checmate_mutex);
+	return rc;
+}
+
+/*
+ * checmate_read_u64 - Report how many programs are attached to a hook
+ * @css: The cgroup state that owns the hook
+ * @cft: The cftype being read; cft->private selects the hook
+ *
+ * The count is read directly, without taking checmate_mutex.
+ *
+ * Returns the hook's instance count. Always succeeds.
+ */
+static u64 checmate_read_u64(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	return css_checmate(css)->hooks[cft->private].count;
+}
+
+/*
+ * Control files for the Checmate controller, one per hook. Each entry pairs
+ * a hook id with the name passed to CHECMATE_CFTYPE. No files are defined
+ * unless CONFIG_SECURITY_NETWORK is set. The empty entry ends the table.
+ */
+static struct cftype checmate_files[] = {
+#ifdef CONFIG_SECURITY_NETWORK
+	CHECMATE_CFTYPE(CHECMATE_HOOK_UNIX_STREAM_CONNECT,
+			"unix_stream_connect"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_UNIX_MAY_SEND,
+			"unix_may_send"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_CREATE,
+			"socket_create"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_BIND,
+			"socket_bind"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_CONNECT,
+			"socket_connect"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_LISTEN,
+			"socket_listen"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_ACCEPT,
+			"socket_accept"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SENDMSG,
+			"socket_sendmsg"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_RECVMSG,
+			"socket_recvmsg"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SHUTDOWN,
+			"socket_shutdown"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
+			"socket_sock_rcv_skb"),
+	CHECMATE_CFTYPE(CHECMATE_HOOK_SK_FREE_SECURITY,
+			"sk_free_security"),
+#endif /* CONFIG_SECURITY_NETWORK */
+	{}
+};
+
+/*
+ * cgroup subsystem descriptor for Checmate: the control files above are
+ * installed through dfl_cftypes, and css lifetime is handled by the
+ * checmate_css_alloc() / checmate_css_free() callbacks.
+ */
+struct cgroup_subsys checmate_cgrp_subsys = {
+	.dfl_cftypes = checmate_files,
+	.css_alloc = checmate_css_alloc,
+	.css_free = checmate_css_free,
+};
+
+/*
+ * checmate_check_filters - Run the BPF programs attached to one hook
+ * @checmate: The Checmate css whose hook table is consulted
+ * @ctx: The Checmate context; ctx->hook selects the hook, and the context
+ *       is passed to every program
+ *
+ * Programs are run in list order under rcu_read_lock(). Iteration stops at
+ * the first program that returns non-zero.
+ *
+ * Returns 0 if every program returned 0 (or none are attached), otherwise
+ * the first non-zero program result.
+ */
+static int checmate_check_filters(struct checmate_css *checmate,
+				  struct checmate_ctx *ctx)
+{
+	struct checmate_hook *target = &checmate->hooks[ctx->hook];
+	struct checmate_instance *inst;
+	int ret = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inst, &target->instances, list) {
+		ret = BPF_PROG_RUN(inst->prog, (void *)ctx);
+		if (ret != 0)
+			break;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * call_bpf_int_hook - Walk the cgroup hierarchy, running filters up the chain
+ * @hook: The hook ID
+ * @css: The Checmate css to start from; may be NULL or an ERR_PTR
+ * @ctx: The Checmate context, populated by the caller except for ctx->hook,
+ *       which this function sets
+ *
+ * Starting at @css, the filters of each level are run and the walk moves to
+ * the parent. The walk ends at the first level for which parent_checmate()
+ * yields nothing, and that level's own filters are not run.
+ *
+ * Returns 0 on success, or if @css is NULL / an error pointer (fail open).
+ * Otherwise returns the first non-zero result from checmate_check_filters().
+ */
+static int call_bpf_int_hook(int hook, struct cgroup_subsys_state *css,
+			     struct checmate_ctx *ctx)
+{
+	struct checmate_css *level;
+	int ret = 0;
+
+	/* Fail open if we can't find the css / cgroup */
+	if (unlikely(IS_ERR_OR_NULL(css)))
+		return ret;
+
+	ctx->hook = hook;
+
+	level = css_checmate(css);
+	while (parent_checmate(level)) {
+		ret = checmate_check_filters(level, ctx);
+		if (ret)
+			break;
+		level = parent_checmate(level);
+	}
+
+	return ret;
+}
+
+/*
+ * call_bpf_void_hook - Run all the BPF programs associated with a hook
+ * @hook: The hook ID
+ * @css: The Checmate css to start from
+ * @ctx: The Checmate context
+ *
+ * Same as call_bpf_int_hook(), with the result discarded.
+ */
+static void call_bpf_void_hook(int hook, struct cgroup_subsys_state *css,
+			       struct checmate_ctx *ctx)
+{
+	(void)call_bpf_int_hook(hook, css, ctx);
+}
+
+/*
+ * css_from_sk - Get the Checmate CSS for an sk
+ * @sk: The struct sock to look up
+ *
+ * Starts at the cgroup recorded in sk->sk_cgrp_data and climbs towards the
+ * root until a cgroup with a Checmate css is found. The lookup runs under
+ * rcu_read_lock(), which is dropped before the pointer is returned.
+ *
+ * Returns the first css found, NULL if no cgroup on the path has one, or
+ * ERR_PTR(-EINVAL) if @sk is not a full socket.
+ */
+static struct cgroup_subsys_state *css_from_sk(struct sock *sk)
+{
+	struct cgroup_subsys_state *found;
+	struct cgroup *pos;
+
+	if (!sk_fullsock(sk))
+		return ERR_PTR(-EINVAL);
+	pos = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+	rcu_read_lock();
+	for (;;) {
+		found = rcu_dereference(pos->subsys[checmate_cgrp_id]);
+		if (found)
+			break;
+		pos = cgroup_parent(pos);
+		if (!pos)
+			break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * css_from_sock - Get the Checmate CSS for a socket
+ * @sock: The struct socket to look up
+ *
+ * Thin wrapper around css_from_sk() operating on sock->sk.
+ *
+ * Returns ERR_PTR(-ENOENT) if the socket has no sk, otherwise whatever
+ * css_from_sk() returns (a css, NULL or an ERR_PTR).
+ */
+static struct cgroup_subsys_state *css_from_sock(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	return sk ? css_from_sk(sk) : ERR_PTR(-ENOENT);
+}
+
+/*
+ * css_from_current - Get the Checmate CSS for the current task context
+ *
+ * Refuses to run from interrupt context. Otherwise the css of the current
+ * task is looked up with task_css() under rcu_read_lock(), which is dropped
+ * before the pointer is returned.
+ *
+ * Returns the css on success, or ERR_PTR(-ENOENT) when called from an
+ * interrupt.
+ */
+static struct cgroup_subsys_state *css_from_current(void)
+{
+	struct cgroup_subsys_state *state;
+
+	if (unlikely(in_interrupt()))
+		return ERR_PTR(-ENOENT);
+
+	rcu_read_lock();
+	state = task_css(current, checmate_cgrp_id);
+	rcu_read_unlock();
+
+	return state;
+}
+
+/* Checmate hooks */
+#ifdef CONFIG_SECURITY_NETWORK
+/*
+ * LSM unix_stream_connect hook: packs the three socks into the context and
+ * runs the CHECMATE_HOOK_UNIX_STREAM_CONNECT programs for the css of @sock.
+ */
+static int checmate_unix_stream_connect(struct sock *sock, struct sock *other,
+					struct sock *newsk)
+{
+	struct checmate_ctx args;
+
+	args.unix_stream_connect.sock = sock;
+	args.unix_stream_connect.other = other;
+	args.unix_stream_connect.newsk = newsk;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_UNIX_STREAM_CONNECT,
+				 css_from_sk(sock), &args);
+}
+
+/*
+ * LSM unix_may_send hook: runs the CHECMATE_HOOK_UNIX_MAY_SEND programs for
+ * the css of @sock, passing both sockets in the context.
+ */
+static int checmate_unix_may_send(struct socket *sock, struct socket *other)
+{
+	struct checmate_ctx args;
+
+	args.unix_may_send.sock = sock;
+	args.unix_may_send.other = other;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_UNIX_MAY_SEND,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_create hook: no socket exists yet, so the css is taken from
+ * the current task. Runs the CHECMATE_HOOK_SOCKET_CREATE programs with the
+ * creation parameters in the context.
+ */
+static int checmate_socket_create(int family, int type, int protocol, int kern)
+{
+	struct checmate_ctx args;
+
+	args.socket_create.family = family;
+	args.socket_create.type = type;
+	args.socket_create.protocol = protocol;
+	args.socket_create.kern = kern;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_CREATE,
+				 css_from_current(), &args);
+}
+
+/*
+ * LSM socket_bind hook: runs the CHECMATE_HOOK_SOCKET_BIND programs for the
+ * css of @sock, passing the socket and the address being bound.
+ */
+static int checmate_socket_bind(struct socket *sock, struct sockaddr *address,
+				int addrlen)
+{
+	struct checmate_ctx args;
+
+	args.socket_bind.sock = sock;
+	args.socket_bind.address = address;
+	args.socket_bind.addrlen = addrlen;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_BIND,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_connect hook: runs the CHECMATE_HOOK_SOCKET_CONNECT programs
+ * for the css of @sock, passing the socket and the destination address.
+ */
+static int checmate_socket_connect(struct socket *sock,
+				   struct sockaddr *address, int addrlen)
+{
+	struct checmate_ctx args;
+
+	args.socket_connect.sock = sock;
+	args.socket_connect.address = address;
+	args.socket_connect.addrlen = addrlen;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_CONNECT,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_listen hook: runs the CHECMATE_HOOK_SOCKET_LISTEN programs for
+ * the css of @sock, passing the socket and the requested backlog.
+ */
+static int checmate_socket_listen(struct socket *sock, int backlog)
+{
+	struct checmate_ctx args;
+
+	args.socket_listen.sock = sock;
+	args.socket_listen.backlog = backlog;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_LISTEN,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_accept hook: runs the CHECMATE_HOOK_SOCKET_ACCEPT programs for
+ * the css of @sock, passing both the listening and the new socket.
+ */
+static int checmate_socket_accept(struct socket *sock, struct socket *newsock)
+{
+	struct checmate_ctx args;
+
+	args.socket_accept.sock = sock;
+	args.socket_accept.newsock = newsock;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_ACCEPT,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_sendmsg hook: runs the CHECMATE_HOOK_SOCKET_SENDMSG programs
+ * for the css of @sock, passing the socket, message header and size.
+ */
+static int checmate_socket_sendmsg(struct socket *sock, struct msghdr *msg,
+				   int size)
+{
+	struct checmate_ctx args;
+
+	args.socket_sendmsg.sock = sock;
+	args.socket_sendmsg.msg = msg;
+	args.socket_sendmsg.size = size;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_SENDMSG,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_recvmsg hook: runs the CHECMATE_HOOK_SOCKET_RECVMSG programs
+ * for the css of @sock, passing the socket, message header, size and flags.
+ */
+static int checmate_socket_recvmsg(struct socket *sock, struct msghdr *msg,
+				   int size, int flags)
+{
+	struct checmate_ctx args;
+
+	args.socket_recvmsg.sock = sock;
+	args.socket_recvmsg.msg = msg;
+	args.socket_recvmsg.size = size;
+	args.socket_recvmsg.flags = flags;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_RECVMSG,
+				 css_from_sock(sock), &args);
+}
+
+/*
+ * LSM socket_sock_rcv_skb hook: runs the CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB
+ * programs for the css of @sk, passing the sock and the received skb.
+ */
+static int checmate_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct checmate_ctx args;
+
+	args.socket_sock_rcv_skb.sk = sk;
+	args.socket_sock_rcv_skb.skb = skb;
+
+	return call_bpf_int_hook(CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
+				 css_from_sk(sk), &args);
+}
+
+/*
+ * LSM sk_free_security hook: runs the CHECMATE_HOOK_SK_FREE_SECURITY
+ * programs for the css of @sk. The hook has no return value, so whatever
+ * the programs return is discarded by call_bpf_void_hook().
+ */
+static void checmate_sk_free_security(struct sock *sk)
+{
+	struct cgroup_subsys_state *css;
+	struct checmate_ctx ctx;
+
+	css = css_from_sk(sk);
+	ctx.sk_free_security.sk = sk;
+	/* No "return <expr>" here: not valid ISO C in a void function. */
+	call_bpf_void_hook(CHECMATE_HOOK_SK_FREE_SECURITY, css, &ctx);
+}
+
+#endif /* CONFIG_SECURITY_NETWORK */
+
+/*
+ * LSM hook table handed to security_add_hooks() in checmate_setup(). Every
+ * entry maps an LSM hook to its Checmate wrapper. The table is empty when
+ * CONFIG_SECURITY_NETWORK is not set.
+ */
+static struct security_hook_list checmate_hooks[] = {
+#ifdef CONFIG_SECURITY_NETWORK
+	/* AF_UNIX specific hooks */
+	LSM_HOOK_INIT(unix_stream_connect, checmate_unix_stream_connect),
+	LSM_HOOK_INIT(unix_may_send, checmate_unix_may_send),
+
+	/* Generic socket hooks */
+	LSM_HOOK_INIT(socket_create, checmate_socket_create),
+	LSM_HOOK_INIT(socket_bind, checmate_socket_bind),
+	LSM_HOOK_INIT(socket_connect, checmate_socket_connect),
+	LSM_HOOK_INIT(socket_listen, checmate_socket_listen),
+	LSM_HOOK_INIT(socket_accept, checmate_socket_accept),
+	LSM_HOOK_INIT(socket_sendmsg, checmate_socket_sendmsg),
+	LSM_HOOK_INIT(socket_recvmsg, checmate_socket_recvmsg),
+	LSM_HOOK_INIT(socket_sock_rcv_skb, checmate_socket_sock_rcv_skb),
+
+	/* struct sock lifetime hook */
+	LSM_HOOK_INIT(sk_free_security, checmate_sk_free_security),
+#endif /* CONFIG_SECURITY_NETWORK */
+};
+
+/*
+ * checmate_setup - Activate Checmate
+ *
+ * Logs a message, calls register_checmate_prog_ops() and then installs the
+ * checmate_hooks table with security_add_hooks(). Registered through
+ * late_initcall().
+ *
+ * Always returns 0.
+ */
+static int __init checmate_setup(void)
+{
+	pr_info("Checmate activating.\n");
+
+	/* Program ops are registered before the LSM hooks go live. */
+	register_checmate_prog_ops();
+	security_add_hooks(checmate_hooks, ARRAY_SIZE(checmate_hooks));
+
+	return 0;
+}
+late_initcall(checmate_setup);