
[v2,2/8] exec: turn self_exec_id into self_privunit

Message ID 1474663238-22134-3-git-send-email-jann@thejh.net (mailing list archive)
State New, archived

Commit Message

Jann Horn Sept. 23, 2016, 8:40 p.m. UTC
This ensures that self_privunit (the "privilege unit" locally unique ID)
is only shared by processes that share both the mm_struct and the
signal_struct; not just spatially, but also temporally. In other words, if
you do execve() or clone() without CLONE_THREAD, you get a new privunit
that has never been used before.

One reason for doing this is that it prevents an attacker from sending an
arbitrary signal to a parent process after performing 2^32-1 execve()
calls.
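
For illustration, the wraparound could be driven by a loop like the
following (userspace sketch, not part of this patch; the privileged
target path is made up, and the child that records parent_exec_id is
assumed to have been forked before the loop starts):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* remaining self re-execs before the final privileged exec */
	unsigned long long left = argc > 1 ?
		strtoull(argv[1], NULL, 0) : 0xffffffffULL;
	char buf[32];

	if (left == 0) {
		/*
		 * Final exec into the privileged target: the 32-bit
		 * self_exec_id has wrapped back around to the value the
		 * child recorded at fork, so do_notify_parent() no longer
		 * downgrades the arbitrary exit signal to SIGCHLD.
		 */
		execl("/path/to/privileged-target", "target", (char *)NULL);
		return 1;
	}
	/* each execve() bumps this process's 32-bit self_exec_id by one */
	snprintf(buf, sizeof(buf), "%llu", left - 1);
	execl("/proc/self/exe", argv[0], buf, (char *)NULL);
	return 1;
}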

The second reason is that it permits using self_privunit in a later patch
to check, during a ptrace access, whether subject and object are
temporally and spatially equal for privilege-checking purposes.
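
Roughly like this (hypothetical sketch; the actual check is only added in
a later patch of this series):

/* hypothetical: subject and object must be in the same privunit */
static bool same_privunit(struct task_struct *subject,
			  struct task_struct *object)
{
	return luid_eq(&subject->self_privunit, &object->self_privunit);
}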

The implementation of locally unique IDs is in sched.h and exec.c for now
because those are the only users so far - if anything else wants to use
them in the future, they can be moved elsewhere.

Changed in v2:
 - have 2^64 IDs per CPU instead of 2^64 shared ones (LUID scheme,
   suggested by Andy Lutomirski)
 - take tasklist_lock for reading in setup_new_exec() while bumping the
   LUID

Signed-off-by: Jann Horn <jann@thejh.net>
---
 fs/exec.c             | 41 +++++++++++++++++++++++++++++++++++++++--
 include/linux/sched.h | 17 +++++++++++++++--
 kernel/fork.c         |  5 +++--
 kernel/signal.c       |  5 ++++-
 4 files changed, 61 insertions(+), 7 deletions(-)

Comments

Andy Lutomirski Sept. 23, 2016, 9:04 p.m. UTC | #1
On Fri, Sep 23, 2016 at 1:40 PM, Jann Horn <jann@thejh.net> wrote:
> [...]
> diff --git a/fs/exec.c b/fs/exec.c
> index 84430ee..fcc11f0 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1281,6 +1281,34 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
>  }
>  EXPORT_SYMBOL(would_dump);
>
> +static DEFINE_PER_CPU(u64, luid_counters);
> +
> +static int __init init_luid_counters(void)
> +{
> +       unsigned int cpu;
> +
> +       for_each_possible_cpu(cpu) {
> +               /* value 0 is reserved for init */
> +               per_cpu(luid_counters, cpu) = 1;
> +       }
> +
> +       return 0;
> +}
> +early_initcall(init_luid_counters);

How about static DEFINE_PER_CPU(u64, luid_counters) = 1?  You could
optionally use local64_t instead, which would let you avoid needing to
think about preemption.
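
For concreteness, an untested sketch of that variant (initializer and
helpers as declared in asm-generic/local64.h; get_cpu()/put_cpu() pinning
is kept so that ->cpu and ->count stay consistent):

static DEFINE_PER_CPU(local64_t, luid_counters) = LOCAL64_INIT(1);

void fill_luid(struct luid *out)
{
	/*
	 * get_cpu() keeps us on one CPU so ->cpu names the counter we
	 * bump; the local64 read-modify-write is additionally safe
	 * against interrupts on this CPU.
	 */
	out->cpu = get_cpu();
	/* subtract 1 so the stored count is the pre-increment value */
	out->count = local64_inc_return(this_cpu_ptr(&luid_counters)) - 1;
	put_cpu();
}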

> +
> +/*
> + * Allocates a new LUID and writes the allocated LUID to @out.
> + * This function must not be called from IRQ context.
> + */
> +void fill_luid(struct luid *out)
> +{
> +       preempt_disable();
> +       out->count = raw_cpu_read(luid_counters);
> +       raw_cpu_add(luid_counters, 1);
> +       out->cpu = smp_processor_id();
> +       preempt_enable();
> +}
> +

I would call this alloc_luid().
Jann Horn Sept. 23, 2016, 9:33 p.m. UTC | #2
On Fri, Sep 23, 2016 at 02:04:30PM -0700, Andy Lutomirski wrote:
> On Fri, Sep 23, 2016 at 1:40 PM, Jann Horn <jann@thejh.net> wrote:
> > [...]
> > +static DEFINE_PER_CPU(u64, luid_counters);
> > +
> > +static int __init init_luid_counters(void)
> > +{
> > +       unsigned int cpu;
> > +
> > +       for_each_possible_cpu(cpu) {
> > +               /* value 0 is reserved for init */
> > +               per_cpu(luid_counters, cpu) = 1;
> > +       }
> > +
> > +       return 0;
> > +}
> > +early_initcall(init_luid_counters);
> 
> How about static DEFINE_PER_CPU(u64, luid_counters) = 1?  You could
> optionally use local64_t instead, which would let you avoid needing to
> think about preemption.

Ah, I didn't realize that either of those was possible. Yes, I guess I'll
change it to use local64_t.


Patch

diff --git a/fs/exec.c b/fs/exec.c
index 84430ee..fcc11f0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1281,6 +1281,34 @@  void would_dump(struct linux_binprm *bprm, struct file *file)
 }
 EXPORT_SYMBOL(would_dump);
 
+static DEFINE_PER_CPU(u64, luid_counters);
+
+static int __init init_luid_counters(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		/* value 0 is reserved for init */
+		per_cpu(luid_counters, cpu) = 1;
+	}
+
+	return 0;
+}
+early_initcall(init_luid_counters);
+
+/*
+ * Allocates a new LUID and writes the allocated LUID to @out.
+ * This function must not be called from IRQ context.
+ */
+void fill_luid(struct luid *out)
+{
+	preempt_disable();
+	out->count = raw_cpu_read(luid_counters);
+	raw_cpu_add(luid_counters, 1);
+	out->cpu = smp_processor_id();
+	preempt_enable();
+}
+
 void setup_new_exec(struct linux_binprm * bprm)
 {
 	arch_pick_mmap_layout(current->mm);
@@ -1313,8 +1341,17 @@  void setup_new_exec(struct linux_binprm * bprm)
 	}
 
 	/* An exec changes our domain. We are no longer part of the thread
-	   group */
-	current->self_exec_id++;
+	 * group.
+	 * The privunit luid is regenerated with the tasklist_lock held for
+	 * reading to allow do_notify_parent() (which only runs with
+	 * tasklist_lock held for writing) to inspect privunit IDs of other
+	 * tasks without taking the cred_guard_light (which wouldn't work
+	 * because the tasklist_lock is held).
+	 */
+	read_lock(&tasklist_lock);
+	fill_luid(&current->self_privunit);
+	read_unlock(&tasklist_lock);
+
 	flush_signal_handlers(current, 0);
 	do_close_on_exec(current->files);
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2a1df2f..fa90e36 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1467,6 +1467,19 @@  struct tlbflush_unmap_batch {
 	bool writable;
 };
 
+/* locally unique ID */
+struct luid {
+	u64 count;
+	unsigned int cpu;
+};
+
+void fill_luid(struct luid *out);
+
+static inline bool luid_eq(const struct luid *a, const struct luid *b)
+{
+	return a->count == b->count && a->cpu == b->cpu;
+}
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1688,8 +1701,8 @@  struct task_struct {
 	struct seccomp seccomp;
 
 /* Thread group tracking */
-   	u32 parent_exec_id;
-   	u32 self_exec_id;
+	struct luid parent_privunit;
+	struct luid self_privunit;
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
  * mempolicy */
 	spinlock_t alloc_lock;
diff --git a/kernel/fork.c b/kernel/fork.c
index 2d46f3a..e1bd501 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1567,6 +1567,7 @@  static struct task_struct *copy_process(unsigned long clone_flags,
 			p->exit_signal = (clone_flags & CSIGNAL);
 		p->group_leader = p;
 		p->tgid = p->pid;
+		fill_luid(&p->self_privunit);
 	}
 
 	p->nr_dirtied = 0;
@@ -1597,10 +1598,10 @@  static struct task_struct *copy_process(unsigned long clone_flags,
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
 		p->real_parent = current->real_parent;
-		p->parent_exec_id = current->parent_exec_id;
+		p->parent_privunit = current->parent_privunit;
 	} else {
 		p->real_parent = current;
-		p->parent_exec_id = current->self_exec_id;
+		p->parent_privunit = current->self_privunit;
 	}
 
 	spin_lock(&current->sighand->siglock);
diff --git a/kernel/signal.c b/kernel/signal.c
index af21afc..3dbd25b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1566,6 +1566,8 @@  ret:
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
  *
+ * Must be called with tasklist_lock held for writing.
+ *
  * Returns true if our parent ignored us and so we've switched to
  * self-reaping.
  */
@@ -1590,7 +1592,8 @@  bool do_notify_parent(struct task_struct *tsk, int sig)
 		 * This is only possible if parent == real_parent.
 		 * Check if it has changed security domain.
 		 */
-		if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+		if (!luid_eq(&tsk->parent_privunit,
+			     &tsk->parent->self_privunit))
 			sig = SIGCHLD;
 	}