diff mbox series

[RFC,2/2] signal: extend pidfd_send_signal() to allow expedited process killing

Message ID 20190411014353.113252-3-surenb@google.com (mailing list archive)
State New, archived
Headers show
Series opportunistic memory reclaim of a killed process | expand

Commit Message

Suren Baghdasaryan April 11, 2019, 1:43 a.m. UTC
Add new SS_EXPEDITE flag to be used when sending SIGKILL via
pidfd_send_signal() syscall to allow expedited memory reclaim of the
victim process. The usage of this flag is currently limited to SIGKILL
signal and only to privileged users.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/sched/signal.h |  3 ++-
 include/linux/signal.h       | 11 ++++++++++-
 ipc/mqueue.c                 |  2 +-
 kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
 kernel/time/itimer.c         |  2 +-
 5 files changed, 43 insertions(+), 12 deletions(-)

Comments

Christian Brauner April 11, 2019, 10:30 a.m. UTC | #1
On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> pidfd_send_signal() syscall to allow expedited memory reclaim of the
> victim process. The usage of this flag is currently limited to SIGKILL
> signal and only to privileged users.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/sched/signal.h |  3 ++-
>  include/linux/signal.h       | 11 ++++++++++-
>  ipc/mqueue.c                 |  2 +-
>  kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
>  kernel/time/itimer.c         |  2 +-
>  5 files changed, 43 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index e412c092c1e8..8a227633a058 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
>  extern void force_sigsegv(int sig, struct task_struct *p);
>  extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
>  extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
> -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
> +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> +				bool expedite);
>  extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
>  				const struct cred *);
>  extern int kill_pgrp(struct pid *pid, int sig, int priv);
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 9702016734b1..34b7852aa4a0 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long);
>  } while (0);
>  
>  #ifdef CONFIG_PROC_FS
> +
> +/*
> + * SS_FLAGS values used in pidfd_send_signal:
> + *
> + * SS_EXPEDITE indicates desire to expedite the operation.
> + */
> +#define SS_EXPEDITE	0x00000001

Does this make sense as an SS_* flag?
How does this relate to the signal stack?
Is there any intention to ever use this flag with stack_t?

New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be
PIDFD_SIGNAL_THREAD.)
And since this is exposed to userspace in contrast to the mm internal
naming it should be something more easily understandable like
PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something.

> +
>  struct seq_file;
>  extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
> -#endif
> +
> +#endif /* CONFIG_PROC_FS */
>  
>  #endif /* _LINUX_SIGNAL_H */
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index aea30530c472..27c66296e08e 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info)
>  			rcu_read_unlock();
>  
>  			kill_pid_info(info->notify.sigev_signo,
> -				      &sig_i, info->notify_owner);
> +				      &sig_i, info->notify_owner, false);
>  			break;
>  		case SIGEV_THREAD:
>  			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
> diff --git a/kernel/signal.c b/kernel/signal.c
> index f98448cf2def..02ed4332d17c 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -43,6 +43,7 @@
>  #include <linux/compiler.h>
>  #include <linux/posix-timers.h>
>  #include <linux/livepatch.h>
> +#include <linux/oom.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/signal.h>
> @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
>  	return success ? 0 : retval;
>  }
>  
> -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> +				  bool expedite)
>  {
>  	int error = -ESRCH;
>  	struct task_struct *p;
> @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
>  	for (;;) {
>  		rcu_read_lock();
>  		p = pid_task(pid, PIDTYPE_PID);
> -		if (p)
> +		if (p) {
>  			error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
> +
> +			/*
> +			 * Ignore expedite_reclaim return value, it is best
> +			 * effort only.
> +			 */
> +			if (!error && expedite)
> +				expedite_reclaim(p);

SIGKILL will take the whole thread group down so the reclaim should make
sense here.

> +		}
> +
>  		rcu_read_unlock();
>  		if (likely(!p || error != -ESRCH))
>  			return error;
> @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
>  {
>  	int error;
>  	rcu_read_lock();
> -	error = kill_pid_info(sig, info, find_vpid(pid));
> +	error = kill_pid_info(sig, info, find_vpid(pid), false);
>  	rcu_read_unlock();
>  	return error;
>  }
> @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
>  
>  	if (pid > 0) {
>  		rcu_read_lock();
> -		ret = kill_pid_info(sig, info, find_vpid(pid));
> +		ret = kill_pid_info(sig, info, find_vpid(pid), false);
>  		rcu_read_unlock();
>  		return ret;
>  	}
> @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp);
>  
>  int kill_pid(struct pid *pid, int sig, int priv)
>  {
> -	return kill_pid_info(sig, __si_special(priv), pid);
> +	return kill_pid_info(sig, __si_special(priv), pid, false);
>  }
>  EXPORT_SYMBOL(kill_pid);
>  
> @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
>  	struct pid *pid;
>  	kernel_siginfo_t kinfo;
>  
> -	/* Enforce flags be set to 0 until we add an extension. */
> -	if (flags)
> +	/* Enforce no unknown flags. */
> +	if (flags & ~SS_EXPEDITE)
>  		return -EINVAL;
>  
> +	if (flags & SS_EXPEDITE) {
> +		/* Enforce SS_EXPEDITE to be used with SIGKILL only. */
> +		if (sig != SIGKILL)
> +			return -EINVAL;

Not super fond of this being a SIGKILL specific flag but I get why.

> +
> +		/* Limit expedited killing to privileged users only. */
> +		if (!capable(CAP_SYS_NICE))
> +			return -EPERM;

Do you have a specific (DOS or other) attack vector in mind that renders
ns_capable unsuitable?

> +	}
> +
>  	f = fdget_raw(pidfd);
>  	if (!f.file)
>  		return -EBADF;
> @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
>  		prepare_kill_siginfo(sig, &kinfo);
>  	}
>  
> -	ret = kill_pid_info(sig, &kinfo, pid);
> +	ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
>  
>  err:
>  	fdput(f);
> diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
> index 02068b2d5862..c926483cdb53 100644
> --- a/kernel/time/itimer.c
> +++ b/kernel/time/itimer.c
> @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
>  	struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
>  
>  	trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
> -	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
> +	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
>  
>  	return HRTIMER_NORESTART;
>  }
> -- 
> 2.21.0.392.gf8f6787159e-goog
>
Christian Brauner April 11, 2019, 10:34 a.m. UTC | #2
Cc: Oleg too

On Thu, Apr 11, 2019 at 12:30:18PM +0200, Christian Brauner wrote:
> On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > victim process. The usage of this flag is currently limited to SIGKILL
> > signal and only to privileged users.
> > 
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  include/linux/sched/signal.h |  3 ++-
> >  include/linux/signal.h       | 11 ++++++++++-
> >  ipc/mqueue.c                 |  2 +-
> >  kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
> >  kernel/time/itimer.c         |  2 +-
> >  5 files changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> > index e412c092c1e8..8a227633a058 100644
> > --- a/include/linux/sched/signal.h
> > +++ b/include/linux/sched/signal.h
> > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> >  extern void force_sigsegv(int sig, struct task_struct *p);
> >  extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> >  extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
> > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
> > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > +				bool expedite);
> >  extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
> >  				const struct cred *);
> >  extern int kill_pgrp(struct pid *pid, int sig, int priv);
> > diff --git a/include/linux/signal.h b/include/linux/signal.h
> > index 9702016734b1..34b7852aa4a0 100644
> > --- a/include/linux/signal.h
> > +++ b/include/linux/signal.h
> > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long);
> >  } while (0);
> >  
> >  #ifdef CONFIG_PROC_FS
> > +
> > +/*
> > + * SS_FLAGS values used in pidfd_send_signal:
> > + *
> > + * SS_EXPEDITE indicates desire to expedite the operation.
> > + */
> > +#define SS_EXPEDITE	0x00000001
> 
> Does this make sense as an SS_* flag?
> How does this relate to the signal stack?
> Is there any intention to ever use this flag with stack_t?
> 
> New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be
> PIDFD_SIGNAL_THREAD.)
> And since this is exposed to userspace in contrast to the mm internal
> naming it should be something more easily understandable like
> PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something.
> 
> > +
> >  struct seq_file;
> >  extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
> > -#endif
> > +
> > +#endif /* CONFIG_PROC_FS */
> >  
> >  #endif /* _LINUX_SIGNAL_H */
> > diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> > index aea30530c472..27c66296e08e 100644
> > --- a/ipc/mqueue.c
> > +++ b/ipc/mqueue.c
> > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info)
> >  			rcu_read_unlock();
> >  
> >  			kill_pid_info(info->notify.sigev_signo,
> > -				      &sig_i, info->notify_owner);
> > +				      &sig_i, info->notify_owner, false);
> >  			break;
> >  		case SIGEV_THREAD:
> >  			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
> > diff --git a/kernel/signal.c b/kernel/signal.c
> > index f98448cf2def..02ed4332d17c 100644
> > --- a/kernel/signal.c
> > +++ b/kernel/signal.c
> > @@ -43,6 +43,7 @@
> >  #include <linux/compiler.h>
> >  #include <linux/posix-timers.h>
> >  #include <linux/livepatch.h>
> > +#include <linux/oom.h>
> >  
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/signal.h>
> > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
> >  	return success ? 0 : retval;
> >  }
> >  
> > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > +				  bool expedite)
> >  {
> >  	int error = -ESRCH;
> >  	struct task_struct *p;
> > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> >  	for (;;) {
> >  		rcu_read_lock();
> >  		p = pid_task(pid, PIDTYPE_PID);
> > -		if (p)
> > +		if (p) {
> >  			error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
> > +
> > +			/*
> > +			 * Ignore expedite_reclaim return value, it is best
> > +			 * effort only.
> > +			 */
> > +			if (!error && expedite)
> > +				expedite_reclaim(p);
> 
> SIGKILL will take the whole thread group down so the reclaim should make
> sense here.
> 
> > +		}
> > +
> >  		rcu_read_unlock();
> >  		if (likely(!p || error != -ESRCH))
> >  			return error;
> > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
> >  {
> >  	int error;
> >  	rcu_read_lock();
> > -	error = kill_pid_info(sig, info, find_vpid(pid));
> > +	error = kill_pid_info(sig, info, find_vpid(pid), false);
> >  	rcu_read_unlock();
> >  	return error;
> >  }
> > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
> >  
> >  	if (pid > 0) {
> >  		rcu_read_lock();
> > -		ret = kill_pid_info(sig, info, find_vpid(pid));
> > +		ret = kill_pid_info(sig, info, find_vpid(pid), false);
> >  		rcu_read_unlock();
> >  		return ret;
> >  	}
> > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp);
> >  
> >  int kill_pid(struct pid *pid, int sig, int priv)
> >  {
> > -	return kill_pid_info(sig, __si_special(priv), pid);
> > +	return kill_pid_info(sig, __si_special(priv), pid, false);
> >  }
> >  EXPORT_SYMBOL(kill_pid);
> >  
> > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> >  	struct pid *pid;
> >  	kernel_siginfo_t kinfo;
> >  
> > -	/* Enforce flags be set to 0 until we add an extension. */
> > -	if (flags)
> > +	/* Enforce no unknown flags. */
> > +	if (flags & ~SS_EXPEDITE)
> >  		return -EINVAL;
> >  
> > +	if (flags & SS_EXPEDITE) {
> > +		/* Enforce SS_EXPEDITE to be used with SIGKILL only. */
> > +		if (sig != SIGKILL)
> > +			return -EINVAL;
> 
> Not super fond of this being a SIGKILL specific flag but I get why.
> 
> > +
> > +		/* Limit expedited killing to privileged users only. */
> > +		if (!capable(CAP_SYS_NICE))
> > +			return -EPERM;
> 
> Do you have a specific (DOS or other) attack vector in mind that renders
> ns_capable unsuitable?
> 
> > +	}
> > +
> >  	f = fdget_raw(pidfd);
> >  	if (!f.file)
> >  		return -EBADF;
> > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> >  		prepare_kill_siginfo(sig, &kinfo);
> >  	}
> >  
> > -	ret = kill_pid_info(sig, &kinfo, pid);
> > +	ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
> >  
> >  err:
> >  	fdput(f);
> > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
> > index 02068b2d5862..c926483cdb53 100644
> > --- a/kernel/time/itimer.c
> > +++ b/kernel/time/itimer.c
> > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
> >  	struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
> >  
> >  	trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
> > -	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
> > +	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
> >  
> >  	return HRTIMER_NORESTART;
> >  }
> > -- 
> > 2.21.0.392.gf8f6787159e-goog
> >
Suren Baghdasaryan April 11, 2019, 3:18 p.m. UTC | #3
Thanks for the feedback!
Just to be clear, this implementation is used in this RFC as a
reference to explain the intent. To be honest I don't think it will be
adopted as is even if the idea survives scrutiny.

On Thu, Apr 11, 2019 at 3:30 AM Christian Brauner <christian@brauner.io> wrote:
>
> On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > victim process. The usage of this flag is currently limited to SIGKILL
> > signal and only to privileged users.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  include/linux/sched/signal.h |  3 ++-
> >  include/linux/signal.h       | 11 ++++++++++-
> >  ipc/mqueue.c                 |  2 +-
> >  kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
> >  kernel/time/itimer.c         |  2 +-
> >  5 files changed, 43 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> > index e412c092c1e8..8a227633a058 100644
> > --- a/include/linux/sched/signal.h
> > +++ b/include/linux/sched/signal.h
> > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> >  extern void force_sigsegv(int sig, struct task_struct *p);
> >  extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> >  extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
> > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
> > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > +                             bool expedite);
> >  extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
> >                               const struct cred *);
> >  extern int kill_pgrp(struct pid *pid, int sig, int priv);
> > diff --git a/include/linux/signal.h b/include/linux/signal.h
> > index 9702016734b1..34b7852aa4a0 100644
> > --- a/include/linux/signal.h
> > +++ b/include/linux/signal.h
> > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long);
> >  } while (0);
> >
> >  #ifdef CONFIG_PROC_FS
> > +
> > +/*
> > + * SS_FLAGS values used in pidfd_send_signal:
> > + *
> > + * SS_EXPEDITE indicates desire to expedite the operation.
> > + */
> > +#define SS_EXPEDITE  0x00000001
>
> Does this make sense as an SS_* flag?
> How does this relate to the signal stack?

It doesn't, so I agree that the name should be changed.
PIDFD_SIGNAL_EXPEDITE_MM_RECLAIM would seem appropriate.

> Is there any intention to ever use this flag with stack_t?
>
> New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be
> PIDFD_SIGNAL_THREAD.)
> And since this is exposed to userspace in contrast to the mm internal
> naming it should be something more easily understandable like
> PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something.
>
> > +
> >  struct seq_file;
> >  extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
> > -#endif
> > +
> > +#endif /* CONFIG_PROC_FS */
> >
> >  #endif /* _LINUX_SIGNAL_H */
> > diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> > index aea30530c472..27c66296e08e 100644
> > --- a/ipc/mqueue.c
> > +++ b/ipc/mqueue.c
> > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info)
> >                       rcu_read_unlock();
> >
> >                       kill_pid_info(info->notify.sigev_signo,
> > -                                   &sig_i, info->notify_owner);
> > +                                   &sig_i, info->notify_owner, false);
> >                       break;
> >               case SIGEV_THREAD:
> >                       set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
> > diff --git a/kernel/signal.c b/kernel/signal.c
> > index f98448cf2def..02ed4332d17c 100644
> > --- a/kernel/signal.c
> > +++ b/kernel/signal.c
> > @@ -43,6 +43,7 @@
> >  #include <linux/compiler.h>
> >  #include <linux/posix-timers.h>
> >  #include <linux/livepatch.h>
> > +#include <linux/oom.h>
> >
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/signal.h>
> > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
> >       return success ? 0 : retval;
> >  }
> >
> > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > +                               bool expedite)
> >  {
> >       int error = -ESRCH;
> >       struct task_struct *p;
> > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> >       for (;;) {
> >               rcu_read_lock();
> >               p = pid_task(pid, PIDTYPE_PID);
> > -             if (p)
> > +             if (p) {
> >                       error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
> > +
> > +                     /*
> > +                      * Ignore expedite_reclaim return value, it is best
> > +                      * effort only.
> > +                      */
> > +                     if (!error && expedite)
> > +                             expedite_reclaim(p);
>
> SIGKILL will take the whole thread group down so the reclaim should make
> sense here.
>

This sounds like confirmation. I hope I'm not missing some flaw that
you are trying to point out.

> > +             }
> > +
> >               rcu_read_unlock();
> >               if (likely(!p || error != -ESRCH))
> >                       return error;
> > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
> >  {
> >       int error;
> >       rcu_read_lock();
> > -     error = kill_pid_info(sig, info, find_vpid(pid));
> > +     error = kill_pid_info(sig, info, find_vpid(pid), false);
> >       rcu_read_unlock();
> >       return error;
> >  }
> > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
> >
> >       if (pid > 0) {
> >               rcu_read_lock();
> > -             ret = kill_pid_info(sig, info, find_vpid(pid));
> > +             ret = kill_pid_info(sig, info, find_vpid(pid), false);
> >               rcu_read_unlock();
> >               return ret;
> >       }
> > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp);
> >
> >  int kill_pid(struct pid *pid, int sig, int priv)
> >  {
> > -     return kill_pid_info(sig, __si_special(priv), pid);
> > +     return kill_pid_info(sig, __si_special(priv), pid, false);
> >  }
> >  EXPORT_SYMBOL(kill_pid);
> >
> > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> >       struct pid *pid;
> >       kernel_siginfo_t kinfo;
> >
> > -     /* Enforce flags be set to 0 until we add an extension. */
> > -     if (flags)
> > +     /* Enforce no unknown flags. */
> > +     if (flags & ~SS_EXPEDITE)
> >               return -EINVAL;
> >
> > +     if (flags & SS_EXPEDITE) {
> > +             /* Enforce SS_EXPEDITE to be used with SIGKILL only. */
> > +             if (sig != SIGKILL)
> > +                     return -EINVAL;
>
> Not super fond of this being a SIGKILL specific flag but I get why.

Understood. I was thinking that EXPEDITE flag might make sense for
other signals in the future but from internal feedback sounds like if
we go this way the flag name should be more specific.

> > +
> > +             /* Limit expedited killing to privileged users only. */
> > +             if (!capable(CAP_SYS_NICE))
> > +                     return -EPERM;
>
> Do you have a specific (DOS or other) attack vector in mind that renders
> ns_capable unsuitable?
>
> > +     }
> > +
> >       f = fdget_raw(pidfd);
> >       if (!f.file)
> >               return -EBADF;
> > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> >               prepare_kill_siginfo(sig, &kinfo);
> >       }
> >
> > -     ret = kill_pid_info(sig, &kinfo, pid);
> > +     ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
> >
> >  err:
> >       fdput(f);
> > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
> > index 02068b2d5862..c926483cdb53 100644
> > --- a/kernel/time/itimer.c
> > +++ b/kernel/time/itimer.c
> > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
> >       struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
> >
> >       trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
> > -     kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
> > +     kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
> >
> >       return HRTIMER_NORESTART;
> >  }
> > --
> > 2.21.0.392.gf8f6787159e-goog
> >
Suren Baghdasaryan April 11, 2019, 3:23 p.m. UTC | #4
On Thu, Apr 11, 2019 at 8:18 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> Thanks for the feedback!
> Just to be clear, this implementation is used in this RFC as a
> reference to explain the intent. To be honest I don't think it will be
> adopted as is even if the idea survives scrutiny.
>
> On Thu, Apr 11, 2019 at 3:30 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > victim process. The usage of this flag is currently limited to SIGKILL
> > > signal and only to privileged users.
> > >
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > ---
> > >  include/linux/sched/signal.h |  3 ++-
> > >  include/linux/signal.h       | 11 ++++++++++-
> > >  ipc/mqueue.c                 |  2 +-
> > >  kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
> > >  kernel/time/itimer.c         |  2 +-
> > >  5 files changed, 43 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> > > index e412c092c1e8..8a227633a058 100644
> > > --- a/include/linux/sched/signal.h
> > > +++ b/include/linux/sched/signal.h
> > > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> > >  extern void force_sigsegv(int sig, struct task_struct *p);
> > >  extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> > >  extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
> > > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
> > > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > > +                             bool expedite);
> > >  extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
> > >                               const struct cred *);
> > >  extern int kill_pgrp(struct pid *pid, int sig, int priv);
> > > diff --git a/include/linux/signal.h b/include/linux/signal.h
> > > index 9702016734b1..34b7852aa4a0 100644
> > > --- a/include/linux/signal.h
> > > +++ b/include/linux/signal.h
> > > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long);
> > >  } while (0);
> > >
> > >  #ifdef CONFIG_PROC_FS
> > > +
> > > +/*
> > > + * SS_FLAGS values used in pidfd_send_signal:
> > > + *
> > > + * SS_EXPEDITE indicates desire to expedite the operation.
> > > + */
> > > +#define SS_EXPEDITE  0x00000001
> >
> > Does this make sense as an SS_* flag?
> > How does this relate to the signal stack?
>
> It doesn't, so I agree that the name should be changed.
> PIDFD_SIGNAL_EXPEDITE_MM_RECLAIM would seem appropriate.
>
> > Is there any intention to ever use this flag with stack_t?
> >
> > New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be
> > PIDFD_SIGNAL_THREAD.)
> > And since this is exposed to userspace in contrast to the mm internal
> > naming it should be something more easily understandable like
> > PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something.
> >
> > > +
> > >  struct seq_file;
> > >  extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
> > > -#endif
> > > +
> > > +#endif /* CONFIG_PROC_FS */
> > >
> > >  #endif /* _LINUX_SIGNAL_H */
> > > diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> > > index aea30530c472..27c66296e08e 100644
> > > --- a/ipc/mqueue.c
> > > +++ b/ipc/mqueue.c
> > > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info)
> > >                       rcu_read_unlock();
> > >
> > >                       kill_pid_info(info->notify.sigev_signo,
> > > -                                   &sig_i, info->notify_owner);
> > > +                                   &sig_i, info->notify_owner, false);
> > >                       break;
> > >               case SIGEV_THREAD:
> > >                       set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
> > > diff --git a/kernel/signal.c b/kernel/signal.c
> > > index f98448cf2def..02ed4332d17c 100644
> > > --- a/kernel/signal.c
> > > +++ b/kernel/signal.c
> > > @@ -43,6 +43,7 @@
> > >  #include <linux/compiler.h>
> > >  #include <linux/posix-timers.h>
> > >  #include <linux/livepatch.h>
> > > +#include <linux/oom.h>
> > >
> > >  #define CREATE_TRACE_POINTS
> > >  #include <trace/events/signal.h>
> > > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
> > >       return success ? 0 : retval;
> > >  }
> > >
> > > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> > > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > > +                               bool expedite)
> > >  {
> > >       int error = -ESRCH;
> > >       struct task_struct *p;
> > > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> > >       for (;;) {
> > >               rcu_read_lock();
> > >               p = pid_task(pid, PIDTYPE_PID);
> > > -             if (p)
> > > +             if (p) {
> > >                       error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
> > > +
> > > +                     /*
> > > +                      * Ignore expedite_reclaim return value, it is best
> > > +                      * effort only.
> > > +                      */
> > > +                     if (!error && expedite)
> > > +                             expedite_reclaim(p);
> >
> > SIGKILL will take the whole thread group down so the reclaim should make
> > sense here.
> >
>
> This sounds like confirmation. I hope I'm not missing some flaw that
> you are trying to point out.
>
> > > +             }
> > > +
> > >               rcu_read_unlock();
> > >               if (likely(!p || error != -ESRCH))
> > >                       return error;
> > > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
> > >  {
> > >       int error;
> > >       rcu_read_lock();
> > > -     error = kill_pid_info(sig, info, find_vpid(pid));
> > > +     error = kill_pid_info(sig, info, find_vpid(pid), false);
> > >       rcu_read_unlock();
> > >       return error;
> > >  }
> > > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
> > >
> > >       if (pid > 0) {
> > >               rcu_read_lock();
> > > -             ret = kill_pid_info(sig, info, find_vpid(pid));
> > > +             ret = kill_pid_info(sig, info, find_vpid(pid), false);
> > >               rcu_read_unlock();
> > >               return ret;
> > >       }
> > > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp);
> > >
> > >  int kill_pid(struct pid *pid, int sig, int priv)
> > >  {
> > > -     return kill_pid_info(sig, __si_special(priv), pid);
> > > +     return kill_pid_info(sig, __si_special(priv), pid, false);
> > >  }
> > >  EXPORT_SYMBOL(kill_pid);
> > >
> > > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> > >       struct pid *pid;
> > >       kernel_siginfo_t kinfo;
> > >
> > > -     /* Enforce flags be set to 0 until we add an extension. */
> > > -     if (flags)
> > > +     /* Enforce no unknown flags. */
> > > +     if (flags & ~SS_EXPEDITE)
> > >               return -EINVAL;
> > >
> > > +     if (flags & SS_EXPEDITE) {
> > > +             /* Enforce SS_EXPEDITE to be used with SIGKILL only. */
> > > +             if (sig != SIGKILL)
> > > +                     return -EINVAL;
> >
> > Not super fond of this being a SIGKILL specific flag but I get why.
>
> Understood. I was thinking that EXPEDITE flag might make sense for
> other signals in the future but from internal feedback sounds like if
> we go this way the flag name should be more specific.
>
> > > +
> > > +             /* Limit expedited killing to privileged users only. */
> > > +             if (!capable(CAP_SYS_NICE))
> > > +                     return -EPERM;
> >
> > Do you have a specific (DOS or other) attack vector in mind that renders
> > ns_capable unsuitable?
> >

Missed this one. I was thinking of oom-reaper thread as a limited
system resource (one thread which maintains a kill list and reaps
process mms one at a time) and therefore should be protected from
abuse.

> > > +     }
> > > +
> > >       f = fdget_raw(pidfd);
> > >       if (!f.file)
> > >               return -EBADF;
> > > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> > >               prepare_kill_siginfo(sig, &kinfo);
> > >       }
> > >
> > > -     ret = kill_pid_info(sig, &kinfo, pid);
> > > +     ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
> > >
> > >  err:
> > >       fdput(f);
> > > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
> > > index 02068b2d5862..c926483cdb53 100644
> > > --- a/kernel/time/itimer.c
> > > +++ b/kernel/time/itimer.c
> > > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
> > >       struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
> > >
> > >       trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
> > > -     kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
> > > +     kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
> > >
> > >       return HRTIMER_NORESTART;
> > >  }
> > > --
> > > 2.21.0.392.gf8f6787159e-goog
> > >
Matthew Wilcox April 11, 2019, 3:33 p.m. UTC | #5
On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> pidfd_send_signal() syscall to allow expedited memory reclaim of the
> victim process. The usage of this flag is currently limited to SIGKILL
> signal and only to privileged users.

What is the downside of doing expedited memory reclaim?  ie why not do it
every time a process is going to die?
Daniel Colascione April 11, 2019, 4:25 p.m. UTC | #6
On Thu, Apr 11, 2019 at 8:23 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > signal and only to privileged users.

FWIW, I like Suren's general idea, but I was thinking of a different
way of exposing the same general functionality to userspace. The way I
look at it, it's very useful for an auto-balancing memory system like
Android (or, I presume, something that uses oomd) to recover memory
*immediately* after a SIGKILL instead of waiting for the process to
kill itself: a process's death can be delayed for a long time due to
factors like scheduling and being blocked in various uninterruptible
kernel-side operations. Suren's proposal is basically about pulling
forward in time page reclaimation that would happen anyway.

What if we let userspace control exactly when this reclaimation
happens? I'm imagining a new* kernel facility that basically looks
like this. It lets lmkd determine for itself how much work the system
should expend on reclaiming memory from dying processes.

size_t try_reap_dying_process(
  int pidfd,
  int flags /* must be zero */,
  size_t maximum_bytes_to_reap);

Precondition: process is pending group-exit (someone already sent it SIGKILL)
Postcondition: some memory reclaimed from dying process
Invariant: doesn't sleep; stops reaping after MAXIMUM_BYTES_TO_REAP

-> success: return number of bytes reaped
-> failure: (size_t)-1

EBUSY: couldn't get mmap_sem
EINVAL: PIDFD isn't a pidfd or otherwise invalid arguments
EPERM: process hasn't been send SIGKILL: try_reap_dying_process on a
process that isn't dying is illegal

Kernel-side, try_reap_dying_process would try-acquire mmap_sem and
just fail if it couldn't get it. Once acquired, it would release
"easy" pages (using the same check the oom reaper uses) until it
either ran out of pages or hit the MAXIMUM_BYTES_TO_REAP cap. The
purpose of MAXIMUM_BYTES_TO_REAP is to let userspace bound-above the
amount of time we spend reclaiming pages. It'd be up to userspace to
set policy on retries, the actual value of the reap cap, the priority
at which we run TRY_REAP_DYING_PROCESS, and so on. We return the
number of bytes we managed to free this way so that lmkd can make an
informed decision about what to do next, e.g., kill something else or
wait a little while.

Personally, I like th approach a bit more that recruiting the oom
reaper through because it doesn't affect any kind of  emergency memory
reserve permission and because it frees us from having to think about
whether the oom reaper's thread priority is right for this particular
job.

It also occurred to me that try_reap_dying_process might make a decent
shrinker callback. Shrinkers are there, AIUI, to reclaim memory that's
easy to free and that's not essential for correct kernel operation.
Usually, it's some kind of cache that meets these criteria. But the
private pages of a dying process also meet the criteria, don't they?
I'm imagining the shrinker just picking an arbitrary doomed (dying but
not yet dead) process and freeing some of its pages. I know there are
concerns about slow shrinkers causing havoc throughout the system, but
since this shrinker would be bounded above on CPU time and would never
block, I feel like it'd be pretty safe.

* insert standard missive about system calls being cheap, but we can
talk about the way in which we expose this functionality after we
agree that it's a good idea generally
Johannes Weiner April 11, 2019, 5:05 p.m. UTC | #7
On Thu, Apr 11, 2019 at 08:33:13AM -0700, Matthew Wilcox wrote:
> On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > victim process. The usage of this flag is currently limited to SIGKILL
> > signal and only to privileged users.
> 
> What is the downside of doing expedited memory reclaim?  ie why not do it
> every time a process is going to die?

I agree with this. The oom reaper mostly does work the exiting task
would have to do anyway, so this shouldn't be measurably more
expensive to do it per default. I wouldn't want to add a new interface
unless we really do need that type of control.
Suren Baghdasaryan April 11, 2019, 5:09 p.m. UTC | #8
On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > victim process. The usage of this flag is currently limited to SIGKILL
> > signal and only to privileged users.
>
> What is the downside of doing expedited memory reclaim?  ie why not do it
> every time a process is going to die?

I think with an implementation that does not use/abuse oom-reaper
thread this could be done for any kill. As I mentioned oom-reaper is a
limited resource which has access to memory reserves and should not be
abused in the way I do in this reference implementation.
While there might be downsides that I don't know of, I'm not sure it's
required to hurry every kill's memory reclaim. I think there are cases
when resource deallocation is critical, for example when we kill to
relieve resource shortage and there are kills when reclaim speed is
not essential. It would be great if we can identify urgent cases
without userspace hints, so I'm open to suggestions that do not
involve additional flags.
Daniel Colascione April 11, 2019, 5:33 p.m. UTC | #9
On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > victim process. The usage of this flag is currently limited to SIGKILL
> > > signal and only to privileged users.
> >
> > What is the downside of doing expedited memory reclaim?  ie why not do it
> > every time a process is going to die?
>
> I think with an implementation that does not use/abuse oom-reaper
> thread this could be done for any kill. As I mentioned oom-reaper is a
> limited resource which has access to memory reserves and should not be
> abused in the way I do in this reference implementation.
> While there might be downsides that I don't know of, I'm not sure it's
> required to hurry every kill's memory reclaim. I think there are cases
> when resource deallocation is critical, for example when we kill to
> relieve resource shortage and there are kills when reclaim speed is
> not essential. It would be great if we can identify urgent cases
> without userspace hints, so I'm open to suggestions that do not
> involve additional flags.

I was imagining a PI-ish approach where we'd reap in case an RT
process was waiting on the death of some other process. I'd still
prefer the API I proposed in the other message because it gets the
kernel out of the business of deciding what the right signal is. I'm a
huge believer in "mechanism, not policy".
Matthew Wilcox April 11, 2019, 5:36 p.m. UTC | #10
On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote:
> On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > signal and only to privileged users.
> > >
> > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > every time a process is going to die?
> >
> > I think with an implementation that does not use/abuse oom-reaper
> > thread this could be done for any kill. As I mentioned oom-reaper is a
> > limited resource which has access to memory reserves and should not be
> > abused in the way I do in this reference implementation.
> > While there might be downsides that I don't know of, I'm not sure it's
> > required to hurry every kill's memory reclaim. I think there are cases
> > when resource deallocation is critical, for example when we kill to
> > relieve resource shortage and there are kills when reclaim speed is
> > not essential. It would be great if we can identify urgent cases
> > without userspace hints, so I'm open to suggestions that do not
> > involve additional flags.
> 
> I was imagining a PI-ish approach where we'd reap in case an RT
> process was waiting on the death of some other process. I'd still
> prefer the API I proposed in the other message because it gets the
> kernel out of the business of deciding what the right signal is. I'm a
> huge believer in "mechanism, not policy".

It's not a question of the kernel deciding what the right signal is.
The kernel knows whether a signal is fatal to a particular process or not.
The question is whether the killing process should do the work of reaping
the dying process's resources sometimes, always or never.  Currently,
that is never (the process reaps its own resources); Suren is suggesting
sometimes, and I'm asking "Why not always?"
Daniel Colascione April 11, 2019, 5:47 p.m. UTC | #11
On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote:
> > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > > signal and only to privileged users.
> > > >
> > > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > > every time a process is going to die?
> > >
> > > I think with an implementation that does not use/abuse oom-reaper
> > > thread this could be done for any kill. As I mentioned oom-reaper is a
> > > limited resource which has access to memory reserves and should not be
> > > abused in the way I do in this reference implementation.
> > > While there might be downsides that I don't know of, I'm not sure it's
> > > required to hurry every kill's memory reclaim. I think there are cases
> > > when resource deallocation is critical, for example when we kill to
> > > relieve resource shortage and there are kills when reclaim speed is
> > > not essential. It would be great if we can identify urgent cases
> > > without userspace hints, so I'm open to suggestions that do not
> > > involve additional flags.
> >
> > I was imagining a PI-ish approach where we'd reap in case an RT
> > process was waiting on the death of some other process. I'd still
> > prefer the API I proposed in the other message because it gets the
> > kernel out of the business of deciding what the right signal is. I'm a
> > huge believer in "mechanism, not policy".
>
> It's not a question of the kernel deciding what the right signal is.
> The kernel knows whether a signal is fatal to a particular process or not.
> The question is whether the killing process should do the work of reaping
> the dying process's resources sometimes, always or never.  Currently,
> that is never (the process reaps its own resources); Suren is suggesting
> sometimes, and I'm asking "Why not always?"

FWIW, Suren's initial proposal is that the oom_reaper kthread do the
reaping, not the process sending the kill. Are you suggesting that
sending SIGKILL should spend a while in signal delivery reaping pages
before returning? I thought about just doing it this way, but I didn't
like the idea: it'd slow down mass-killing programs like killall(1).
Programs expect sending SIGKILL to be a fast operation that returns
immediately.
Suren Baghdasaryan April 11, 2019, 5:52 p.m. UTC | #12
On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote:
> > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > > signal and only to privileged users.
> > > >
> > > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > > every time a process is going to die?
> > >
> > > I think with an implementation that does not use/abuse oom-reaper
> > > thread this could be done for any kill. As I mentioned oom-reaper is a
> > > limited resource which has access to memory reserves and should not be
> > > abused in the way I do in this reference implementation.
> > > While there might be downsides that I don't know of, I'm not sure it's
> > > required to hurry every kill's memory reclaim. I think there are cases
> > > when resource deallocation is critical, for example when we kill to
> > > relieve resource shortage and there are kills when reclaim speed is
> > > not essential. It would be great if we can identify urgent cases
> > > without userspace hints, so I'm open to suggestions that do not
> > > involve additional flags.
> >
> > I was imagining a PI-ish approach where we'd reap in case an RT
> > process was waiting on the death of some other process. I'd still
> > prefer the API I proposed in the other message because it gets the
> > kernel out of the business of deciding what the right signal is. I'm a
> > huge believer in "mechanism, not policy".
>
> It's not a question of the kernel deciding what the right signal is.
> The kernel knows whether a signal is fatal to a particular process or not.
> The question is whether the killing process should do the work of reaping
> the dying process's resources sometimes, always or never.  Currently,
> that is never (the process reaps its own resources); Suren is suggesting
> sometimes, and I'm asking "Why not always?"

If there are no downsides of doing this always (like using some
resources that can be utilized in a better way) then by all means,
let's do it unconditionally. My current implementation is not one of
such cases :)

I think with implementation when killing process is doing the reaping
of the victim's mm this can be done unconditionally because we don't
use resources which might otherwise be used in a better way. Overall I
like Daniel's idea of the process that requested killing and is
waiting for the victim to die helping in reaping its memory. It kind
of naturally elevates the priority of the reaping if the priority of
the waiting process is higher than victim's priority (a kind of
priority inheritance).
Roman Gushchin April 11, 2019, 9:45 p.m. UTC | #13
On Thu, Apr 11, 2019 at 10:09:06AM -0700, Suren Baghdasaryan wrote:
> On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > victim process. The usage of this flag is currently limited to SIGKILL
> > > signal and only to privileged users.
> >
> > What is the downside of doing expedited memory reclaim?  ie why not do it
> > every time a process is going to die?

Hello, Suren!

I also like the idea to reap always.

> I think with an implementation that does not use/abuse oom-reaper
> thread this could be done for any kill. As I mentioned oom-reaper is a
> limited resource which has access to memory reserves and should not be
> abused in the way I do in this reference implementation.

In most OOM cases it doesn't matter that much which task to reap,
so I don't think that reusing the oom-reaper thread is bad.
It should be relatively easy to tweak in a way, that it won't
wait for mmap_sem if there are other tasks waiting to be reaped.
Also, the oom code add to the head of the list, and the expedited
killing to the end, or something like this.

The only think, if we're going to reap all tasks, we probably
want to have a per-node oom_reaper thread.

Thanks!
Suren Baghdasaryan April 11, 2019, 9:59 p.m. UTC | #14
On Thu, Apr 11, 2019 at 2:45 PM Roman Gushchin <guro@fb.com> wrote:
>
> On Thu, Apr 11, 2019 at 10:09:06AM -0700, Suren Baghdasaryan wrote:
> > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > signal and only to privileged users.
> > >
> > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > every time a process is going to die?
>
> Hello, Suren!
>
> I also like the idea to reap always.
>
> > I think with an implementation that does not use/abuse oom-reaper
> > thread this could be done for any kill. As I mentioned oom-reaper is a
> > limited resource which has access to memory reserves and should not be
> > abused in the way I do in this reference implementation.
>
> In most OOM cases it doesn't matter that much which task to reap,
> so I don't think that reusing the oom-reaper thread is bad.
> It should be relatively easy to tweak in a way, that it won't
> wait for mmap_sem if there are other tasks waiting to be reaped.
> Also, the oom code add to the head of the list, and the expedited
> killing to the end, or something like this.
>
> The only think, if we're going to reap all tasks, we probably
> want to have a per-node oom_reaper thread.

Thanks for the ideas Roman. I'll take some time to digest the input
from everybody. What I heard from everyone is that we want this to be
a part of generic kill functionality which does not require a change
in userspace API.

> Thanks!
>
> --
> You received this message because you are subscribed to the Google Groups "kernel-team" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Michal Hocko April 12, 2019, 6:49 a.m. UTC | #15
On Thu 11-04-19 10:47:50, Daniel Colascione wrote:
> On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote:
> > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > > >
> > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > > > signal and only to privileged users.
> > > > >
> > > > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > > > every time a process is going to die?
> > > >
> > > > I think with an implementation that does not use/abuse oom-reaper
> > > > thread this could be done for any kill. As I mentioned oom-reaper is a
> > > > limited resource which has access to memory reserves and should not be
> > > > abused in the way I do in this reference implementation.
> > > > While there might be downsides that I don't know of, I'm not sure it's
> > > > required to hurry every kill's memory reclaim. I think there are cases
> > > > when resource deallocation is critical, for example when we kill to
> > > > relieve resource shortage and there are kills when reclaim speed is
> > > > not essential. It would be great if we can identify urgent cases
> > > > without userspace hints, so I'm open to suggestions that do not
> > > > involve additional flags.
> > >
> > > I was imagining a PI-ish approach where we'd reap in case an RT
> > > process was waiting on the death of some other process. I'd still
> > > prefer the API I proposed in the other message because it gets the
> > > kernel out of the business of deciding what the right signal is. I'm a
> > > huge believer in "mechanism, not policy".
> >
> > It's not a question of the kernel deciding what the right signal is.
> > The kernel knows whether a signal is fatal to a particular process or not.
> > The question is whether the killing process should do the work of reaping
> > the dying process's resources sometimes, always or never.  Currently,
> > that is never (the process reaps its own resources); Suren is suggesting
> > sometimes, and I'm asking "Why not always?"
> 
> FWIW, Suren's initial proposal is that the oom_reaper kthread do the
> reaping, not the process sending the kill. Are you suggesting that
> sending SIGKILL should spend a while in signal delivery reaping pages
> before returning? I thought about just doing it this way, but I didn't
> like the idea: it'd slow down mass-killing programs like killall(1).
> Programs expect sending SIGKILL to be a fast operation that returns
> immediately.

I was thinking about this as well. And SYNC_SIGKILL would workaround the
current expectations of how quick the current implementation is. The
harder part would what is the actual semantic. Does the kill wait until
the target task is TASK_DEAD or is there an intermediate step that would
we could call it end of the day and still have a reasonable semantic
(e.g. the original pid is really not alive anymore).
Michal Hocko April 12, 2019, 6:53 a.m. UTC | #16
On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:
> On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > victim process. The usage of this flag is currently limited to SIGKILL
> > signal and only to privileged users.
> 
> What is the downside of doing expedited memory reclaim?  ie why not do it
> every time a process is going to die?
 
Well, you are tearing down an address space which might be still in use
because the task not fully dead yeat. So there are two downsides AFAICS.
Core dumping which will not see the reaped memory so the resulting
coredump might be incomplete. And unexpected #PF/gup on the reaped
memory will result in SIGBUS. These are things that we have closed our
eyes in the oom context because they likely do not matter. If we want to
use the same technique for other usecases then we have to think how much
that matter again.
Suren Baghdasaryan April 12, 2019, 2:02 p.m. UTC | #17
On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote:

> On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:
> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > victim process. The usage of this flag is currently limited to SIGKILL
> > > signal and only to privileged users.
> >
> > What is the downside of doing expedited memory reclaim?  ie why not do it
> > every time a process is going to die?
>
> Well, you are tearing down an address space which might be still in use
> because the task not fully dead yeat. So there are two downsides AFAICS.
> Core dumping which will not see the reaped memory so the resulting
> coredump might be incomplete. And unexpected #PF/gup on the reaped
> memory will result in SIGBUS. These are things that we have closed our
> eyes in the oom context because they likely do not matter. If we want to
> use the same technique for other usecases then we have to think how much
> that matter again.


After some internal discussions we realized that there is one additional
downside of doing reaping unconditionally. If this async reaping is done on
a more efficient and energy hungrier CPU irrespective of urgency of the
kill it should end up costing more power overall (I’m referring here to
assimetric architectures like ARM big.LITTLE). Obviously quantifying that
cost is not easy as it depends on the usecase and a particular system but
it won’t be zero. So I think we will need some gating condition after all.


>
> --
> Michal Hocko
> SUSE Labs
>
> --
> You received this message because you are subscribed to the Google Groups
> "kernel-team" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to kernel-team+unsubscribe@android.com.
>
>
<div><div><br></div><div><br><div class="gmail_quote"></div></div></div><div><div dir="ltr" class="gmail_attr">On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko &lt;<a href="mailto:mhocko@kernel.org" target="_blank">mhocko@kernel.org</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:<br>
&gt; On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:<br>
&gt; &gt; Add new SS_EXPEDITE flag to be used when sending SIGKILL via<br>
&gt; &gt; pidfd_send_signal() syscall to allow expedited memory reclaim of the<br>
&gt; &gt; victim process. The usage of this flag is currently limited to SIGKILL<br>
&gt; &gt; signal and only to privileged users.<br>
&gt; <br>
&gt; What is the downside of doing expedited memory reclaim?  ie why not do it<br>
&gt; every time a process is going to die?<br>
<br>
Well, you are tearing down an address space which might be still in use<br>
because the task not fully dead yeat. So there are two downsides AFAICS.<br>
Core dumping which will not see the reaped memory so the resulting<br>
coredump might be incomplete. And unexpected #PF/gup on the reaped<br>
memory will result in SIGBUS. These are things that we have closed our<br>
eyes in the oom context because they likely do not matter. If we want to<br>
use the same technique for other usecases then we have to think how much<br>
that matter again.</blockquote><div dir="auto"><br></div></div><div><div dir="auto">After some internal discussions we realized that there is one additional downside of doing reaping unconditionally. If this async reaping is done on a more efficient and energy hungrier CPU irrespective of urgency of the kill it should end up costing more power overall (I’m referring here to assimetric architectures like ARM big.LITTLE). Obviously quantifying that cost is not easy as it depends on the usecase and a particular system but it won’t be zero. So I think we will need some gating condition after all. </div></div><div><div><div class="gmail_quote"><div dir="auto"><br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><br>
<br>
-- <br>
Michal Hocko<br>
SUSE Labs<br>
<br>
-- <br>
You received this message because you are subscribed to the Google Groups &quot;kernel-team&quot; group.<br>
To unsubscribe from this group and stop receiving emails from it, send an email to <a href="mailto:kernel-team%2Bunsubscribe@android.com" target="_blank">kernel-team+unsubscribe@android.com</a>.<br>
<br>
</blockquote></div></div>
</div>
Suren Baghdasaryan April 12, 2019, 2:10 p.m. UTC | #18
On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:
> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > victim process. The usage of this flag is currently limited to SIGKILL
> > > signal and only to privileged users.
> >
> > What is the downside of doing expedited memory reclaim?  ie why not do it
> > every time a process is going to die?
>
> Well, you are tearing down an address space which might be still in use
> because the task not fully dead yeat. So there are two downsides AFAICS.
> Core dumping which will not see the reaped memory so the resulting
> coredump might be incomplete. And unexpected #PF/gup on the reaped
> memory will result in SIGBUS. These are things that we have closed our
> eyes in the oom context because they likely do not matter. If we want to
> use the same technique for other usecases then we have to think how much
> that matter again.
>

Sorry, resending with corrected settings...
After some internal discussions we realized that there is one
additional downside of doing reaping unconditionally. If this async
reaping is done on a more efficient and energy hungrier CPU
irrespective of urgency of the kill it should end up costing more
power overall (I’m referring here to assimetric architectures like ARM
big.LITTLE). Obviously quantifying that cost is not easy as it depends
on the usecase and a particular system but it won’t be zero. So I
think we will need some gating condition after all.

> --
> Michal Hocko
> SUSE Labs
>
> --
> You received this message because you are subscribed to the Google Groups "kernel-team" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Daniel Colascione April 12, 2019, 2:14 p.m. UTC | #19
On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:
> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > victim process. The usage of this flag is currently limited to SIGKILL
> > > signal and only to privileged users.
> >
> > What is the downside of doing expedited memory reclaim?  ie why not do it
> > every time a process is going to die?
>
> Well, you are tearing down an address space which might be still in use
> because the task not fully dead yeat. So there are two downsides AFAICS.
> Core dumping which will not see the reaped memory so the resulting

Test for SIGNAL_GROUP_COREDUMP before doing any of this then. If you
try to start a core dump after reaping begins, too bad: you could have
raced with process death anyway.

> coredump might be incomplete. And unexpected #PF/gup on the reaped
> memory will result in SIGBUS.

It's a dying process. Why even bother returning from the fault
handler? Just treat that situation as a thread exit. There's no need
to make this observable to userspace at all.
Suren Baghdasaryan April 12, 2019, 2:15 p.m. UTC | #20
On Thu, Apr 11, 2019 at 11:49 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Thu 11-04-19 10:47:50, Daniel Colascione wrote:
> > On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote:
> > > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > > > >
> > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > > > > signal and only to privileged users.
> > > > > >
> > > > > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > > > > every time a process is going to die?
> > > > >
> > > > > I think with an implementation that does not use/abuse oom-reaper
> > > > > thread this could be done for any kill. As I mentioned oom-reaper is a
> > > > > limited resource which has access to memory reserves and should not be
> > > > > abused in the way I do in this reference implementation.
> > > > > While there might be downsides that I don't know of, I'm not sure it's
> > > > > required to hurry every kill's memory reclaim. I think there are cases
> > > > > when resource deallocation is critical, for example when we kill to
> > > > > relieve resource shortage and there are kills when reclaim speed is
> > > > > not essential. It would be great if we can identify urgent cases
> > > > > without userspace hints, so I'm open to suggestions that do not
> > > > > involve additional flags.
> > > >
> > > > I was imagining a PI-ish approach where we'd reap in case an RT
> > > > process was waiting on the death of some other process. I'd still
> > > > prefer the API I proposed in the other message because it gets the
> > > > kernel out of the business of deciding what the right signal is. I'm a
> > > > huge believer in "mechanism, not policy".
> > >
> > > It's not a question of the kernel deciding what the right signal is.
> > > The kernel knows whether a signal is fatal to a particular process or not.
> > > The question is whether the killing process should do the work of reaping
> > > the dying process's resources sometimes, always or never.  Currently,
> > > that is never (the process reaps its own resources); Suren is suggesting
> > > sometimes, and I'm asking "Why not always?"
> >
> > FWIW, Suren's initial proposal is that the oom_reaper kthread do the
> > reaping, not the process sending the kill. Are you suggesting that
> > sending SIGKILL should spend a while in signal delivery reaping pages
> > before returning? I thought about just doing it this way, but I didn't
> > like the idea: it'd slow down mass-killing programs like killall(1).
> > Programs expect sending SIGKILL to be a fast operation that returns
> > immediately.
>
> I was thinking about this as well. And SYNC_SIGKILL would workaround the
> current expectations of how quick the current implementation is. The
> harder part would what is the actual semantic. Does the kill wait until
> the target task is TASK_DEAD or is there an intermediate step that would
> we could call it end of the day and still have a reasonable semantic
> (e.g. the original pid is really not alive anymore).

I think Daniel's proposal was trying to address that. With an input of
how many pages user wants to reclaim asynchronously and return value
of how much was actually reclaimed it contains the condition when to
stop and the reply how successful we could accomplish that. Since it
returns the number of pages reclaimed I assume the call does not
return until it reaped enough pages.

> --
> Michal Hocko
> SUSE Labs
Daniel Colascione April 12, 2019, 2:20 p.m. UTC | #21
On Fri, Apr 12, 2019 at 7:15 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Apr 11, 2019 at 11:49 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Thu 11-04-19 10:47:50, Daniel Colascione wrote:
> > > On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote:
> > > > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > > > > >
> > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > > > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > > > > > signal and only to privileged users.
> > > > > > >
> > > > > > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > > > > > every time a process is going to die?
> > > > > >
> > > > > > I think with an implementation that does not use/abuse oom-reaper
> > > > > > thread this could be done for any kill. As I mentioned oom-reaper is a
> > > > > > limited resource which has access to memory reserves and should not be
> > > > > > abused in the way I do in this reference implementation.
> > > > > > While there might be downsides that I don't know of, I'm not sure it's
> > > > > > required to hurry every kill's memory reclaim. I think there are cases
> > > > > > when resource deallocation is critical, for example when we kill to
> > > > > > relieve resource shortage and there are kills when reclaim speed is
> > > > > > not essential. It would be great if we can identify urgent cases
> > > > > > without userspace hints, so I'm open to suggestions that do not
> > > > > > involve additional flags.
> > > > >
> > > > > I was imagining a PI-ish approach where we'd reap in case an RT
> > > > > process was waiting on the death of some other process. I'd still
> > > > > prefer the API I proposed in the other message because it gets the
> > > > > kernel out of the business of deciding what the right signal is. I'm a
> > > > > huge believer in "mechanism, not policy".
> > > >
> > > > It's not a question of the kernel deciding what the right signal is.
> > > > The kernel knows whether a signal is fatal to a particular process or not.
> > > > The question is whether the killing process should do the work of reaping
> > > > the dying process's resources sometimes, always or never.  Currently,
> > > > that is never (the process reaps its own resources); Suren is suggesting
> > > > sometimes, and I'm asking "Why not always?"
> > >
> > > FWIW, Suren's initial proposal is that the oom_reaper kthread do the
> > > reaping, not the process sending the kill. Are you suggesting that
> > > sending SIGKILL should spend a while in signal delivery reaping pages
> > > before returning? I thought about just doing it this way, but I didn't
> > > like the idea: it'd slow down mass-killing programs like killall(1).
> > > Programs expect sending SIGKILL to be a fast operation that returns
> > > immediately.
> >
> > I was thinking about this as well. And SYNC_SIGKILL would workaround the

SYNC_SIGKILL (which, I presume, blocks in kill(2)) was proposed in
many occasions while we discussed pidfd waits over the past six months
or so. We've decided to just make pidfds pollable instead. The kernel
already has several ways to express the idea that a task should wait
for another task to die, and I don't think we need another. If you
want a process that's waiting for a task to exit to help reap that
task, great --- that's an option we've talked about --- but we don't
need new interface to do it, since the kernel already has all the
information it needs.

> > current expectations of how quick the current implementation is. The
> > harder part would what is the actual semantic. Does the kill wait until
> > the target task is TASK_DEAD or is there an intermediate step that would
> > we could call it end of the day and still have a reasonable semantic
> > (e.g. the original pid is really not alive anymore).
>
> I think Daniel's proposal was trying to address that. With an input of
> how many pages user wants to reclaim asynchronously and return value
> of how much was actually reclaimed it contains the condition when to
> stop and the reply how successful we could accomplish that. Since it
> returns the number of pages reclaimed I assume the call does not
> return until it reaped enough pages.

Right. I want to punt as much "policy" as possible to userspace. Just
using a user thread to do the reaping not only solves the policy
problem (since it's userspace that controls priority, affinity,
retries, and so on), but also simplifies the implementation
kernel-side. I can imagine situations where, depending on device
energy state or even charger or screen state we might want to reap
more or less aggressively, or not at all. I wouldn't want to burden
the kernel with having to get that right when userspace could make the
decision.
Daniel Colascione April 12, 2019, 3:30 p.m. UTC | #22
On Fri, Apr 12, 2019 at 7:14 AM Daniel Colascione <dancol@google.com> wrote:
>
> On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:
> > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > signal and only to privileged users.
> > >
> > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > every time a process is going to die?
> >
> > Well, you are tearing down an address space which might be still in use
> > because the task not fully dead yeat. So there are two downsides AFAICS.
> > Core dumping which will not see the reaped memory so the resulting
>
> Test for SIGNAL_GROUP_COREDUMP before doing any of this then. If you
> try to start a core dump after reaping begins, too bad: you could have
> raced with process death anyway.
>
> > coredump might be incomplete. And unexpected #PF/gup on the reaped
> > memory will result in SIGBUS.
>
> It's a dying process. Why even bother returning from the fault
> handler? Just treat that situation as a thread exit. There's no need
> to make this observable to userspace at all.

Just for clarity, checking the code, I think we already do this.
zap_other_threads sets SIGKILL pending on every thread in the group,
and we'll handle SIGKILL in the process of taking any page fault or
doing any system call, so I don't think it's actually possible for a
thread in a dying process to observe the SIGBUS that reaping in theory
can generate.
Matthew Wilcox April 12, 2019, 9:03 p.m. UTC | #23
On Thu, Apr 11, 2019 at 10:47:50AM -0700, Daniel Colascione wrote:
> On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote:
> > It's not a question of the kernel deciding what the right signal is.
> > The kernel knows whether a signal is fatal to a particular process or not.
> > The question is whether the killing process should do the work of reaping
> > the dying process's resources sometimes, always or never.  Currently,
> > that is never (the process reaps its own resources); Suren is suggesting
> > sometimes, and I'm asking "Why not always?"
> 
> FWIW, Suren's initial proposal is that the oom_reaper kthread do the
> reaping, not the process sending the kill. Are you suggesting that
> sending SIGKILL should spend a while in signal delivery reaping pages
> before returning? I thought about just doing it this way, but I didn't
> like the idea: it'd slow down mass-killing programs like killall(1).
> Programs expect sending SIGKILL to be a fast operation that returns
> immediately.

Suren said that the implementation in this proposal wasn't to be taken
literally.

You've raised some good points here though.  Do mass-killing programs
like kill with a pgid or killall expect the signal-sending process to
be fast, or do they not really care?

From the kernel's point of view, the same work has to be done, no matter
whether the killer or the victim does it.  Should the work be accounted
to the killer or the victim?  It probably doesn't matter.  The victim
doing the work allows for parallelisation, but I'm not convinced that's
important either.

I see another advantage for the killer doing the work -- if the task
is currently blocking on I/O (eg to an NFS mount that has gone away),
the killer can get rid of the task's page tables.  If we have to wait
for the I/O to complete before the victim reaps its own page tables,
we may be waiting forever.
Suren Baghdasaryan April 25, 2019, 4:09 p.m. UTC | #24
On Fri, Apr 12, 2019 at 7:14 AM Daniel Colascione <dancol@google.com> wrote:
>
> On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:
> > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > > > victim process. The usage of this flag is currently limited to SIGKILL
> > > > signal and only to privileged users.
> > >
> > > What is the downside of doing expedited memory reclaim?  ie why not do it
> > > every time a process is going to die?
> >
> > Well, you are tearing down an address space which might be still in use
> > because the task not fully dead yeat. So there are two downsides AFAICS.
> > Core dumping which will not see the reaped memory so the resulting
>
> Test for SIGNAL_GROUP_COREDUMP before doing any of this then. If you
> try to start a core dump after reaping begins, too bad: you could have
> raced with process death anyway.
>
> > coredump might be incomplete. And unexpected #PF/gup on the reaped
> > memory will result in SIGBUS.
>
> It's a dying process. Why even bother returning from the fault
> handler? Just treat that situation as a thread exit. There's no need
> to make this observable to userspace at all.

I've spent some more time to investigate possible effects of reaping
on coredumps and asked Oleg Nesterov who worked on patchsets that
prioritize SIGKILLs over coredump activity
(https://lkml.org/lkml/2013/2/17/118). Current do_coredump
implementation seems to handle the case of SIGKILL interruption by
bailing out whenever dump_interrupted() returns true and that would be
the case with pending SIGKILL. So in the case of race when coredump
happens first and SIGKILL comes next interrupting the coredump seems
to result in no change in behavior and reaping memory proactively
seems to have no side effects.
An opposite race when SIGKILL gets posted and then coredump happens
seems impossible because do_coredump won't be called from get_signal
due to signal_group_exit check (get_signal checks signal_group_exit
while holding sighand->siglock and complete_signal sets
SIGNAL_GROUP_EXIT while holding the same lock). There is a path from
__seccomp_filter calling do_coredump while processing
SECCOMP_RET_KILL_xxx but even then it should bail out when
coredump_wait()->zap_threads(current) checks signal_group_exit().
(Thanks Oleg for clarifying this for me).

If we are really concerned about possible increase in failed coredumps
because of the proactive reaping I could make it conditional on
whether coredumping mm is possible by placing this feature behind
!get_dumpable(mm) condition. Another possibility is to check
RLIMIT_CORE to decide if coredumps are possible (although if pipe is
used for coredump that limit seems to be ignored, so that check would
have to take this into consideration).

On the issue of SIGBUS happening when accessed memory was already
reaped, my understanding that SIGBUS being a synchronous signal will
still have to be fetched using dequeue_synchronous_signal from
get_signal but not before signal_group_exit is checked. So again if
SIGKILL is pending I think SIGBUS will be ignored (please correct me
if that's not correct).

One additional question I would like to clarify is whether per-node
reapers like Roman suggested would make a big difference (All CPUs
that I've seen used for Android are single-node ones, so looking for
more feedback here). If it's important then reaping victim's memory by
the killer is probably not an option.
diff mbox series

Patch

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e412c092c1e8..8a227633a058 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -327,7 +327,8 @@  extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern void force_sigsegv(int sig, struct task_struct *p);
 extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
-extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
+extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
+				bool expedite);
 extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
 				const struct cred *);
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 9702016734b1..34b7852aa4a0 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -446,8 +446,17 @@  int __save_altstack(stack_t __user *, unsigned long);
 } while (0);
 
 #ifdef CONFIG_PROC_FS
+
+/*
+ * SS_FLAGS values used in pidfd_send_signal:
+ *
+ * SS_EXPEDITE indicates desire to expedite the operation.
+ */
+#define SS_EXPEDITE	0x00000001
+
 struct seq_file;
 extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
-#endif
+
+#endif /* CONFIG_PROC_FS */
 
 #endif /* _LINUX_SIGNAL_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index aea30530c472..27c66296e08e 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -720,7 +720,7 @@  static void __do_notify(struct mqueue_inode_info *info)
 			rcu_read_unlock();
 
 			kill_pid_info(info->notify.sigev_signo,
-				      &sig_i, info->notify_owner);
+				      &sig_i, info->notify_owner, false);
 			break;
 		case SIGEV_THREAD:
 			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
diff --git a/kernel/signal.c b/kernel/signal.c
index f98448cf2def..02ed4332d17c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -43,6 +43,7 @@ 
 #include <linux/compiler.h>
 #include <linux/posix-timers.h>
 #include <linux/livepatch.h>
+#include <linux/oom.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -1394,7 +1395,8 @@  int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
 	return success ? 0 : retval;
 }
 
-int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
+				  bool expedite)
 {
 	int error = -ESRCH;
 	struct task_struct *p;
@@ -1402,8 +1404,17 @@  int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
 	for (;;) {
 		rcu_read_lock();
 		p = pid_task(pid, PIDTYPE_PID);
-		if (p)
+		if (p) {
 			error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
+
+			/*
+			 * Ignore expedite_reclaim return value, it is best
+			 * effort only.
+			 */
+			if (!error && expedite)
+				expedite_reclaim(p);
+		}
+
 		rcu_read_unlock();
 		if (likely(!p || error != -ESRCH))
 			return error;
@@ -1420,7 +1431,7 @@  static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
 {
 	int error;
 	rcu_read_lock();
-	error = kill_pid_info(sig, info, find_vpid(pid));
+	error = kill_pid_info(sig, info, find_vpid(pid), false);
 	rcu_read_unlock();
 	return error;
 }
@@ -1487,7 +1498,7 @@  static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
 
 	if (pid > 0) {
 		rcu_read_lock();
-		ret = kill_pid_info(sig, info, find_vpid(pid));
+		ret = kill_pid_info(sig, info, find_vpid(pid), false);
 		rcu_read_unlock();
 		return ret;
 	}
@@ -1704,7 +1715,7 @@  EXPORT_SYMBOL(kill_pgrp);
 
 int kill_pid(struct pid *pid, int sig, int priv)
 {
-	return kill_pid_info(sig, __si_special(priv), pid);
+	return kill_pid_info(sig, __si_special(priv), pid, false);
 }
 EXPORT_SYMBOL(kill_pid);
 
@@ -3577,10 +3588,20 @@  SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	struct pid *pid;
 	kernel_siginfo_t kinfo;
 
-	/* Enforce flags be set to 0 until we add an extension. */
-	if (flags)
+	/* Enforce no unknown flags. */
+	if (flags & ~SS_EXPEDITE)
 		return -EINVAL;
 
+	if (flags & SS_EXPEDITE) {
+		/* Enforce SS_EXPEDITE to be used with SIGKILL only. */
+		if (sig != SIGKILL)
+			return -EINVAL;
+
+		/* Limit expedited killing to privileged users only. */
+		if (!capable(CAP_SYS_NICE))
+			return -EPERM;
+	}
+
 	f = fdget_raw(pidfd);
 	if (!f.file)
 		return -EBADF;
@@ -3614,7 +3635,7 @@  SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		prepare_kill_siginfo(sig, &kinfo);
 	}
 
-	ret = kill_pid_info(sig, &kinfo, pid);
+	ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
 
 err:
 	fdput(f);
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 02068b2d5862..c926483cdb53 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -140,7 +140,7 @@  enum hrtimer_restart it_real_fn(struct hrtimer *timer)
 	struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
 
 	trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
-	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
+	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
 
 	return HRTIMER_NORESTART;
 }