Message ID | 20190411014353.113252-3-surenb@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | opportunistic memory reclaim of a killed process | expand |
On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > pidfd_send_signal() syscall to allow expedited memory reclaim of the > victim process. The usage of this flag is currently limited to SIGKILL > signal and only to privileged users. > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > --- > include/linux/sched/signal.h | 3 ++- > include/linux/signal.h | 11 ++++++++++- > ipc/mqueue.c | 2 +- > kernel/signal.c | 37 ++++++++++++++++++++++++++++-------- > kernel/time/itimer.c | 2 +- > 5 files changed, 43 insertions(+), 12 deletions(-) > > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h > index e412c092c1e8..8a227633a058 100644 > --- a/include/linux/sched/signal.h > +++ b/include/linux/sched/signal.h > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); > extern void force_sigsegv(int sig, struct task_struct *p); > extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *); > extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > + bool expedite); > extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *, > const struct cred *); > extern int kill_pgrp(struct pid *pid, int sig, int priv); > diff --git a/include/linux/signal.h b/include/linux/signal.h > index 9702016734b1..34b7852aa4a0 100644 > --- a/include/linux/signal.h > +++ b/include/linux/signal.h > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long); > } while (0); > > #ifdef CONFIG_PROC_FS > + > +/* > + * SS_FLAGS values used in pidfd_send_signal: > + * > + * SS_EXPEDITE indicates desire to expedite the operation. > + */ > +#define SS_EXPEDITE 0x00000001 Does this make sense as an SS_* flag? How does this relate to the signal stack? Is there any intention to ever use this flag with stack_t? New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be PIDFD_SIGNAL_THREAD.) And since this is exposed to userspace in contrast to the mm internal naming it should be something more easily understandable like PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something. > + > struct seq_file; > extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); > -#endif > + > +#endif /* CONFIG_PROC_FS */ > > #endif /* _LINUX_SIGNAL_H */ > diff --git a/ipc/mqueue.c b/ipc/mqueue.c > index aea30530c472..27c66296e08e 100644 > --- a/ipc/mqueue.c > +++ b/ipc/mqueue.c > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info) > rcu_read_unlock(); > > kill_pid_info(info->notify.sigev_signo, > - &sig_i, info->notify_owner); > + &sig_i, info->notify_owner, false); > break; > case SIGEV_THREAD: > set_cookie(info->notify_cookie, NOTIFY_WOKENUP); > diff --git a/kernel/signal.c b/kernel/signal.c > index f98448cf2def..02ed4332d17c 100644 > --- a/kernel/signal.c > +++ b/kernel/signal.c > @@ -43,6 +43,7 @@ > #include <linux/compiler.h> > #include <linux/posix-timers.h> > #include <linux/livepatch.h> > +#include <linux/oom.h> > > #define CREATE_TRACE_POINTS > #include <trace/events/signal.h> > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) > return success ? 0 : retval; > } > > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > + bool expedite) > { > int error = -ESRCH; > struct task_struct *p; > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > for (;;) { > rcu_read_lock(); > p = pid_task(pid, PIDTYPE_PID); > - if (p) > + if (p) { > error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); > + > + /* > + * Ignore expedite_reclaim return value, it is best > + * effort only. > + */ > + if (!error && expedite) > + expedite_reclaim(p); SIGKILL will take the whole thread group down so the reclaim should make sense here. > + } > + > rcu_read_unlock(); > if (likely(!p || error != -ESRCH)) > return error; > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) > { > int error; > rcu_read_lock(); > - error = kill_pid_info(sig, info, find_vpid(pid)); > + error = kill_pid_info(sig, info, find_vpid(pid), false); > rcu_read_unlock(); > return error; > } > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) > > if (pid > 0) { > rcu_read_lock(); > - ret = kill_pid_info(sig, info, find_vpid(pid)); > + ret = kill_pid_info(sig, info, find_vpid(pid), false); > rcu_read_unlock(); > return ret; > } > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp); > > int kill_pid(struct pid *pid, int sig, int priv) > { > - return kill_pid_info(sig, __si_special(priv), pid); > + return kill_pid_info(sig, __si_special(priv), pid, false); > } > EXPORT_SYMBOL(kill_pid); > > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > struct pid *pid; > kernel_siginfo_t kinfo; > > - /* Enforce flags be set to 0 until we add an extension. */ > - if (flags) > + /* Enforce no unknown flags. */ > + if (flags & ~SS_EXPEDITE) > return -EINVAL; > > + if (flags & SS_EXPEDITE) { > + /* Enforce SS_EXPEDITE to be used with SIGKILL only. */ > + if (sig != SIGKILL) > + return -EINVAL; Not super fond of this being a SIGKILL specific flag but I get why. > + > + /* Limit expedited killing to privileged users only. */ > + if (!capable(CAP_SYS_NICE)) > + return -EPERM; Do you have a specific (DOS or other) attack vector in mind that renders ns_capable unsuitable? > + } > + > f = fdget_raw(pidfd); > if (!f.file) > return -EBADF; > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > prepare_kill_siginfo(sig, &kinfo); > } > > - ret = kill_pid_info(sig, &kinfo, pid); > + ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0); > > err: > fdput(f); > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c > index 02068b2d5862..c926483cdb53 100644 > --- a/kernel/time/itimer.c > +++ b/kernel/time/itimer.c > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) > struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; > > trace_itimer_expire(ITIMER_REAL, leader_pid, 0); > - kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); > + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false); > > return HRTIMER_NORESTART; > } > -- > 2.21.0.392.gf8f6787159e-goog >
Cc: Oleg too On Thu, Apr 11, 2019 at 12:30:18PM +0200, Christian Brauner wrote: > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > victim process. The usage of this flag is currently limited to SIGKILL > > signal and only to privileged users. > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > --- > > include/linux/sched/signal.h | 3 ++- > > include/linux/signal.h | 11 ++++++++++- > > ipc/mqueue.c | 2 +- > > kernel/signal.c | 37 ++++++++++++++++++++++++++++-------- > > kernel/time/itimer.c | 2 +- > > 5 files changed, 43 insertions(+), 12 deletions(-) > > > > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h > > index e412c092c1e8..8a227633a058 100644 > > --- a/include/linux/sched/signal.h > > +++ b/include/linux/sched/signal.h > > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); > > extern void force_sigsegv(int sig, struct task_struct *p); > > extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *); > > extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); > > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); > > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > > + bool expedite); > > extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *, > > const struct cred *); > > extern int kill_pgrp(struct pid *pid, int sig, int priv); > > diff --git a/include/linux/signal.h b/include/linux/signal.h > > index 9702016734b1..34b7852aa4a0 100644 > > --- a/include/linux/signal.h > > +++ b/include/linux/signal.h > > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long); > > } while (0); > > > > #ifdef CONFIG_PROC_FS > > + > > +/* > > + * SS_FLAGS values used in pidfd_send_signal: > > + * > > + * SS_EXPEDITE indicates desire to expedite the operation. > > + */ > > +#define SS_EXPEDITE 0x00000001 > > Does this make sense as an SS_* flag? > How does this relate to the signal stack? > Is there any intention to ever use this flag with stack_t? > > New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be > PIDFD_SIGNAL_THREAD.) > And since this is exposed to userspace in contrast to the mm internal > naming it should be something more easily understandable like > PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something. > > > + > > struct seq_file; > > extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); > > -#endif > > + > > +#endif /* CONFIG_PROC_FS */ > > > > #endif /* _LINUX_SIGNAL_H */ > > diff --git a/ipc/mqueue.c b/ipc/mqueue.c > > index aea30530c472..27c66296e08e 100644 > > --- a/ipc/mqueue.c > > +++ b/ipc/mqueue.c > > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info) > > rcu_read_unlock(); > > > > kill_pid_info(info->notify.sigev_signo, > > - &sig_i, info->notify_owner); > > + &sig_i, info->notify_owner, false); > > break; > > case SIGEV_THREAD: > > set_cookie(info->notify_cookie, NOTIFY_WOKENUP); > > diff --git a/kernel/signal.c b/kernel/signal.c > > index f98448cf2def..02ed4332d17c 100644 > > --- a/kernel/signal.c > > +++ b/kernel/signal.c > > @@ -43,6 +43,7 @@ > > #include <linux/compiler.h> > > #include <linux/posix-timers.h> > > #include <linux/livepatch.h> > > +#include <linux/oom.h> > > > > #define CREATE_TRACE_POINTS > > #include <trace/events/signal.h> > > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) > > return success ? 0 : retval; > > } > > > > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > > + bool expedite) > > { > > int error = -ESRCH; > > struct task_struct *p; > > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > > for (;;) { > > rcu_read_lock(); > > p = pid_task(pid, PIDTYPE_PID); > > - if (p) > > + if (p) { > > error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); > > + > > + /* > > + * Ignore expedite_reclaim return value, it is best > > + * effort only. > > + */ > > + if (!error && expedite) > > + expedite_reclaim(p); > > SIGKILL will take the whole thread group down so the reclaim should make > sense here. > > > + } > > + > > rcu_read_unlock(); > > if (likely(!p || error != -ESRCH)) > > return error; > > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) > > { > > int error; > > rcu_read_lock(); > > - error = kill_pid_info(sig, info, find_vpid(pid)); > > + error = kill_pid_info(sig, info, find_vpid(pid), false); > > rcu_read_unlock(); > > return error; > > } > > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) > > > > if (pid > 0) { > > rcu_read_lock(); > > - ret = kill_pid_info(sig, info, find_vpid(pid)); > > + ret = kill_pid_info(sig, info, find_vpid(pid), false); > > rcu_read_unlock(); > > return ret; > > } > > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp); > > > > int kill_pid(struct pid *pid, int sig, int priv) > > { > > - return kill_pid_info(sig, __si_special(priv), pid); > > + return kill_pid_info(sig, __si_special(priv), pid, false); > > } > > EXPORT_SYMBOL(kill_pid); > > > > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > > struct pid *pid; > > kernel_siginfo_t kinfo; > > > > - /* Enforce flags be set to 0 until we add an extension. */ > > - if (flags) > > + /* Enforce no unknown flags. */ > > + if (flags & ~SS_EXPEDITE) > > return -EINVAL; > > > > + if (flags & SS_EXPEDITE) { > > + /* Enforce SS_EXPEDITE to be used with SIGKILL only. */ > > + if (sig != SIGKILL) > > + return -EINVAL; > > Not super fond of this being a SIGKILL specific flag but I get why. > > > + > > + /* Limit expedited killing to privileged users only. */ > > + if (!capable(CAP_SYS_NICE)) > > + return -EPERM; > > Do you have a specific (DOS or other) attack vector in mind that renders > ns_capable unsuitable? > > > + } > > + > > f = fdget_raw(pidfd); > > if (!f.file) > > return -EBADF; > > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > > prepare_kill_siginfo(sig, &kinfo); > > } > > > > - ret = kill_pid_info(sig, &kinfo, pid); > > + ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0); > > > > err: > > fdput(f); > > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c > > index 02068b2d5862..c926483cdb53 100644 > > --- a/kernel/time/itimer.c > > +++ b/kernel/time/itimer.c > > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) > > struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; > > > > trace_itimer_expire(ITIMER_REAL, leader_pid, 0); > > - kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); > > + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false); > > > > return HRTIMER_NORESTART; > > } > > -- > > 2.21.0.392.gf8f6787159e-goog > >
Thanks for the feedback! Just to be clear, this implementation is used in this RFC as a reference to explain the intent. To be honest I don't think it will be adopted as is even if the idea survives scrutiny. On Thu, Apr 11, 2019 at 3:30 AM Christian Brauner <christian@brauner.io> wrote: > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > victim process. The usage of this flag is currently limited to SIGKILL > > signal and only to privileged users. > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > --- > > include/linux/sched/signal.h | 3 ++- > > include/linux/signal.h | 11 ++++++++++- > > ipc/mqueue.c | 2 +- > > kernel/signal.c | 37 ++++++++++++++++++++++++++++-------- > > kernel/time/itimer.c | 2 +- > > 5 files changed, 43 insertions(+), 12 deletions(-) > > > > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h > > index e412c092c1e8..8a227633a058 100644 > > --- a/include/linux/sched/signal.h > > +++ b/include/linux/sched/signal.h > > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); > > extern void force_sigsegv(int sig, struct task_struct *p); > > extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *); > > extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); > > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); > > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > > + bool expedite); > > extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *, > > const struct cred *); > > extern int kill_pgrp(struct pid *pid, int sig, int priv); > > diff --git a/include/linux/signal.h b/include/linux/signal.h > > index 9702016734b1..34b7852aa4a0 100644 > > --- a/include/linux/signal.h > > +++ b/include/linux/signal.h > > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long); > > } while (0); > > > > #ifdef CONFIG_PROC_FS > > + > > +/* > > + * SS_FLAGS values used in pidfd_send_signal: > > + * > > + * SS_EXPEDITE indicates desire to expedite the operation. > > + */ > > +#define SS_EXPEDITE 0x00000001 > > Does this make sense as an SS_* flag? > How does this relate to the signal stack? It doesn't, so I agree that the name should be changed. PIDFD_SIGNAL_EXPEDITE_MM_RECLAIM would seem appropriate. > Is there any intention to ever use this flag with stack_t? > > New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be > PIDFD_SIGNAL_THREAD.) > And since this is exposed to userspace in contrast to the mm internal > naming it should be something more easily understandable like > PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something. > > > + > > struct seq_file; > > extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); > > -#endif > > + > > +#endif /* CONFIG_PROC_FS */ > > > > #endif /* _LINUX_SIGNAL_H */ > > diff --git a/ipc/mqueue.c b/ipc/mqueue.c > > index aea30530c472..27c66296e08e 100644 > > --- a/ipc/mqueue.c > > +++ b/ipc/mqueue.c > > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info) > > rcu_read_unlock(); > > > > kill_pid_info(info->notify.sigev_signo, > > - &sig_i, info->notify_owner); > > + &sig_i, info->notify_owner, false); > > break; > > case SIGEV_THREAD: > > set_cookie(info->notify_cookie, NOTIFY_WOKENUP); > > diff --git a/kernel/signal.c b/kernel/signal.c > > index f98448cf2def..02ed4332d17c 100644 > > --- a/kernel/signal.c > > +++ b/kernel/signal.c > > @@ -43,6 +43,7 @@ > > #include <linux/compiler.h> > > #include <linux/posix-timers.h> > > #include <linux/livepatch.h> > > +#include <linux/oom.h> > > > > #define CREATE_TRACE_POINTS > > #include <trace/events/signal.h> > > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) > > return success ? 0 : retval; > > } > > > > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > > + bool expedite) > > { > > int error = -ESRCH; > > struct task_struct *p; > > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > > for (;;) { > > rcu_read_lock(); > > p = pid_task(pid, PIDTYPE_PID); > > - if (p) > > + if (p) { > > error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); > > + > > + /* > > + * Ignore expedite_reclaim return value, it is best > > + * effort only. > > + */ > > + if (!error && expedite) > > + expedite_reclaim(p); > > SIGKILL will take the whole thread group down so the reclaim should make > sense here. > This sounds like confirmation. I hope I'm not missing some flaw that you are trying to point out. > > + } > > + > > rcu_read_unlock(); > > if (likely(!p || error != -ESRCH)) > > return error; > > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) > > { > > int error; > > rcu_read_lock(); > > - error = kill_pid_info(sig, info, find_vpid(pid)); > > + error = kill_pid_info(sig, info, find_vpid(pid), false); > > rcu_read_unlock(); > > return error; > > } > > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) > > > > if (pid > 0) { > > rcu_read_lock(); > > - ret = kill_pid_info(sig, info, find_vpid(pid)); > > + ret = kill_pid_info(sig, info, find_vpid(pid), false); > > rcu_read_unlock(); > > return ret; > > } > > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp); > > > > int kill_pid(struct pid *pid, int sig, int priv) > > { > > - return kill_pid_info(sig, __si_special(priv), pid); > > + return kill_pid_info(sig, __si_special(priv), pid, false); > > } > > EXPORT_SYMBOL(kill_pid); > > > > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > > struct pid *pid; > > kernel_siginfo_t kinfo; > > > > - /* Enforce flags be set to 0 until we add an extension. */ > > - if (flags) > > + /* Enforce no unknown flags. */ > > + if (flags & ~SS_EXPEDITE) > > return -EINVAL; > > > > + if (flags & SS_EXPEDITE) { > > + /* Enforce SS_EXPEDITE to be used with SIGKILL only. */ > > + if (sig != SIGKILL) > > + return -EINVAL; > > Not super fond of this being a SIGKILL specific flag but I get why. Understood. I was thinking that EXPEDITE flag might make sense for other signals in the future but from internal feedback sounds like if we go this way the flag name should be more specific. > > + > > + /* Limit expedited killing to privileged users only. */ > > + if (!capable(CAP_SYS_NICE)) > > + return -EPERM; > > Do you have a specific (DOS or other) attack vector in mind that renders > ns_capable unsuitable? > > > + } > > + > > f = fdget_raw(pidfd); > > if (!f.file) > > return -EBADF; > > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > > prepare_kill_siginfo(sig, &kinfo); > > } > > > > - ret = kill_pid_info(sig, &kinfo, pid); > > + ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0); > > > > err: > > fdput(f); > > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c > > index 02068b2d5862..c926483cdb53 100644 > > --- a/kernel/time/itimer.c > > +++ b/kernel/time/itimer.c > > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) > > struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; > > > > trace_itimer_expire(ITIMER_REAL, leader_pid, 0); > > - kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); > > + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false); > > > > return HRTIMER_NORESTART; > > } > > -- > > 2.21.0.392.gf8f6787159e-goog > >
On Thu, Apr 11, 2019 at 8:18 AM Suren Baghdasaryan <surenb@google.com> wrote: > > Thanks for the feedback! > Just to be clear, this implementation is used in this RFC as a > reference to explain the intent. To be honest I don't think it will be > adopted as is even if the idea survives scrutiny. > > On Thu, Apr 11, 2019 at 3:30 AM Christian Brauner <christian@brauner.io> wrote: > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > victim process. The usage of this flag is currently limited to SIGKILL > > > signal and only to privileged users. > > > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > > --- > > > include/linux/sched/signal.h | 3 ++- > > > include/linux/signal.h | 11 ++++++++++- > > > ipc/mqueue.c | 2 +- > > > kernel/signal.c | 37 ++++++++++++++++++++++++++++-------- > > > kernel/time/itimer.c | 2 +- > > > 5 files changed, 43 insertions(+), 12 deletions(-) > > > > > > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h > > > index e412c092c1e8..8a227633a058 100644 > > > --- a/include/linux/sched/signal.h > > > +++ b/include/linux/sched/signal.h > > > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); > > > extern void force_sigsegv(int sig, struct task_struct *p); > > > extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *); > > > extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); > > > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); > > > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > > > + bool expedite); > > > extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *, > > > const struct cred *); > > > extern int kill_pgrp(struct pid *pid, int sig, int priv); > > > diff --git a/include/linux/signal.h b/include/linux/signal.h > > > index 9702016734b1..34b7852aa4a0 100644 > > > --- a/include/linux/signal.h > > > +++ b/include/linux/signal.h > > > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long); > > > } while (0); > > > > > > #ifdef CONFIG_PROC_FS > > > + > > > +/* > > > + * SS_FLAGS values used in pidfd_send_signal: > > > + * > > > + * SS_EXPEDITE indicates desire to expedite the operation. > > > + */ > > > +#define SS_EXPEDITE 0x00000001 > > > > Does this make sense as an SS_* flag? > > How does this relate to the signal stack? > > It doesn't, so I agree that the name should be changed. > PIDFD_SIGNAL_EXPEDITE_MM_RECLAIM would seem appropriate. > > > Is there any intention to ever use this flag with stack_t? > > > > New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be > > PIDFD_SIGNAL_THREAD.) > > And since this is exposed to userspace in contrast to the mm internal > > naming it should be something more easily understandable like > > PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something. > > > > > + > > > struct seq_file; > > > extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); > > > -#endif > > > + > > > +#endif /* CONFIG_PROC_FS */ > > > > > > #endif /* _LINUX_SIGNAL_H */ > > > diff --git a/ipc/mqueue.c b/ipc/mqueue.c > > > index aea30530c472..27c66296e08e 100644 > > > --- a/ipc/mqueue.c > > > +++ b/ipc/mqueue.c > > > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info) > > > rcu_read_unlock(); > > > > > > kill_pid_info(info->notify.sigev_signo, > > > - &sig_i, info->notify_owner); > > > + &sig_i, info->notify_owner, false); > > > break; > > > case SIGEV_THREAD: > > > set_cookie(info->notify_cookie, NOTIFY_WOKENUP); > > > diff --git a/kernel/signal.c b/kernel/signal.c > > > index f98448cf2def..02ed4332d17c 100644 > > > --- a/kernel/signal.c > > > +++ b/kernel/signal.c > > > @@ -43,6 +43,7 @@ > > > #include <linux/compiler.h> > > > #include <linux/posix-timers.h> > > > #include <linux/livepatch.h> > > > +#include <linux/oom.h> > > > > > > #define CREATE_TRACE_POINTS > > > #include <trace/events/signal.h> > > > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) > > > return success ? 0 : retval; > > > } > > > > > > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > > > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, > > > + bool expedite) > > > { > > > int error = -ESRCH; > > > struct task_struct *p; > > > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) > > > for (;;) { > > > rcu_read_lock(); > > > p = pid_task(pid, PIDTYPE_PID); > > > - if (p) > > > + if (p) { > > > error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); > > > + > > > + /* > > > + * Ignore expedite_reclaim return value, it is best > > > + * effort only. > > > + */ > > > + if (!error && expedite) > > > + expedite_reclaim(p); > > > > SIGKILL will take the whole thread group down so the reclaim should make > > sense here. > > > > This sounds like confirmation. I hope I'm not missing some flaw that > you are trying to point out. > > > > + } > > > + > > > rcu_read_unlock(); > > > if (likely(!p || error != -ESRCH)) > > > return error; > > > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) > > > { > > > int error; > > > rcu_read_lock(); > > > - error = kill_pid_info(sig, info, find_vpid(pid)); > > > + error = kill_pid_info(sig, info, find_vpid(pid), false); > > > rcu_read_unlock(); > > > return error; > > > } > > > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) > > > > > > if (pid > 0) { > > > rcu_read_lock(); > > > - ret = kill_pid_info(sig, info, find_vpid(pid)); > > > + ret = kill_pid_info(sig, info, find_vpid(pid), false); > > > rcu_read_unlock(); > > > return ret; > > > } > > > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp); > > > > > > int kill_pid(struct pid *pid, int sig, int priv) > > > { > > > - return kill_pid_info(sig, __si_special(priv), pid); > > > + return kill_pid_info(sig, __si_special(priv), pid, false); > > > } > > > EXPORT_SYMBOL(kill_pid); > > > > > > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > > > struct pid *pid; > > > kernel_siginfo_t kinfo; > > > > > > - /* Enforce flags be set to 0 until we add an extension. */ > > > - if (flags) > > > + /* Enforce no unknown flags. */ > > > + if (flags & ~SS_EXPEDITE) > > > return -EINVAL; > > > > > > + if (flags & SS_EXPEDITE) { > > > + /* Enforce SS_EXPEDITE to be used with SIGKILL only. */ > > > + if (sig != SIGKILL) > > > + return -EINVAL; > > > > Not super fond of this being a SIGKILL specific flag but I get why. > > Understood. I was thinking that EXPEDITE flag might make sense for > other signals in the future but from internal feedback sounds like if > we go this way the flag name should be more specific. > > > > + > > > + /* Limit expedited killing to privileged users only. */ > > > + if (!capable(CAP_SYS_NICE)) > > > + return -EPERM; > > > > Do you have a specific (DOS or other) attack vector in mind that renders > > ns_capable unsuitable? > > Missed this one. I was thinking of oom-reaper thread as a limited system resource (one thread which maintains a kill list and reaps process mms one at a time) and therefore should be protected from abuse. > > > + } > > > + > > > f = fdget_raw(pidfd); > > > if (!f.file) > > > return -EBADF; > > > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, > > > prepare_kill_siginfo(sig, &kinfo); > > > } > > > > > > - ret = kill_pid_info(sig, &kinfo, pid); > > > + ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0); > > > > > > err: > > > fdput(f); > > > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c > > > index 02068b2d5862..c926483cdb53 100644 > > > --- a/kernel/time/itimer.c > > > +++ b/kernel/time/itimer.c > > > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) > > > struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; > > > > > > trace_itimer_expire(ITIMER_REAL, leader_pid, 0); > > > - kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); > > > + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false); > > > > > > return HRTIMER_NORESTART; > > > } > > > -- > > > 2.21.0.392.gf8f6787159e-goog > > >
On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > pidfd_send_signal() syscall to allow expedited memory reclaim of the > victim process. The usage of this flag is currently limited to SIGKILL > signal and only to privileged users. What is the downside of doing expedited memory reclaim? ie why not do it every time a process is going to die?
On Thu, Apr 11, 2019 at 8:23 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > signal and only to privileged users. FWIW, I like Suren's general idea, but I was thinking of a different way of exposing the same general functionality to userspace. The way I look at it, it's very useful for an auto-balancing memory system like Android (or, I presume, something that uses oomd) to recover memory *immediately* after a SIGKILL instead of waiting for the process to kill itself: a process's death can be delayed for a long time due to factors like scheduling and being blocked in various uninterruptible kernel-side operations. Suren's proposal is basically about pulling forward in time page reclaimation that would happen anyway. What if we let userspace control exactly when this reclaimation happens? I'm imagining a new* kernel facility that basically looks like this. It lets lmkd determine for itself how much work the system should expend on reclaiming memory from dying processes. size_t try_reap_dying_process( int pidfd, int flags /* must be zero */, size_t maximum_bytes_to_reap); Precondition: process is pending group-exit (someone already sent it SIGKILL) Postcondition: some memory reclaimed from dying process Invariant: doesn't sleep; stops reaping after MAXIMUM_BYTES_TO_REAP -> success: return number of bytes reaped -> failure: (size_t)-1 EBUSY: couldn't get mmap_sem EINVAL: PIDFD isn't a pidfd or otherwise invalid arguments EPERM: process hasn't been send SIGKILL: try_reap_dying_process on a process that isn't dying is illegal Kernel-side, try_reap_dying_process would try-acquire mmap_sem and just fail if it couldn't get it. Once acquired, it would release "easy" pages (using the same check the oom reaper uses) until it either ran out of pages or hit the MAXIMUM_BYTES_TO_REAP cap. The purpose of MAXIMUM_BYTES_TO_REAP is to let userspace bound-above the amount of time we spend reclaiming pages. It'd be up to userspace to set policy on retries, the actual value of the reap cap, the priority at which we run TRY_REAP_DYING_PROCESS, and so on. We return the number of bytes we managed to free this way so that lmkd can make an informed decision about what to do next, e.g., kill something else or wait a little while. Personally, I like th approach a bit more that recruiting the oom reaper through because it doesn't affect any kind of emergency memory reserve permission and because it frees us from having to think about whether the oom reaper's thread priority is right for this particular job. It also occurred to me that try_reap_dying_process might make a decent shrinker callback. Shrinkers are there, AIUI, to reclaim memory that's easy to free and that's not essential for correct kernel operation. Usually, it's some kind of cache that meets these criteria. But the private pages of a dying process also meet the criteria, don't they? I'm imagining the shrinker just picking an arbitrary doomed (dying but not yet dead) process and freeing some of its pages. I know there are concerns about slow shrinkers causing havoc throughout the system, but since this shrinker would be bounded above on CPU time and would never block, I feel like it'd be pretty safe. * insert standard missive about system calls being cheap, but we can talk about the way in which we expose this functionality after we agree that it's a good idea generally
On Thu, Apr 11, 2019 at 08:33:13AM -0700, Matthew Wilcox wrote: > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > victim process. The usage of this flag is currently limited to SIGKILL > > signal and only to privileged users. > > What is the downside of doing expedited memory reclaim? ie why not do it > every time a process is going to die? I agree with this. The oom reaper mostly does work the exiting task would have to do anyway, so this shouldn't be measurably more expensive to do it per default. I wouldn't want to add a new interface unless we really do need that type of control.
On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > victim process. The usage of this flag is currently limited to SIGKILL > > signal and only to privileged users. > > What is the downside of doing expedited memory reclaim? ie why not do it > every time a process is going to die? I think with an implementation that does not use/abuse oom-reaper thread this could be done for any kill. As I mentioned oom-reaper is a limited resource which has access to memory reserves and should not be abused in the way I do in this reference implementation. While there might be downsides that I don't know of, I'm not sure it's required to hurry every kill's memory reclaim. I think there are cases when resource deallocation is critical, for example when we kill to relieve resource shortage and there are kills when reclaim speed is not essential. It would be great if we can identify urgent cases without userspace hints, so I'm open to suggestions that do not involve additional flags.
On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > victim process. The usage of this flag is currently limited to SIGKILL > > > signal and only to privileged users. > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > every time a process is going to die? > > I think with an implementation that does not use/abuse oom-reaper > thread this could be done for any kill. As I mentioned oom-reaper is a > limited resource which has access to memory reserves and should not be > abused in the way I do in this reference implementation. > While there might be downsides that I don't know of, I'm not sure it's > required to hurry every kill's memory reclaim. I think there are cases > when resource deallocation is critical, for example when we kill to > relieve resource shortage and there are kills when reclaim speed is > not essential. It would be great if we can identify urgent cases > without userspace hints, so I'm open to suggestions that do not > involve additional flags. I was imagining a PI-ish approach where we'd reap in case an RT process was waiting on the death of some other process. I'd still prefer the API I proposed in the other message because it gets the kernel out of the business of deciding what the right signal is. I'm a huge believer in "mechanism, not policy".
On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote: > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > signal and only to privileged users. > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > every time a process is going to die? > > > > I think with an implementation that does not use/abuse oom-reaper > > thread this could be done for any kill. As I mentioned oom-reaper is a > > limited resource which has access to memory reserves and should not be > > abused in the way I do in this reference implementation. > > While there might be downsides that I don't know of, I'm not sure it's > > required to hurry every kill's memory reclaim. I think there are cases > > when resource deallocation is critical, for example when we kill to > > relieve resource shortage and there are kills when reclaim speed is > > not essential. It would be great if we can identify urgent cases > > without userspace hints, so I'm open to suggestions that do not > > involve additional flags. > > I was imagining a PI-ish approach where we'd reap in case an RT > process was waiting on the death of some other process. I'd still > prefer the API I proposed in the other message because it gets the > kernel out of the business of deciding what the right signal is. I'm a > huge believer in "mechanism, not policy". It's not a question of the kernel deciding what the right signal is. The kernel knows whether a signal is fatal to a particular process or not. The question is whether the killing process should do the work of reaping the dying process's resources sometimes, always or never. Currently, that is never (the process reaps its own resources); Suren is suggesting sometimes, and I'm asking "Why not always?"
On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote: > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote: > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > > signal and only to privileged users. > > > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > > every time a process is going to die? > > > > > > I think with an implementation that does not use/abuse oom-reaper > > > thread this could be done for any kill. As I mentioned oom-reaper is a > > > limited resource which has access to memory reserves and should not be > > > abused in the way I do in this reference implementation. > > > While there might be downsides that I don't know of, I'm not sure it's > > > required to hurry every kill's memory reclaim. I think there are cases > > > when resource deallocation is critical, for example when we kill to > > > relieve resource shortage and there are kills when reclaim speed is > > > not essential. It would be great if we can identify urgent cases > > > without userspace hints, so I'm open to suggestions that do not > > > involve additional flags. > > > > I was imagining a PI-ish approach where we'd reap in case an RT > > process was waiting on the death of some other process. I'd still > > prefer the API I proposed in the other message because it gets the > > kernel out of the business of deciding what the right signal is. I'm a > > huge believer in "mechanism, not policy". > > It's not a question of the kernel deciding what the right signal is. > The kernel knows whether a signal is fatal to a particular process or not. > The question is whether the killing process should do the work of reaping > the dying process's resources sometimes, always or never. Currently, > that is never (the process reaps its own resources); Suren is suggesting > sometimes, and I'm asking "Why not always?" FWIW, Suren's initial proposal is that the oom_reaper kthread do the reaping, not the process sending the kill. Are you suggesting that sending SIGKILL should spend a while in signal delivery reaping pages before returning? I thought about just doing it this way, but I didn't like the idea: it'd slow down mass-killing programs like killall(1). Programs expect sending SIGKILL to be a fast operation that returns immediately.
On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote: > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote: > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > > signal and only to privileged users. > > > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > > every time a process is going to die? > > > > > > I think with an implementation that does not use/abuse oom-reaper > > > thread this could be done for any kill. As I mentioned oom-reaper is a > > > limited resource which has access to memory reserves and should not be > > > abused in the way I do in this reference implementation. > > > While there might be downsides that I don't know of, I'm not sure it's > > > required to hurry every kill's memory reclaim. I think there are cases > > > when resource deallocation is critical, for example when we kill to > > > relieve resource shortage and there are kills when reclaim speed is > > > not essential. It would be great if we can identify urgent cases > > > without userspace hints, so I'm open to suggestions that do not > > > involve additional flags. > > > > I was imagining a PI-ish approach where we'd reap in case an RT > > process was waiting on the death of some other process. I'd still > > prefer the API I proposed in the other message because it gets the > > kernel out of the business of deciding what the right signal is. I'm a > > huge believer in "mechanism, not policy". > > It's not a question of the kernel deciding what the right signal is. > The kernel knows whether a signal is fatal to a particular process or not. > The question is whether the killing process should do the work of reaping > the dying process's resources sometimes, always or never. Currently, > that is never (the process reaps its own resources); Suren is suggesting > sometimes, and I'm asking "Why not always?" If there are no downsides of doing this always (like using some resources that can be utilized in a better way) then by all means, let's do it unconditionally. My current implementation is not one of such cases :) I think with implementation when killing process is doing the reaping of the victim's mm this can be done unconditionally because we don't use resources which might otherwise be used in a better way. Overall I like Daniel's idea of the process that requested killing and is waiting for the victim to die helping in reaping its memory. It kind of naturally elevates the priority of the reaping if the priority of the waiting process is higher than victim's priority (a kind of priority inheritance).
On Thu, Apr 11, 2019 at 10:09:06AM -0700, Suren Baghdasaryan wrote: > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > victim process. The usage of this flag is currently limited to SIGKILL > > > signal and only to privileged users. > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > every time a process is going to die? Hello, Suren! I also like the idea to reap always. > I think with an implementation that does not use/abuse oom-reaper > thread this could be done for any kill. As I mentioned oom-reaper is a > limited resource which has access to memory reserves and should not be > abused in the way I do in this reference implementation. In most OOM cases it doesn't matter that much which task to reap, so I don't think that reusing the oom-reaper thread is bad. It should be relatively easy to tweak in a way, that it won't wait for mmap_sem if there are other tasks waiting to be reaped. Also, the oom code add to the head of the list, and the expedited killing to the end, or something like this. The only think, if we're going to reap all tasks, we probably want to have a per-node oom_reaper thread. Thanks!
On Thu, Apr 11, 2019 at 2:45 PM Roman Gushchin <guro@fb.com> wrote: > > On Thu, Apr 11, 2019 at 10:09:06AM -0700, Suren Baghdasaryan wrote: > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > signal and only to privileged users. > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > every time a process is going to die? > > Hello, Suren! > > I also like the idea to reap always. > > > I think with an implementation that does not use/abuse oom-reaper > > thread this could be done for any kill. As I mentioned oom-reaper is a > > limited resource which has access to memory reserves and should not be > > abused in the way I do in this reference implementation. > > In most OOM cases it doesn't matter that much which task to reap, > so I don't think that reusing the oom-reaper thread is bad. > It should be relatively easy to tweak in a way, that it won't > wait for mmap_sem if there are other tasks waiting to be reaped. > Also, the oom code add to the head of the list, and the expedited > killing to the end, or something like this. > > The only think, if we're going to reap all tasks, we probably > want to have a per-node oom_reaper thread. Thanks for the ideas Roman. I'll take some time to digest the input from everybody. What I heard from everyone is that we want this to be a part of generic kill functionality which does not require a change in userspace API. > Thanks! > > -- > You received this message because you are subscribed to the Google Groups "kernel-team" group. > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com. >
On Thu 11-04-19 10:47:50, Daniel Colascione wrote: > On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote: > > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > > > signal and only to privileged users. > > > > > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > > > every time a process is going to die? > > > > > > > > I think with an implementation that does not use/abuse oom-reaper > > > > thread this could be done for any kill. As I mentioned oom-reaper is a > > > > limited resource which has access to memory reserves and should not be > > > > abused in the way I do in this reference implementation. > > > > While there might be downsides that I don't know of, I'm not sure it's > > > > required to hurry every kill's memory reclaim. I think there are cases > > > > when resource deallocation is critical, for example when we kill to > > > > relieve resource shortage and there are kills when reclaim speed is > > > > not essential. It would be great if we can identify urgent cases > > > > without userspace hints, so I'm open to suggestions that do not > > > > involve additional flags. > > > > > > I was imagining a PI-ish approach where we'd reap in case an RT > > > process was waiting on the death of some other process. I'd still > > > prefer the API I proposed in the other message because it gets the > > > kernel out of the business of deciding what the right signal is. I'm a > > > huge believer in "mechanism, not policy". > > > > It's not a question of the kernel deciding what the right signal is. > > The kernel knows whether a signal is fatal to a particular process or not. > > The question is whether the killing process should do the work of reaping > > the dying process's resources sometimes, always or never. Currently, > > that is never (the process reaps its own resources); Suren is suggesting > > sometimes, and I'm asking "Why not always?" > > FWIW, Suren's initial proposal is that the oom_reaper kthread do the > reaping, not the process sending the kill. Are you suggesting that > sending SIGKILL should spend a while in signal delivery reaping pages > before returning? I thought about just doing it this way, but I didn't > like the idea: it'd slow down mass-killing programs like killall(1). > Programs expect sending SIGKILL to be a fast operation that returns > immediately. I was thinking about this as well. And SYNC_SIGKILL would workaround the current expectations of how quick the current implementation is. The harder part would what is the actual semantic. Does the kill wait until the target task is TASK_DEAD or is there an intermediate step that would we could call it end of the day and still have a reasonable semantic (e.g. the original pid is really not alive anymore).
On Thu 11-04-19 08:33:13, Matthew Wilcox wrote: > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > victim process. The usage of this flag is currently limited to SIGKILL > > signal and only to privileged users. > > What is the downside of doing expedited memory reclaim? ie why not do it > every time a process is going to die? Well, you are tearing down an address space which might be still in use because the task not fully dead yeat. So there are two downsides AFAICS. Core dumping which will not see the reaped memory so the resulting coredump might be incomplete. And unexpected #PF/gup on the reaped memory will result in SIGBUS. These are things that we have closed our eyes in the oom context because they likely do not matter. If we want to use the same technique for other usecases then we have to think how much that matter again.
On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote: > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote: > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > victim process. The usage of this flag is currently limited to SIGKILL > > > signal and only to privileged users. > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > every time a process is going to die? > > Well, you are tearing down an address space which might be still in use > because the task not fully dead yeat. So there are two downsides AFAICS. > Core dumping which will not see the reaped memory so the resulting > coredump might be incomplete. And unexpected #PF/gup on the reaped > memory will result in SIGBUS. These are things that we have closed our > eyes in the oom context because they likely do not matter. If we want to > use the same technique for other usecases then we have to think how much > that matter again. After some internal discussions we realized that there is one additional downside of doing reaping unconditionally. If this async reaping is done on a more efficient and energy hungrier CPU irrespective of urgency of the kill it should end up costing more power overall (I’m referring here to assimetric architectures like ARM big.LITTLE). Obviously quantifying that cost is not easy as it depends on the usecase and a particular system but it won’t be zero. So I think we will need some gating condition after all. > > -- > Michal Hocko > SUSE Labs > > -- > You received this message because you are subscribed to the Google Groups > "kernel-team" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to kernel-team+unsubscribe@android.com. > > <div><div><br></div><div><br><div class="gmail_quote"></div></div></div><div><div dir="ltr" class="gmail_attr">On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <<a href="mailto:mhocko@kernel.org" target="_blank">mhocko@kernel.org</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On Thu 11-04-19 08:33:13, Matthew Wilcox wrote:<br> > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:<br> > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via<br> > > pidfd_send_signal() syscall to allow expedited memory reclaim of the<br> > > victim process. The usage of this flag is currently limited to SIGKILL<br> > > signal and only to privileged users.<br> > <br> > What is the downside of doing expedited memory reclaim? ie why not do it<br> > every time a process is going to die?<br> <br> Well, you are tearing down an address space which might be still in use<br> because the task not fully dead yeat. So there are two downsides AFAICS.<br> Core dumping which will not see the reaped memory so the resulting<br> coredump might be incomplete. And unexpected #PF/gup on the reaped<br> memory will result in SIGBUS. These are things that we have closed our<br> eyes in the oom context because they likely do not matter. If we want to<br> use the same technique for other usecases then we have to think how much<br> that matter again.</blockquote><div dir="auto"><br></div></div><div><div dir="auto">After some internal discussions we realized that there is one additional downside of doing reaping unconditionally. If this async reaping is done on a more efficient and energy hungrier CPU irrespective of urgency of the kill it should end up costing more power overall (I’m referring here to assimetric architectures like ARM big.LITTLE). Obviously quantifying that cost is not easy as it depends on the usecase and a particular system but it won’t be zero. So I think we will need some gating condition after all. </div></div><div><div><div class="gmail_quote"><div dir="auto"><br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><br> <br> -- <br> Michal Hocko<br> SUSE Labs<br> <br> -- <br> You received this message because you are subscribed to the Google Groups "kernel-team" group.<br> To unsubscribe from this group and stop receiving emails from it, send an email to <a href="mailto:kernel-team%2Bunsubscribe@android.com" target="_blank">kernel-team+unsubscribe@android.com</a>.<br> <br> </blockquote></div></div> </div>
On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote: > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > victim process. The usage of this flag is currently limited to SIGKILL > > > signal and only to privileged users. > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > every time a process is going to die? > > Well, you are tearing down an address space which might be still in use > because the task not fully dead yeat. So there are two downsides AFAICS. > Core dumping which will not see the reaped memory so the resulting > coredump might be incomplete. And unexpected #PF/gup on the reaped > memory will result in SIGBUS. These are things that we have closed our > eyes in the oom context because they likely do not matter. If we want to > use the same technique for other usecases then we have to think how much > that matter again. > Sorry, resending with corrected settings... After some internal discussions we realized that there is one additional downside of doing reaping unconditionally. If this async reaping is done on a more efficient and energy hungrier CPU irrespective of urgency of the kill it should end up costing more power overall (I’m referring here to assimetric architectures like ARM big.LITTLE). Obviously quantifying that cost is not easy as it depends on the usecase and a particular system but it won’t be zero. So I think we will need some gating condition after all. > -- > Michal Hocko > SUSE Labs > > -- > You received this message because you are subscribed to the Google Groups "kernel-team" group. > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com. >
On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote: > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > victim process. The usage of this flag is currently limited to SIGKILL > > > signal and only to privileged users. > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > every time a process is going to die? > > Well, you are tearing down an address space which might be still in use > because the task not fully dead yeat. So there are two downsides AFAICS. > Core dumping which will not see the reaped memory so the resulting Test for SIGNAL_GROUP_COREDUMP before doing any of this then. If you try to start a core dump after reaping begins, too bad: you could have raced with process death anyway. > coredump might be incomplete. And unexpected #PF/gup on the reaped > memory will result in SIGBUS. It's a dying process. Why even bother returning from the fault handler? Just treat that situation as a thread exit. There's no need to make this observable to userspace at all.
On Thu, Apr 11, 2019 at 11:49 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Thu 11-04-19 10:47:50, Daniel Colascione wrote: > > On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote: > > > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > > > > signal and only to privileged users. > > > > > > > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > > > > every time a process is going to die? > > > > > > > > > > I think with an implementation that does not use/abuse oom-reaper > > > > > thread this could be done for any kill. As I mentioned oom-reaper is a > > > > > limited resource which has access to memory reserves and should not be > > > > > abused in the way I do in this reference implementation. > > > > > While there might be downsides that I don't know of, I'm not sure it's > > > > > required to hurry every kill's memory reclaim. I think there are cases > > > > > when resource deallocation is critical, for example when we kill to > > > > > relieve resource shortage and there are kills when reclaim speed is > > > > > not essential. It would be great if we can identify urgent cases > > > > > without userspace hints, so I'm open to suggestions that do not > > > > > involve additional flags. > > > > > > > > I was imagining a PI-ish approach where we'd reap in case an RT > > > > process was waiting on the death of some other process. I'd still > > > > prefer the API I proposed in the other message because it gets the > > > > kernel out of the business of deciding what the right signal is. I'm a > > > > huge believer in "mechanism, not policy". > > > > > > It's not a question of the kernel deciding what the right signal is. > > > The kernel knows whether a signal is fatal to a particular process or not. > > > The question is whether the killing process should do the work of reaping > > > the dying process's resources sometimes, always or never. Currently, > > > that is never (the process reaps its own resources); Suren is suggesting > > > sometimes, and I'm asking "Why not always?" > > > > FWIW, Suren's initial proposal is that the oom_reaper kthread do the > > reaping, not the process sending the kill. Are you suggesting that > > sending SIGKILL should spend a while in signal delivery reaping pages > > before returning? I thought about just doing it this way, but I didn't > > like the idea: it'd slow down mass-killing programs like killall(1). > > Programs expect sending SIGKILL to be a fast operation that returns > > immediately. > > I was thinking about this as well. And SYNC_SIGKILL would workaround the > current expectations of how quick the current implementation is. The > harder part would what is the actual semantic. Does the kill wait until > the target task is TASK_DEAD or is there an intermediate step that would > we could call it end of the day and still have a reasonable semantic > (e.g. the original pid is really not alive anymore). I think Daniel's proposal was trying to address that. With an input of how many pages user wants to reclaim asynchronously and return value of how much was actually reclaimed it contains the condition when to stop and the reply how successful we could accomplish that. Since it returns the number of pages reclaimed I assume the call does not return until it reaped enough pages. > -- > Michal Hocko > SUSE Labs
On Fri, Apr 12, 2019 at 7:15 AM Suren Baghdasaryan <surenb@google.com> wrote: > > On Thu, Apr 11, 2019 at 11:49 PM Michal Hocko <mhocko@kernel.org> wrote: > > > > On Thu 11-04-19 10:47:50, Daniel Colascione wrote: > > > On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > > > On Thu, Apr 11, 2019 at 10:33:32AM -0700, Daniel Colascione wrote: > > > > > On Thu, Apr 11, 2019 at 10:09 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > On Thu, Apr 11, 2019 at 8:33 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > > > > > > > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > > > > > signal and only to privileged users. > > > > > > > > > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > > > > > every time a process is going to die? > > > > > > > > > > > > I think with an implementation that does not use/abuse oom-reaper > > > > > > thread this could be done for any kill. As I mentioned oom-reaper is a > > > > > > limited resource which has access to memory reserves and should not be > > > > > > abused in the way I do in this reference implementation. > > > > > > While there might be downsides that I don't know of, I'm not sure it's > > > > > > required to hurry every kill's memory reclaim. I think there are cases > > > > > > when resource deallocation is critical, for example when we kill to > > > > > > relieve resource shortage and there are kills when reclaim speed is > > > > > > not essential. It would be great if we can identify urgent cases > > > > > > without userspace hints, so I'm open to suggestions that do not > > > > > > involve additional flags. > > > > > > > > > > I was imagining a PI-ish approach where we'd reap in case an RT > > > > > process was waiting on the death of some other process. I'd still > > > > > prefer the API I proposed in the other message because it gets the > > > > > kernel out of the business of deciding what the right signal is. I'm a > > > > > huge believer in "mechanism, not policy". > > > > > > > > It's not a question of the kernel deciding what the right signal is. > > > > The kernel knows whether a signal is fatal to a particular process or not. > > > > The question is whether the killing process should do the work of reaping > > > > the dying process's resources sometimes, always or never. Currently, > > > > that is never (the process reaps its own resources); Suren is suggesting > > > > sometimes, and I'm asking "Why not always?" > > > > > > FWIW, Suren's initial proposal is that the oom_reaper kthread do the > > > reaping, not the process sending the kill. Are you suggesting that > > > sending SIGKILL should spend a while in signal delivery reaping pages > > > before returning? I thought about just doing it this way, but I didn't > > > like the idea: it'd slow down mass-killing programs like killall(1). > > > Programs expect sending SIGKILL to be a fast operation that returns > > > immediately. > > > > I was thinking about this as well. And SYNC_SIGKILL would workaround the SYNC_SIGKILL (which, I presume, blocks in kill(2)) was proposed in many occasions while we discussed pidfd waits over the past six months or so. We've decided to just make pidfds pollable instead. The kernel already has several ways to express the idea that a task should wait for another task to die, and I don't think we need another. If you want a process that's waiting for a task to exit to help reap that task, great --- that's an option we've talked about --- but we don't need new interface to do it, since the kernel already has all the information it needs. > > current expectations of how quick the current implementation is. The > > harder part would what is the actual semantic. Does the kill wait until > > the target task is TASK_DEAD or is there an intermediate step that would > > we could call it end of the day and still have a reasonable semantic > > (e.g. the original pid is really not alive anymore). > > I think Daniel's proposal was trying to address that. With an input of > how many pages user wants to reclaim asynchronously and return value > of how much was actually reclaimed it contains the condition when to > stop and the reply how successful we could accomplish that. Since it > returns the number of pages reclaimed I assume the call does not > return until it reaped enough pages. Right. I want to punt as much "policy" as possible to userspace. Just using a user thread to do the reaping not only solves the policy problem (since it's userspace that controls priority, affinity, retries, and so on), but also simplifies the implementation kernel-side. I can imagine situations where, depending on device energy state or even charger or screen state we might want to reap more or less aggressively, or not at all. I wouldn't want to burden the kernel with having to get that right when userspace could make the decision.
On Fri, Apr 12, 2019 at 7:14 AM Daniel Colascione <dancol@google.com> wrote: > > On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote: > > > > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote: > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > signal and only to privileged users. > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > every time a process is going to die? > > > > Well, you are tearing down an address space which might be still in use > > because the task not fully dead yeat. So there are two downsides AFAICS. > > Core dumping which will not see the reaped memory so the resulting > > Test for SIGNAL_GROUP_COREDUMP before doing any of this then. If you > try to start a core dump after reaping begins, too bad: you could have > raced with process death anyway. > > > coredump might be incomplete. And unexpected #PF/gup on the reaped > > memory will result in SIGBUS. > > It's a dying process. Why even bother returning from the fault > handler? Just treat that situation as a thread exit. There's no need > to make this observable to userspace at all. Just for clarity, checking the code, I think we already do this. zap_other_threads sets SIGKILL pending on every thread in the group, and we'll handle SIGKILL in the process of taking any page fault or doing any system call, so I don't think it's actually possible for a thread in a dying process to observe the SIGBUS that reaping in theory can generate.
On Thu, Apr 11, 2019 at 10:47:50AM -0700, Daniel Colascione wrote: > On Thu, Apr 11, 2019 at 10:36 AM Matthew Wilcox <willy@infradead.org> wrote: > > It's not a question of the kernel deciding what the right signal is. > > The kernel knows whether a signal is fatal to a particular process or not. > > The question is whether the killing process should do the work of reaping > > the dying process's resources sometimes, always or never. Currently, > > that is never (the process reaps its own resources); Suren is suggesting > > sometimes, and I'm asking "Why not always?" > > FWIW, Suren's initial proposal is that the oom_reaper kthread do the > reaping, not the process sending the kill. Are you suggesting that > sending SIGKILL should spend a while in signal delivery reaping pages > before returning? I thought about just doing it this way, but I didn't > like the idea: it'd slow down mass-killing programs like killall(1). > Programs expect sending SIGKILL to be a fast operation that returns > immediately. Suren said that the implementation in this proposal wasn't to be taken literally. You've raised some good points here though. Do mass-killing programs like kill with a pgid or killall expect the signal-sending process to be fast, or do they not really care? From the kernel's point of view, the same work has to be done, no matter whether the killer or the victim does it. Should the work be accounted to the killer or the victim? It probably doesn't matter. The victim doing the work allows for parallelisation, but I'm not convinced that's important either. I see another advantage for the killer doing the work -- if the task is currently blocking on I/O (eg to an NFS mount that has gone away), the killer can get rid of the task's page tables. If we have to wait for the I/O to complete before the victim reaps its own page tables, we may be waiting forever.
On Fri, Apr 12, 2019 at 7:14 AM Daniel Colascione <dancol@google.com> wrote: > > On Thu, Apr 11, 2019 at 11:53 PM Michal Hocko <mhocko@kernel.org> wrote: > > > > On Thu 11-04-19 08:33:13, Matthew Wilcox wrote: > > > On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote: > > > > Add new SS_EXPEDITE flag to be used when sending SIGKILL via > > > > pidfd_send_signal() syscall to allow expedited memory reclaim of the > > > > victim process. The usage of this flag is currently limited to SIGKILL > > > > signal and only to privileged users. > > > > > > What is the downside of doing expedited memory reclaim? ie why not do it > > > every time a process is going to die? > > > > Well, you are tearing down an address space which might be still in use > > because the task not fully dead yeat. So there are two downsides AFAICS. > > Core dumping which will not see the reaped memory so the resulting > > Test for SIGNAL_GROUP_COREDUMP before doing any of this then. If you > try to start a core dump after reaping begins, too bad: you could have > raced with process death anyway. > > > coredump might be incomplete. And unexpected #PF/gup on the reaped > > memory will result in SIGBUS. > > It's a dying process. Why even bother returning from the fault > handler? Just treat that situation as a thread exit. There's no need > to make this observable to userspace at all. I've spent some more time to investigate possible effects of reaping on coredumps and asked Oleg Nesterov who worked on patchsets that prioritize SIGKILLs over coredump activity (https://lkml.org/lkml/2013/2/17/118). Current do_coredump implementation seems to handle the case of SIGKILL interruption by bailing out whenever dump_interrupted() returns true and that would be the case with pending SIGKILL. So in the case of race when coredump happens first and SIGKILL comes next interrupting the coredump seems to result in no change in behavior and reaping memory proactively seems to have no side effects. An opposite race when SIGKILL gets posted and then coredump happens seems impossible because do_coredump won't be called from get_signal due to signal_group_exit check (get_signal checks signal_group_exit while holding sighand->siglock and complete_signal sets SIGNAL_GROUP_EXIT while holding the same lock). There is a path from __seccomp_filter calling do_coredump while processing SECCOMP_RET_KILL_xxx but even then it should bail out when coredump_wait()->zap_threads(current) checks signal_group_exit(). (Thanks Oleg for clarifying this for me). If we are really concerned about possible increase in failed coredumps because of the proactive reaping I could make it conditional on whether coredumping mm is possible by placing this feature behind !get_dumpable(mm) condition. Another possibility is to check RLIMIT_CORE to decide if coredumps are possible (although if pipe is used for coredump that limit seems to be ignored, so that check would have to take this into consideration). On the issue of SIGBUS happening when accessed memory was already reaped, my understanding that SIGBUS being a synchronous signal will still have to be fetched using dequeue_synchronous_signal from get_signal but not before signal_group_exit is checked. So again if SIGKILL is pending I think SIGBUS will be ignored (please correct me if that's not correct). One additional question I would like to clarify is whether per-node reapers like Roman suggested would make a big difference (All CPUs that I've seen used for Android are single-node ones, so looking for more feedback here). If it's important then reaping victim's memory by the killer is probably not an option.
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e412c092c1e8..8a227633a058 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig, struct task_struct *p); extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, + bool expedite); extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); diff --git a/include/linux/signal.h b/include/linux/signal.h index 9702016734b1..34b7852aa4a0 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long); } while (0); #ifdef CONFIG_PROC_FS + +/* + * SS_FLAGS values used in pidfd_send_signal: + * + * SS_EXPEDITE indicates desire to expedite the operation. + */ +#define SS_EXPEDITE 0x00000001 + struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); -#endif + +#endif /* CONFIG_PROC_FS */ #endif /* _LINUX_SIGNAL_H */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index aea30530c472..27c66296e08e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info) rcu_read_unlock(); kill_pid_info(info->notify.sigev_signo, - &sig_i, info->notify_owner); + &sig_i, info->notify_owner, false); break; case SIGEV_THREAD: set_cookie(info->notify_cookie, NOTIFY_WOKENUP); diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..02ed4332d17c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -43,6 +43,7 @@ #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/livepatch.h> +#include <linux/oom.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) return success ? 0 : retval; } -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid, + bool expedite) { int error = -ESRCH; struct task_struct *p; @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) for (;;) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); - if (p) + if (p) { error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); + + /* + * Ignore expedite_reclaim return value, it is best + * effort only. + */ + if (!error && expedite) + expedite_reclaim(p); + } + rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); - error = kill_pid_info(sig, info, find_vpid(pid)); + error = kill_pid_info(sig, info, find_vpid(pid), false); rcu_read_unlock(); return error; } @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) if (pid > 0) { rcu_read_lock(); - ret = kill_pid_info(sig, info, find_vpid(pid)); + ret = kill_pid_info(sig, info, find_vpid(pid), false); rcu_read_unlock(); return ret; } @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) { - return kill_pid_info(sig, __si_special(priv), pid); + return kill_pid_info(sig, __si_special(priv), pid, false); } EXPORT_SYMBOL(kill_pid); @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, struct pid *pid; kernel_siginfo_t kinfo; - /* Enforce flags be set to 0 until we add an extension. */ - if (flags) + /* Enforce no unknown flags. */ + if (flags & ~SS_EXPEDITE) return -EINVAL; + if (flags & SS_EXPEDITE) { + /* Enforce SS_EXPEDITE to be used with SIGKILL only. */ + if (sig != SIGKILL) + return -EINVAL; + + /* Limit expedited killing to privileged users only. */ + if (!capable(CAP_SYS_NICE)) + return -EPERM; + } + f = fdget_raw(pidfd); if (!f.file) return -EBADF; @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, prepare_kill_siginfo(sig, &kinfo); } - ret = kill_pid_info(sig, &kinfo, pid); + ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0); err: fdput(f); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 02068b2d5862..c926483cdb53 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; trace_itimer_expire(ITIMER_REAL, leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false); return HRTIMER_NORESTART; }
Add new SS_EXPEDITE flag to be used when sending SIGKILL via pidfd_send_signal() syscall to allow expedited memory reclaim of the victim process. The usage of this flag is currently limited to SIGKILL signal and only to privileged users. Signed-off-by: Suren Baghdasaryan <surenb@google.com> --- include/linux/sched/signal.h | 3 ++- include/linux/signal.h | 11 ++++++++++- ipc/mqueue.c | 2 +- kernel/signal.c | 37 ++++++++++++++++++++++++++++-------- kernel/time/itimer.c | 2 +- 5 files changed, 43 insertions(+), 12 deletions(-)