diff mbox series

kernel: add panic_on_taint

Message ID 20200506222815.274570-1-aquini@redhat.com (mailing list archive)
State New, archived
Headers show
Series kernel: add panic_on_taint | expand

Commit Message

Rafael Aquini May 6, 2020, 10:28 p.m. UTC
Analogously to the introduction of panic_on_warn, this patch
introduces a kernel option named panic_on_taint in order to
provide a simple and generic way to stop execution and catch
a coredump when the kernel gets tainted by any given taint flag.

This is useful for debugging sessions as it avoids rebuilding
the kernel to explicitly add calls to panic() or BUG() into
code sites that introduce the taint flags of interest.
Another, perhaps less frequent, use for this option would be
as a mean for assuring a security policy (in paranoid mode)
case where no single taint is allowed for the running system.

Suggested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Rafael Aquini <aquini@redhat.com>
---
 Documentation/admin-guide/kdump/kdump.rst     | 10 ++++++
 .../admin-guide/kernel-parameters.txt         |  3 ++
 Documentation/admin-guide/sysctl/kernel.rst   | 36 +++++++++++++++++++
 include/linux/kernel.h                        |  1 +
 kernel/panic.c                                |  7 ++++
 kernel/sysctl.c                               |  7 ++++
 6 files changed, 64 insertions(+)

Comments

Luis Chamberlain May 6, 2020, 11:24 p.m. UTC | #1
On Wed, May 06, 2020 at 06:28:15PM -0400, Rafael Aquini wrote:
> Analogously to the introduction of panic_on_warn, this patch
> introduces a kernel option named panic_on_taint in order to
> provide a simple and generic way to stop execution and catch
> a coredump when the kernel gets tainted by any given taint flag.
> 
> This is useful for debugging sessions as it avoids rebuilding
> the kernel to explicitly add calls to panic() or BUG() into
> code sites that introduce the taint flags of interest.
> Another, perhaps less frequent, use for this option would be
> as a mean for assuring a security policy (in paranoid mode)
> case where no single taint is allowed for the running system.
> 
> Suggested-by: Qian Cai <cai@lca.pw>
> Signed-off-by: Rafael Aquini <aquini@redhat.com>
> ---
>  Documentation/admin-guide/kdump/kdump.rst     | 10 ++++++
>  .../admin-guide/kernel-parameters.txt         |  3 ++
>  Documentation/admin-guide/sysctl/kernel.rst   | 36 +++++++++++++++++++
>  include/linux/kernel.h                        |  1 +
>  kernel/panic.c                                |  7 ++++
>  kernel/sysctl.c                               |  7 ++++
>  6 files changed, 64 insertions(+)
> 
> diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
> index ac7e131d2935..de3cf6d377cc 100644
> --- a/Documentation/admin-guide/kdump/kdump.rst
> +++ b/Documentation/admin-guide/kdump/kdump.rst
> @@ -521,6 +521,16 @@ will cause a kdump to occur at the panic() call.  In cases where a user wants
>  to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
>  to achieve the same behaviour.
>  
> +Trigger Kdump on add_taint()
> +============================
> +
> +The kernel parameter, panic_on_taint, calls panic() from within add_taint(),
> +whenever the value set in this bitmask matches with the bit flag being set
> +by add_taint(). This will cause a kdump to occur at the panic() call.
> +In cases where a user wants to specify this during runtime,
> +/proc/sys/kernel/panic_on_taint can be set to a respective bitmask value
> +to achieve the same behaviour.
> +
>  Contact
>  =======
>  
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 7bc83f3d9bdf..75c02c1841b2 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3404,6 +3404,9 @@
>  	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
>  			on a WARN().
>  
> +	panic_on_taint	panic() when the kernel gets tainted, if the taint
> +			flag being set matches with the assigned bitmask.
> +
>  	crash_kexec_post_notifiers
>  			Run kdump after running panic-notifiers and dumping
>  			kmsg. This only for the users who doubt kdump always
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index 0d427fd10941..5b880102f2e3 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -658,6 +658,42 @@ a kernel rebuild when attempting to kdump at the location of a WARN().
>  = ================================================
>  
>  
> +panic_on_taint
> +==============
> +
> +Bitmask for calling panic() in the add_taint() path.
> +This is useful to avoid a kernel rebuild when attempting to
> +kdump at the insertion of any specific TAINT flags.
> +When set to 0 (default) add_taint() default behavior is maintained.
> +
> +====== ============================
> +bit  0 TAINT_PROPRIETARY_MODULE
> +bit  1 TAINT_FORCED_MODULE
> +bit  2 TAINT_CPU_OUT_OF_SPEC
> +bit  3 TAINT_FORCED_RMMOD
> +bit  4 TAINT_MACHINE_CHECK
> +bit  5 TAINT_BAD_PAGE
> +bit  6 TAINT_USER
> +bit  7 TAINT_DIE
> +bit  8 TAINT_OVERRIDDEN_ACPI_TABLE
> +bit  9 TAINT_WARN
> +bit 10 TAINT_CRAP
> +bit 11 TAINT_FIRMWARE_WORKAROUND
> +bit 12 TAINT_OOT_MODULE
> +bit 13 TAINT_UNSIGNED_MODULE
> +bit 14 TAINT_SOFTLOCKUP
> +bit 15 TAINT_LIVEPATCH
> +bit 16 TAINT_AUX
> +bit 17 TAINT_RANDSTRUCT
> +bit 18 TAINT_FLAGS_COUNT
> +====== ============================
> +
> +So, for example, to panic if the kernel gets tainted due to
> +occurrences of bad pages and/or machine check errors, a user can::
> +
> +  echo 48 > /proc/sys/kernel/panic_on_taint
> +
> +
>  panic_print
>  ===========
>  
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 9b7a8d74a9d6..518b9fd381c2 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -528,6 +528,7 @@ extern int panic_on_oops;
>  extern int panic_on_unrecovered_nmi;
>  extern int panic_on_io_nmi;
>  extern int panic_on_warn;
> +extern unsigned long panic_on_taint;
>  extern int sysctl_panic_on_rcu_stall;
>  extern int sysctl_panic_on_stackoverflow;
>  
> diff --git a/kernel/panic.c b/kernel/panic.c
> index b69ee9e76cb2..e2d4771ab911 100644
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -44,6 +44,7 @@ static int pause_on_oops_flag;
>  static DEFINE_SPINLOCK(pause_on_oops_lock);
>  bool crash_kexec_post_notifiers;
>  int panic_on_warn __read_mostly;
> +unsigned long panic_on_taint __read_mostly;

What justification do we have for using __read_mostly here?
See patch I just sent out, hope that helps.

>  int panic_timeout = CONFIG_PANIC_TIMEOUT;
>  EXPORT_SYMBOL_GPL(panic_timeout);
> @@ -434,6 +435,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
>  		pr_warn("Disabling lock debugging due to kernel taint\n");
>  
>  	set_bit(flag, &tainted_mask);
> +
> +	if (unlikely(tainted_mask & panic_on_taint)) {

unlikely() is telling the merit may not be that strong?

> +		panic_on_taint = 0;
> +		panic("panic_on_taint set ...");
> +	}
>  }
>  EXPORT_SYMBOL(add_taint);
>  
> @@ -675,6 +681,7 @@ core_param(panic, panic_timeout, int, 0644);
>  core_param(panic_print, panic_print, ulong, 0644);
>  core_param(pause_on_oops, pause_on_oops, int, 0644);
>  core_param(panic_on_warn, panic_on_warn, int, 0644);
> +core_param(panic_on_taint, panic_on_taint, ulong, 0644);
>  core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
>  
>  static int __init oops_setup(char *s)
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 8a176d8727a3..b80ab660d727 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1217,6 +1217,13 @@ static struct ctl_table kern_table[] = {
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= SYSCTL_ONE,
>  	},
> +	{
> +		.procname	= "panic_on_taint",
> +		.data		= &panic_on_taint,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,

proc_doulongvec_minmax supports a min and max, do we want to
set it so that we have a sanity check for values used? To see
an example, refer to the file-max entry.

That would allow for example to error our if a value was
tried but it is a taint flag which we don't support on an older
kernel.

You know what would be *really* useful as well, is a way to
cat out our current taint, and perhaps another that spits it
out in English. This can allow scripts to check that for
validity, instead of scraping kernel logs.

For instance, I would love to easily just check if TAIN_WARN
was hit on some tests I am working on, but I don't want to scrape
the kernel log for this, as I think this is overkill.

  Luis
Rafael Aquini May 7, 2020, 12:12 a.m. UTC | #2
On Wed, May 06, 2020 at 11:24:48PM +0000, Luis Chamberlain wrote:
> On Wed, May 06, 2020 at 06:28:15PM -0400, Rafael Aquini wrote:
> > Analogously to the introduction of panic_on_warn, this patch
> > introduces a kernel option named panic_on_taint in order to
> > provide a simple and generic way to stop execution and catch
> > a coredump when the kernel gets tainted by any given taint flag.
> > 
> > This is useful for debugging sessions as it avoids rebuilding
> > the kernel to explicitly add calls to panic() or BUG() into
> > code sites that introduce the taint flags of interest.
> > Another, perhaps less frequent, use for this option would be
> > as a mean for assuring a security policy (in paranoid mode)
> > case where no single taint is allowed for the running system.
> > 
> > Suggested-by: Qian Cai <cai@lca.pw>
> > Signed-off-by: Rafael Aquini <aquini@redhat.com>
> > ---
> >  Documentation/admin-guide/kdump/kdump.rst     | 10 ++++++
> >  .../admin-guide/kernel-parameters.txt         |  3 ++
> >  Documentation/admin-guide/sysctl/kernel.rst   | 36 +++++++++++++++++++
> >  include/linux/kernel.h                        |  1 +
> >  kernel/panic.c                                |  7 ++++
> >  kernel/sysctl.c                               |  7 ++++
> >  6 files changed, 64 insertions(+)
> > 
> > diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
> > index ac7e131d2935..de3cf6d377cc 100644
> > --- a/Documentation/admin-guide/kdump/kdump.rst
> > +++ b/Documentation/admin-guide/kdump/kdump.rst
> > @@ -521,6 +521,16 @@ will cause a kdump to occur at the panic() call.  In cases where a user wants
> >  to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
> >  to achieve the same behaviour.
> >  
> > +Trigger Kdump on add_taint()
> > +============================
> > +
> > +The kernel parameter, panic_on_taint, calls panic() from within add_taint(),
> > +whenever the value set in this bitmask matches with the bit flag being set
> > +by add_taint(). This will cause a kdump to occur at the panic() call.
> > +In cases where a user wants to specify this during runtime,
> > +/proc/sys/kernel/panic_on_taint can be set to a respective bitmask value
> > +to achieve the same behaviour.
> > +
> >  Contact
> >  =======
> >  
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 7bc83f3d9bdf..75c02c1841b2 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -3404,6 +3404,9 @@
> >  	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
> >  			on a WARN().
> >  
> > +	panic_on_taint	panic() when the kernel gets tainted, if the taint
> > +			flag being set matches with the assigned bitmask.
> > +
> >  	crash_kexec_post_notifiers
> >  			Run kdump after running panic-notifiers and dumping
> >  			kmsg. This only for the users who doubt kdump always
> > diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> > index 0d427fd10941..5b880102f2e3 100644
> > --- a/Documentation/admin-guide/sysctl/kernel.rst
> > +++ b/Documentation/admin-guide/sysctl/kernel.rst
> > @@ -658,6 +658,42 @@ a kernel rebuild when attempting to kdump at the location of a WARN().
> >  = ================================================
> >  
> >  
> > +panic_on_taint
> > +==============
> > +
> > +Bitmask for calling panic() in the add_taint() path.
> > +This is useful to avoid a kernel rebuild when attempting to
> > +kdump at the insertion of any specific TAINT flags.
> > +When set to 0 (default) add_taint() default behavior is maintained.
> > +
> > +====== ============================
> > +bit  0 TAINT_PROPRIETARY_MODULE
> > +bit  1 TAINT_FORCED_MODULE
> > +bit  2 TAINT_CPU_OUT_OF_SPEC
> > +bit  3 TAINT_FORCED_RMMOD
> > +bit  4 TAINT_MACHINE_CHECK
> > +bit  5 TAINT_BAD_PAGE
> > +bit  6 TAINT_USER
> > +bit  7 TAINT_DIE
> > +bit  8 TAINT_OVERRIDDEN_ACPI_TABLE
> > +bit  9 TAINT_WARN
> > +bit 10 TAINT_CRAP
> > +bit 11 TAINT_FIRMWARE_WORKAROUND
> > +bit 12 TAINT_OOT_MODULE
> > +bit 13 TAINT_UNSIGNED_MODULE
> > +bit 14 TAINT_SOFTLOCKUP
> > +bit 15 TAINT_LIVEPATCH
> > +bit 16 TAINT_AUX
> > +bit 17 TAINT_RANDSTRUCT
> > +bit 18 TAINT_FLAGS_COUNT
> > +====== ============================
> > +
> > +So, for example, to panic if the kernel gets tainted due to
> > +occurrences of bad pages and/or machine check errors, a user can::
> > +
> > +  echo 48 > /proc/sys/kernel/panic_on_taint
> > +
> > +
> >  panic_print
> >  ===========
> >  
> > diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> > index 9b7a8d74a9d6..518b9fd381c2 100644
> > --- a/include/linux/kernel.h
> > +++ b/include/linux/kernel.h
> > @@ -528,6 +528,7 @@ extern int panic_on_oops;
> >  extern int panic_on_unrecovered_nmi;
> >  extern int panic_on_io_nmi;
> >  extern int panic_on_warn;
> > +extern unsigned long panic_on_taint;
> >  extern int sysctl_panic_on_rcu_stall;
> >  extern int sysctl_panic_on_stackoverflow;
> >  
> > diff --git a/kernel/panic.c b/kernel/panic.c
> > index b69ee9e76cb2..e2d4771ab911 100644
> > --- a/kernel/panic.c
> > +++ b/kernel/panic.c
> > @@ -44,6 +44,7 @@ static int pause_on_oops_flag;
> >  static DEFINE_SPINLOCK(pause_on_oops_lock);
> >  bool crash_kexec_post_notifiers;
> >  int panic_on_warn __read_mostly;
> > +unsigned long panic_on_taint __read_mostly;
> 
> What justification do we have for using __read_mostly here?
> See patch I just sent out, hope that helps.
>

Given the rationale on the hint usage (from your re-sent patch)
this one should not be hinted. I'll get rid of the hint.

 
> >  int panic_timeout = CONFIG_PANIC_TIMEOUT;
> >  EXPORT_SYMBOL_GPL(panic_timeout);
> > @@ -434,6 +435,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
> >  		pr_warn("Disabling lock debugging due to kernel taint\n");
> >  
> >  	set_bit(flag, &tainted_mask);
> > +
> > +	if (unlikely(tainted_mask & panic_on_taint)) {
> 
> unlikely() is telling the merit may not be that strong?
> 
> > +		panic_on_taint = 0;
> > +		panic("panic_on_taint set ...");
> > +	}
> >  }
> >  EXPORT_SYMBOL(add_taint);
> >  
> > @@ -675,6 +681,7 @@ core_param(panic, panic_timeout, int, 0644);
> >  core_param(panic_print, panic_print, ulong, 0644);
> >  core_param(pause_on_oops, pause_on_oops, int, 0644);
> >  core_param(panic_on_warn, panic_on_warn, int, 0644);
> > +core_param(panic_on_taint, panic_on_taint, ulong, 0644);
> >  core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
> >  
> >  static int __init oops_setup(char *s)
> > diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> > index 8a176d8727a3..b80ab660d727 100644
> > --- a/kernel/sysctl.c
> > +++ b/kernel/sysctl.c
> > @@ -1217,6 +1217,13 @@ static struct ctl_table kern_table[] = {
> >  		.extra1		= SYSCTL_ZERO,
> >  		.extra2		= SYSCTL_ONE,
> >  	},
> > +	{
> > +		.procname	= "panic_on_taint",
> > +		.data		= &panic_on_taint,
> > +		.maxlen		= sizeof(unsigned long),
> > +		.mode		= 0644,
> > +		.proc_handler	= proc_doulongvec_minmax,
> 
> proc_doulongvec_minmax supports a min and max, do we want to
> set it so that we have a sanity check for values used? To see
> an example, refer to the file-max entry.
>
It didn't seem necessary to declare the range limits here, as
albeit he current set of taint flags would cause tainted_mask
to strecth all the way from 0 (none set) to ULONG_MAX (all set), 
that's its valid range given the usage. 
That's why I didn't declare the extra values for range-checking.
I can do it, though, if you rather have it that way.


 
> That would allow for example to error our if a value was
> tried but it is a taint flag which we don't support on an older
> kernel.
> 
> You know what would be *really* useful as well, is a way to
> cat out our current taint, and perhaps another that spits it
> out in English. This can allow scripts to check that for
> validity, instead of scraping kernel logs.
> 
> For instance, I would love to easily just check if TAIN_WARN
> was hit on some tests I am working on, but I don't want to scrape
> the kernel log for this, as I think this is overkill.
> 
I can definitely take a look into these suggestions for a later 
patch, as I think they're nice but they don't look as a deal-breaker 
for the simple feature being proposed here.

Thanks for your feedback!
-- Rafael
Randy Dunlap May 7, 2020, 12:20 a.m. UTC | #3
On 5/6/20 3:28 PM, Rafael Aquini wrote:
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 7bc83f3d9bdf..75c02c1841b2 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3404,6 +3404,9 @@
>  	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
>  			on a WARN().
>  
> +	panic_on_taint	panic() when the kernel gets tainted, if the taint
> +			flag being set matches with the assigned bitmask.
> +

Where is the bitmask assigned?

I.e., maybe this text should be more like:

	panic_on_taint=<bitmask>
and then the bits are explained.  See e.g. panic= and panic_print=
in this same file.


>  	crash_kexec_post_notifiers
>  			Run kdump after running panic-notifiers and dumping
>  			kmsg. This only for the users who doubt kdump always
Qian Cai May 7, 2020, 2:50 a.m. UTC | #4
> On May 6, 2020, at 6:28 PM, Rafael Aquini <aquini@redhat.com> wrote:
> 
> Analogously to the introduction of panic_on_warn, this patch
> introduces a kernel option named panic_on_taint in order to
> provide a simple and generic way to stop execution and catch
> a coredump when the kernel gets tainted by any given taint flag.
> 
> This is useful for debugging sessions as it avoids rebuilding
> the kernel to explicitly add calls to panic() or BUG() into
> code sites that introduce the taint flags of interest.
> Another, perhaps less frequent, use for this option would be
> as a mean for assuring a security policy (in paranoid mode)
> case where no single taint is allowed for the running system.

Andrew, you can drop the patch below from -mm now because that one is now obsolete,

mm-slub-add-panic_on_error-to-the-debug-facilities.patch
Rafael Aquini May 7, 2020, 8:42 p.m. UTC | #5
On Wed, May 06, 2020 at 10:50:19PM -0400, Qian Cai wrote:
> 
> 
> > On May 6, 2020, at 6:28 PM, Rafael Aquini <aquini@redhat.com> wrote:
> > 
> > Analogously to the introduction of panic_on_warn, this patch
> > introduces a kernel option named panic_on_taint in order to
> > provide a simple and generic way to stop execution and catch
> > a coredump when the kernel gets tainted by any given taint flag.
> > 
> > This is useful for debugging sessions as it avoids rebuilding
> > the kernel to explicitly add calls to panic() or BUG() into
> > code sites that introduce the taint flags of interest.
> > Another, perhaps less frequent, use for this option would be
> > as a mean for assuring a security policy (in paranoid mode)
> > case where no single taint is allowed for the running system.
> 
> Andrew, you can drop the patch below from -mm now because that one is now obsolete,
> 
> mm-slub-add-panic_on_error-to-the-debug-facilities.patch
>
Please, don't drop it yet. I'll send a patch to get rid of the bits,
once this one gets accepted, if it gets accepted.

-- Rafael
Qian Cai May 7, 2020, 10:05 p.m. UTC | #6
> On May 7, 2020, at 4:42 PM, Rafael Aquini <aquini@redhat.com> wrote:
> 
> On Wed, May 06, 2020 at 10:50:19PM -0400, Qian Cai wrote:
>> 
>> 
>>> On May 6, 2020, at 6:28 PM, Rafael Aquini <aquini@redhat.com> wrote:
>>> 
>>> Analogously to the introduction of panic_on_warn, this patch
>>> introduces a kernel option named panic_on_taint in order to
>>> provide a simple and generic way to stop execution and catch
>>> a coredump when the kernel gets tainted by any given taint flag.
>>> 
>>> This is useful for debugging sessions as it avoids rebuilding
>>> the kernel to explicitly add calls to panic() or BUG() into
>>> code sites that introduce the taint flags of interest.
>>> Another, perhaps less frequent, use for this option would be
>>> as a mean for assuring a security policy (in paranoid mode)
>>> case where no single taint is allowed for the running system.
>> 
>> Andrew, you can drop the patch below from -mm now because that one is now obsolete,
>> 
>> mm-slub-add-panic_on_error-to-the-debug-facilities.patch
>> 
> Please, don't drop it yet. I'll send a patch to get rid of the bits,
> once this one gets accepted, if it gets accepted.

Why do you ever want that obsolete patch even show up in linux-next to potentailly waste other people/bots time to test it and develop things on top of it?
Rafael Aquini May 7, 2020, 10:15 p.m. UTC | #7
On Thu, May 07, 2020 at 06:05:27PM -0400, Qian Cai wrote:
> 
> 
> > On May 7, 2020, at 4:42 PM, Rafael Aquini <aquini@redhat.com> wrote:
> > 
> > On Wed, May 06, 2020 at 10:50:19PM -0400, Qian Cai wrote:
> >> 
> >> 
> >>> On May 6, 2020, at 6:28 PM, Rafael Aquini <aquini@redhat.com> wrote:
> >>> 
> >>> Analogously to the introduction of panic_on_warn, this patch
> >>> introduces a kernel option named panic_on_taint in order to
> >>> provide a simple and generic way to stop execution and catch
> >>> a coredump when the kernel gets tainted by any given taint flag.
> >>> 
> >>> This is useful for debugging sessions as it avoids rebuilding
> >>> the kernel to explicitly add calls to panic() or BUG() into
> >>> code sites that introduce the taint flags of interest.
> >>> Another, perhaps less frequent, use for this option would be
> >>> as a mean for assuring a security policy (in paranoid mode)
> >>> case where no single taint is allowed for the running system.
> >> 
> >> Andrew, you can drop the patch below from -mm now because that one is now obsolete,
> >> 
> >> mm-slub-add-panic_on_error-to-the-debug-facilities.patch
> >> 
> > Please, don't drop it yet. I'll send a patch to get rid of the bits,
> > once this one gets accepted, if it gets accepted.
> 
> Why do you ever want that obsolete patch even show up in linux-next to potentailly waste other people/bots time to test it and develop things on top of it?
>

It's a reasonable and self-contained feature that we have a valid use for. 
I honestly fail to see it causing that amount of annoyance as you are 
suggesting here.
Qian Cai May 7, 2020, 11:07 p.m. UTC | #8
> On May 7, 2020, at 6:15 PM, Rafael Aquini <aquini@redhat.com> wrote:
> 
> It's a reasonable and self-contained feature that we have a valid use for. 
> I honestly fail to see it causing that amount of annoyance as you are 
> suggesting here.

It is not a big trouble yet, but keeping an obsolete patch that not very straightforward to figure out that it will be superseded by the panic_on_taint patch will only cause more confusion the longer it has stayed in linux-next.

The thing is that even if you can’t get this panic_on_taint (the superior solution) patch accepted for some reasons, someone else could still work on it until it get merged.

Thus, I failed to see any possibility we will go back to the inferior solution (mm-slub-add-panic_on_error-to-the-debug-facilities.patch) by all means.
Rafael Aquini May 7, 2020, 11:36 p.m. UTC | #9
On Thu, May 07, 2020 at 07:07:20PM -0400, Qian Cai wrote:
> 
> 
> > On May 7, 2020, at 6:15 PM, Rafael Aquini <aquini@redhat.com> wrote:
> > 
> > It's a reasonable and self-contained feature that we have a valid use for. 
> > I honestly fail to see it causing that amount of annoyance as you are 
> > suggesting here.
> 
> It is not a big trouble yet, but keeping an obsolete patch that not very straightforward to figure out that it will be superseded by the panic_on_taint patch will only cause more confusion the longer it has stayed in linux-next.
> 
> The thing is that even if you can’t get this panic_on_taint (the superior solution) patch accepted for some reasons, someone else could still work on it until it get merged.
> 
> Thus, I failed to see any possibility we will go back to the inferior solution (mm-slub-add-panic_on_error-to-the-debug-facilities.patch) by all means.
>

There are plenty of examples of things being added, changed, and
removed in -next. IOW, living in a transient state. I think it's 
a reasonable compromise to keep it while the other one is beind 
ironed out.

The fact that you prefer one solution to another doesn't
invalidate the one you dislike. 

Cheers,
-- Rafael
Qian Cai May 8, 2020, 12:28 a.m. UTC | #10
> On May 7, 2020, at 7:36 PM, Rafael Aquini <aquini@redhat.com> wrote:
> 
> On Thu, May 07, 2020 at 07:07:20PM -0400, Qian Cai wrote:
>> 
>> 
>>> On May 7, 2020, at 6:15 PM, Rafael Aquini <aquini@redhat.com> wrote:
>>> 
>>> It's a reasonable and self-contained feature that we have a valid use for. 
>>> I honestly fail to see it causing that amount of annoyance as you are 
>>> suggesting here.
>> 
>> It is not a big trouble yet, but keeping an obsolete patch that not very straightforward to figure out that it will be superseded by the panic_on_taint patch will only cause more confusion the longer it has stayed in linux-next.
>> 
>> The thing is that even if you can’t get this panic_on_taint (the superior solution) patch accepted for some reasons, someone else could still work on it until it get merged.
>> 
>> Thus, I failed to see any possibility we will go back to the inferior solution (mm-slub-add-panic_on_error-to-the-debug-facilities.patch) by all means.
>> 
> 
> There are plenty of examples of things being added, changed, and
> removed in -next. IOW, living in a transient state. I think it's 
> a reasonable compromise to keep it while the other one is beind 
> ironed out.
> 
> The fact that you prefer one solution to another doesn't
> invalidate the one you dislike. 

As far I can tell, the bar of the other core subsystems are quite high even for linux-next. People have been voiced over and over again to urge Andrew not picking up patches so eagerly, but I will save that discussion for the next time.

Anyway, thanks for working for the panic_on_taint patch. I believe it could be useful for all testing agents to catch those bad pages earlier.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index ac7e131d2935..de3cf6d377cc 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -521,6 +521,16 @@  will cause a kdump to occur at the panic() call.  In cases where a user wants
 to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
 to achieve the same behaviour.
 
+Trigger Kdump on add_taint()
+============================
+
+The kernel parameter, panic_on_taint, calls panic() from within add_taint(),
+whenever the value set in this bitmask matches with the bit flag being set
+by add_taint(). This will cause a kdump to occur at the panic() call.
+In cases where a user wants to specify this during runtime,
+/proc/sys/kernel/panic_on_taint can be set to a respective bitmask value
+to achieve the same behaviour.
+
 Contact
 =======
 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7bc83f3d9bdf..75c02c1841b2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3404,6 +3404,9 @@ 
 	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
 			on a WARN().
 
+	panic_on_taint	panic() when the kernel gets tainted, if the taint
+			flag being set matches with the assigned bitmask.
+
 	crash_kexec_post_notifiers
 			Run kdump after running panic-notifiers and dumping
 			kmsg. This only for the users who doubt kdump always
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 0d427fd10941..5b880102f2e3 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -658,6 +658,42 @@  a kernel rebuild when attempting to kdump at the location of a WARN().
 = ================================================
 
 
+panic_on_taint
+==============
+
+Bitmask for calling panic() in the add_taint() path.
+This is useful to avoid a kernel rebuild when attempting to
+kdump at the insertion of any specific TAINT flags.
+When set to 0 (default) add_taint() default behavior is maintained.
+
+====== ============================
+bit  0 TAINT_PROPRIETARY_MODULE
+bit  1 TAINT_FORCED_MODULE
+bit  2 TAINT_CPU_OUT_OF_SPEC
+bit  3 TAINT_FORCED_RMMOD
+bit  4 TAINT_MACHINE_CHECK
+bit  5 TAINT_BAD_PAGE
+bit  6 TAINT_USER
+bit  7 TAINT_DIE
+bit  8 TAINT_OVERRIDDEN_ACPI_TABLE
+bit  9 TAINT_WARN
+bit 10 TAINT_CRAP
+bit 11 TAINT_FIRMWARE_WORKAROUND
+bit 12 TAINT_OOT_MODULE
+bit 13 TAINT_UNSIGNED_MODULE
+bit 14 TAINT_SOFTLOCKUP
+bit 15 TAINT_LIVEPATCH
+bit 16 TAINT_AUX
+bit 17 TAINT_RANDSTRUCT
+bit 18 TAINT_FLAGS_COUNT
+====== ============================
+
+So, for example, to panic if the kernel gets tainted due to
+occurrences of bad pages and/or machine check errors, a user can::
+
+  echo 48 > /proc/sys/kernel/panic_on_taint
+
+
 panic_print
 ===========
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 9b7a8d74a9d6..518b9fd381c2 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -528,6 +528,7 @@  extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
 extern int panic_on_warn;
+extern unsigned long panic_on_taint;
 extern int sysctl_panic_on_rcu_stall;
 extern int sysctl_panic_on_stackoverflow;
 
diff --git a/kernel/panic.c b/kernel/panic.c
index b69ee9e76cb2..e2d4771ab911 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -44,6 +44,7 @@  static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 bool crash_kexec_post_notifiers;
 int panic_on_warn __read_mostly;
+unsigned long panic_on_taint __read_mostly;
 
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
 EXPORT_SYMBOL_GPL(panic_timeout);
@@ -434,6 +435,11 @@  void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
 		pr_warn("Disabling lock debugging due to kernel taint\n");
 
 	set_bit(flag, &tainted_mask);
+
+	if (unlikely(tainted_mask & panic_on_taint)) {
+		panic_on_taint = 0;
+		panic("panic_on_taint set ...");
+	}
 }
 EXPORT_SYMBOL(add_taint);
 
@@ -675,6 +681,7 @@  core_param(panic, panic_timeout, int, 0644);
 core_param(panic_print, panic_print, ulong, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
+core_param(panic_on_taint, panic_on_taint, ulong, 0644);
 core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
 
 static int __init oops_setup(char *s)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a176d8727a3..b80ab660d727 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1217,6 +1217,13 @@  static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "panic_on_taint",
+		.data		= &panic_on_taint,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 	{
 		.procname	= "timer_migration",