diff mbox series

kunit: tool: continue past invalid utf-8 output

Message ID 20211008210752.1109785-1-dlatypov@google.com (mailing list archive)
State New
Delegated to: Brendan Higgins
Headers show
Series kunit: tool: continue past invalid utf-8 output | expand

Commit Message

Daniel Latypov Oct. 8, 2021, 9:07 p.m. UTC
kunit.py currently crashes and fails to parse kernel output if it's not
fully valid utf-8.

This can come from memory corruption or just inadvertently printing
out binary data as strings.

E.g. adding this line into a kunit test
  pr_info("\x80")
will cause this exception
  UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 1961: invalid start byte

We can tell Python how to handle errors, see
https://docs.python.org/3/library/codecs.html#error-handlers

Unfortunately, it doesn't seem like there's a way to specify this in
just one location, so we need to repeat ourselves quite a bit.

Specify `errors='backslashreplace'` so we instead:
* print out the offending byte as '\x80'
* try and continue parsing the output.
  * as long as the TAP lines themselves are valid, we're fine.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
---
 tools/testing/kunit/kunit.py        | 3 ++-
 tools/testing/kunit/kunit_kernel.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)


base-commit: a032094fc1ed17070df01de4a7883da7bb8d5741

Comments

Brendan Higgins Oct. 8, 2021, 9:15 p.m. UTC | #1
On Fri, Oct 8, 2021 at 2:08 PM Daniel Latypov <dlatypov@google.com> wrote:
>
> kunit.py currently crashes and fails to parse kernel output if it's not
> fully valid utf-8.
>
> This can come from memory corruption or or just inadvertently printing
> out binary data as strings.
>
> E.g. adding this line into a kunit test
>   pr_info("\x80")
> will cause this exception
>   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 1961: invalid start byte
>
> We can tell Python how to handle errors, see
> https://docs.python.org/3/library/codecs.html#error-handlers
>
> Unfortunately, it doesn't seem like there's a way to specify this in
> just one location, so we need to repeat ourselves quite a bit.
>
> Specify `errors='backslashreplace'` so we instead:
> * print out the offending byte as '\x80'
> * try and continue parsing the output.
>   * as long as the TAP lines themselves are valid, we're fine.
>
> Signed-off-by: Daniel Latypov <dlatypov@google.com>

Thanks for fixing this!

Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Daniel Latypov Oct. 8, 2021, 11:51 p.m. UTC | #2
On Fri, Oct 8, 2021 at 2:08 PM Daniel Latypov <dlatypov@google.com> wrote:
>
> kunit.py currently crashes and fails to parse kernel output if it's not
> fully valid utf-8.
>
> This can come from memory corruption or or just inadvertently printing
> out binary data as strings.
>
> E.g. adding this line into a kunit test
>   pr_info("\x80")
> will cause this exception
>   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 1961: invalid start byte
>
> We can tell Python how to handle errors, see
> https://docs.python.org/3/library/codecs.html#error-handlers
>
> Unfortunately, it doesn't seem like there's a way to specify this in
> just one location, so we need to repeat ourselves quite a bit.
>
> Specify `errors='backslashreplace'` so we instead:
> * print out the offending byte as '\x80'
> * try and continue parsing the output.
>   * as long as the TAP lines themselves are valid, we're fine.
>
> Signed-off-by: Daniel Latypov <dlatypov@google.com>
> ---
>  tools/testing/kunit/kunit.py        | 3 ++-
>  tools/testing/kunit/kunit_kernel.py | 4 ++--
>  2 files changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
> index 9c9ed4071e9e..28ae096d4b53 100755
> --- a/tools/testing/kunit/kunit.py
> +++ b/tools/testing/kunit/kunit.py
> @@ -457,9 +457,10 @@ def main(argv, linux=None):
>                         sys.exit(1)
>         elif cli_args.subcommand == 'parse':
>                 if cli_args.file == None:
> +                       sys.stdin.reconfigure(errors='backslashreplace')

Ugh, pytype doesn't like this even though it's valid.
I can squash the error with
  sys.stdin.reconfigure(errors='backslashreplace')  # pytype: disable=attribute-error

I had wanted us to avoid having anything specific to pytype in the code.
But mypy (the more common typechecker iirc) hasn't been smart enough
to typecheck our code since the QEMU support landed.

If we don't add this directive, both typecheckers will report at least
one spurious warning.
Should I go ahead and add it, Brendan/David?

>                         kunit_output = sys.stdin
>                 else:
> -                       with open(cli_args.file, 'r') as f:
> +                       with open(cli_args.file, 'r', errors='backslashreplace') as f:
>                                 kunit_output = f.read().splitlines()
>                 request = KunitParseRequest(cli_args.raw_output,
>                                             None,
> diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
> index faa6320e900e..f08c6c36a947 100644
> --- a/tools/testing/kunit/kunit_kernel.py
> +++ b/tools/testing/kunit/kunit_kernel.py
> @@ -135,7 +135,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
>                                            stdin=subprocess.PIPE,
>                                            stdout=subprocess.PIPE,
>                                            stderr=subprocess.STDOUT,
> -                                          text=True, shell=True)
> +                                          text=True, shell=True, errors='backslashreplace')
>
>  class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
>         """An abstraction over command line operations performed on a source tree."""
> @@ -172,7 +172,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
>                                            stdin=subprocess.PIPE,
>                                            stdout=subprocess.PIPE,
>                                            stderr=subprocess.STDOUT,
> -                                          text=True)
> +                                          text=True, errors='backslashreplace')
>
>  def get_kconfig_path(build_dir) -> str:
>         return get_file_path(build_dir, KCONFIG_PATH)
>
> base-commit: a032094fc1ed17070df01de4a7883da7bb8d5741
> --
> 2.33.0.882.g93a45727a2-goog
>
Daniel Latypov Oct. 13, 2021, 4:51 p.m. UTC | #3
On Fri, Oct 8, 2021 at 4:51 PM Daniel Latypov <dlatypov@google.com> wrote:
>
> On Fri, Oct 8, 2021 at 2:08 PM Daniel Latypov <dlatypov@google.com> wrote:
> >
> > kunit.py currently crashes and fails to parse kernel output if it's not
> > fully valid utf-8.
> >
> > This can come from memory corruption or or just inadvertently printing
> > out binary data as strings.
> >
> > E.g. adding this line into a kunit test
> >   pr_info("\x80")
> > will cause this exception
> >   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 1961: invalid start byte
> >
> > We can tell Python how to handle errors, see
> > https://docs.python.org/3/library/codecs.html#error-handlers
> >
> > Unfortunately, it doesn't seem like there's a way to specify this in
> > just one location, so we need to repeat ourselves quite a bit.
> >
> > Specify `errors='backslashreplace'` so we instead:
> > * print out the offending byte as '\x80'
> > * try and continue parsing the output.
> >   * as long as the TAP lines themselves are valid, we're fine.
> >
> > Signed-off-by: Daniel Latypov <dlatypov@google.com>
> > ---
> >  tools/testing/kunit/kunit.py        | 3 ++-
> >  tools/testing/kunit/kunit_kernel.py | 4 ++--
> >  2 files changed, 4 insertions(+), 3 deletions(-)
> >
> > diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
> > index 9c9ed4071e9e..28ae096d4b53 100755
> > --- a/tools/testing/kunit/kunit.py
> > +++ b/tools/testing/kunit/kunit.py
> > @@ -457,9 +457,10 @@ def main(argv, linux=None):
> >                         sys.exit(1)
> >         elif cli_args.subcommand == 'parse':
> >                 if cli_args.file == None:
> > +                       sys.stdin.reconfigure(errors='backslashreplace')
>
> Ugh, pytype doesn't like this even though it's valid.
> I can squash the error with
>   sys.stdin.reconfigure(errors='backslashreplace')  # pytype:
> disable=attribute-error
>
> I had wanted us to avoid having anything specific to pytype in the code.
> But mypy (the more common typechecker iirc) hasn't been smart enough
> to typecheck our code since the QEMU support landed.
>
> If we don't add this directive, both typecheckers will report at least
> one spurious warning.
> Should I go ahead and add it, Brendan/David?

Friendly ping.
Should we go ahead and add "# pytype: disable=attribute-error" here?

>
> >                         kunit_output = sys.stdin
> >                 else:
> > -                       with open(cli_args.file, 'r') as f:
> > +                       with open(cli_args.file, 'r', errors='backslashreplace') as f:
> >                                 kunit_output = f.read().splitlines()
> >                 request = KunitParseRequest(cli_args.raw_output,
> >                                             None,
> > diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
> > index faa6320e900e..f08c6c36a947 100644
> > --- a/tools/testing/kunit/kunit_kernel.py
> > +++ b/tools/testing/kunit/kunit_kernel.py
> > @@ -135,7 +135,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
> >                                            stdin=subprocess.PIPE,
> >                                            stdout=subprocess.PIPE,
> >                                            stderr=subprocess.STDOUT,
> > -                                          text=True, shell=True)
> > +                                          text=True, shell=True, errors='backslashreplace')
> >
> >  class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
> >         """An abstraction over command line operations performed on a source tree."""
> > @@ -172,7 +172,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
> >                                            stdin=subprocess.PIPE,
> >                                            stdout=subprocess.PIPE,
> >                                            stderr=subprocess.STDOUT,
> > -                                          text=True)
> > +                                          text=True, errors='backslashreplace')
> >
> >  def get_kconfig_path(build_dir) -> str:
> >         return get_file_path(build_dir, KCONFIG_PATH)
> >
> > base-commit: a032094fc1ed17070df01de4a7883da7bb8d5741
> > --
> > 2.33.0.882.g93a45727a2-goog
> >
Daniel Latypov Oct. 20, 2021, 11:22 p.m. UTC | #4
On Wed, Oct 13, 2021 at 9:51 AM Daniel Latypov <dlatypov@google.com> wrote:
>
> On Fri, Oct 8, 2021 at 4:51 PM Daniel Latypov <dlatypov@google.com> wrote:
> >
> > On Fri, Oct 8, 2021 at 2:08 PM Daniel Latypov <dlatypov@google.com> wrote:
> > >
> > > kunit.py currently crashes and fails to parse kernel output if it's not
> > > fully valid utf-8.
> > >
> > > This can come from memory corruption or or just inadvertently printing
> > > out binary data as strings.
> > >
> > > E.g. adding this line into a kunit test
> > >   pr_info("\x80")
> > > will cause this exception
> > >   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 1961: invalid start byte
> > >
> > > We can tell Python how to handle errors, see
> > > https://docs.python.org/3/library/codecs.html#error-handlers
> > >
> > > Unfortunately, it doesn't seem like there's a way to specify this in
> > > just one location, so we need to repeat ourselves quite a bit.
> > >
> > > Specify `errors='backslashreplace'` so we instead:
> > > * print out the offending byte as '\x80'
> > > * try and continue parsing the output.
> > >   * as long as the TAP lines themselves are valid, we're fine.
> > >
> > > Signed-off-by: Daniel Latypov <dlatypov@google.com>
> > > ---
> > >  tools/testing/kunit/kunit.py        | 3 ++-
> > >  tools/testing/kunit/kunit_kernel.py | 4 ++--
> > >  2 files changed, 4 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
> > > index 9c9ed4071e9e..28ae096d4b53 100755
> > > --- a/tools/testing/kunit/kunit.py
> > > +++ b/tools/testing/kunit/kunit.py
> > > @@ -457,9 +457,10 @@ def main(argv, linux=None):
> > >                         sys.exit(1)
> > >         elif cli_args.subcommand == 'parse':
> > >                 if cli_args.file == None:
> > > +                       sys.stdin.reconfigure(errors='backslashreplace')
> >
> > Ugh, pytype doesn't like this even though it's valid.
> > I can squash the error with
> >   sys.stdin.reconfigure(errors='backslashreplace')  # pytype:
> > disable=attribute-error
> >
> > I had wanted us to avoid having anything specific to pytype in the code.
> > But mypy (the more common typechecker iirc) hasn't been smart enough
> > to typecheck our code since the QEMU support landed.
> >
> > If we don't add this directive, both typecheckers will report at least
> > one spurious warning.
> > Should I go ahead and add it, Brendan/David?
>
> Friendly ping.
> Should we go ahead and add "# pytype: disable=attribute-error" here?

I've sent out a v2 with this:
https://lore.kernel.org/linux-kselftest/20211020232121.1748376-1-dlatypov@google.com

>
> >
> > >                         kunit_output = sys.stdin
> > >                 else:
> > > -                       with open(cli_args.file, 'r') as f:
> > > +                       with open(cli_args.file, 'r', errors='backslashreplace') as f:
> > >                                 kunit_output = f.read().splitlines()
> > >                 request = KunitParseRequest(cli_args.raw_output,
> > >                                             None,
> > > diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
> > > index faa6320e900e..f08c6c36a947 100644
> > > --- a/tools/testing/kunit/kunit_kernel.py
> > > +++ b/tools/testing/kunit/kunit_kernel.py
> > > @@ -135,7 +135,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
> > >                                            stdin=subprocess.PIPE,
> > >                                            stdout=subprocess.PIPE,
> > >                                            stderr=subprocess.STDOUT,
> > > -                                          text=True, shell=True)
> > > +                                          text=True, shell=True, errors='backslashreplace')
> > >
> > >  class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
> > >         """An abstraction over command line operations performed on a source tree."""
> > > @@ -172,7 +172,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
> > >                                            stdin=subprocess.PIPE,
> > >                                            stdout=subprocess.PIPE,
> > >                                            stderr=subprocess.STDOUT,
> > > -                                          text=True)
> > > +                                          text=True, errors='backslashreplace')
> > >
> > >  def get_kconfig_path(build_dir) -> str:
> > >         return get_file_path(build_dir, KCONFIG_PATH)
> > >
> > > base-commit: a032094fc1ed17070df01de4a7883da7bb8d5741
> > > --
> > > 2.33.0.882.g93a45727a2-goog
> > >
Brendan Higgins Oct. 25, 2021, 9:29 p.m. UTC | #5
On Wed, Oct 13, 2021 at 9:52 AM Daniel Latypov <dlatypov@google.com> wrote:
>
> On Fri, Oct 8, 2021 at 4:51 PM Daniel Latypov <dlatypov@google.com> wrote:
> >
> > On Fri, Oct 8, 2021 at 2:08 PM Daniel Latypov <dlatypov@google.com> wrote:
> > >
> > > kunit.py currently crashes and fails to parse kernel output if it's not
> > > fully valid utf-8.
> > >
> > > This can come from memory corruption or or just inadvertently printing
> > > out binary data as strings.
> > >
> > > E.g. adding this line into a kunit test
> > >   pr_info("\x80")
> > > will cause this exception
> > >   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 1961: invalid start byte
> > >
> > > We can tell Python how to handle errors, see
> > > https://docs.python.org/3/library/codecs.html#error-handlers
> > >
> > > Unfortunately, it doesn't seem like there's a way to specify this in
> > > just one location, so we need to repeat ourselves quite a bit.
> > >
> > > Specify `errors='backslashreplace'` so we instead:
> > > * print out the offending byte as '\x80'
> > > * try and continue parsing the output.
> > >   * as long as the TAP lines themselves are valid, we're fine.
> > >
> > > Signed-off-by: Daniel Latypov <dlatypov@google.com>
> > > ---
> > >  tools/testing/kunit/kunit.py        | 3 ++-
> > >  tools/testing/kunit/kunit_kernel.py | 4 ++--
> > >  2 files changed, 4 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
> > > index 9c9ed4071e9e..28ae096d4b53 100755
> > > --- a/tools/testing/kunit/kunit.py
> > > +++ b/tools/testing/kunit/kunit.py
> > > @@ -457,9 +457,10 @@ def main(argv, linux=None):
> > >                         sys.exit(1)
> > >         elif cli_args.subcommand == 'parse':
> > >                 if cli_args.file == None:
> > > +                       sys.stdin.reconfigure(errors='backslashreplace')
> >
> > Ugh, pytype doesn't like this even though it's valid.
> > I can squash the error with
> >   sys.stdin.reconfigure(errors='backslashreplace')  # pytype:
> > disable=attribute-error
> >
> > I had wanted us to avoid having anything specific to pytype in the code.
> > But mypy (the more common typechecker iirc) hasn't been smart enough
> > to typecheck our code since the QEMU support landed.
> >
> > If we don't add this directive, both typecheckers will report at least
> > one spurious warning.
> > Should I go ahead and add it, Brendan/David?
>
> Friendly ping.
> Should we go ahead and add "# pytype: disable=attribute-error" here?

Sorry, missed this.

Yeah, I am fine with disabling the type checkers if they fail to
understand valid code.

> > >                         kunit_output = sys.stdin
> > >                 else:
> > > -                       with open(cli_args.file, 'r') as f:
> > > +                       with open(cli_args.file, 'r', errors='backslashreplace') as f:
> > >                                 kunit_output = f.read().splitlines()
> > >                 request = KunitParseRequest(cli_args.raw_output,
> > >                                             None,
> > > diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
> > > index faa6320e900e..f08c6c36a947 100644
> > > --- a/tools/testing/kunit/kunit_kernel.py
> > > +++ b/tools/testing/kunit/kunit_kernel.py
> > > @@ -135,7 +135,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
> > >                                            stdin=subprocess.PIPE,
> > >                                            stdout=subprocess.PIPE,
> > >                                            stderr=subprocess.STDOUT,
> > > -                                          text=True, shell=True)
> > > +                                          text=True, shell=True, errors='backslashreplace')
> > >
> > >  class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
> > >         """An abstraction over command line operations performed on a source tree."""
> > > @@ -172,7 +172,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
> > >                                            stdin=subprocess.PIPE,
> > >                                            stdout=subprocess.PIPE,
> > >                                            stderr=subprocess.STDOUT,
> > > -                                          text=True)
> > > +                                          text=True, errors='backslashreplace')
> > >
> > >  def get_kconfig_path(build_dir) -> str:
> > >         return get_file_path(build_dir, KCONFIG_PATH)
> > >
> > > base-commit: a032094fc1ed17070df01de4a7883da7bb8d5741
> > > --
> > > 2.33.0.882.g93a45727a2-goog
> > >
diff mbox series

Patch

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 9c9ed4071e9e..28ae096d4b53 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -457,9 +457,10 @@  def main(argv, linux=None):
 			sys.exit(1)
 	elif cli_args.subcommand == 'parse':
 		if cli_args.file == None:
+			sys.stdin.reconfigure(errors='backslashreplace')
 			kunit_output = sys.stdin
 		else:
-			with open(cli_args.file, 'r') as f:
+			with open(cli_args.file, 'r', errors='backslashreplace') as f:
 				kunit_output = f.read().splitlines()
 		request = KunitParseRequest(cli_args.raw_output,
 					    None,
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index faa6320e900e..f08c6c36a947 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -135,7 +135,7 @@  class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
 					   stdin=subprocess.PIPE,
 					   stdout=subprocess.PIPE,
 					   stderr=subprocess.STDOUT,
-					   text=True, shell=True)
+					   text=True, shell=True, errors='backslashreplace')
 
 class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
 	"""An abstraction over command line operations performed on a source tree."""
@@ -172,7 +172,7 @@  class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
 					   stdin=subprocess.PIPE,
 					   stdout=subprocess.PIPE,
 					   stderr=subprocess.STDOUT,
-					   text=True)
+					   text=True, errors='backslashreplace')
 
 def get_kconfig_path(build_dir) -> str:
 	return get_file_path(build_dir, KCONFIG_PATH)