mbox series

[v4,0/5] Fix and extend encoding handling in fast export/import

Message ID 20190513164722.31534-1-newren@gmail.com (mailing list archive)
Headers show
Series Fix and extend encoding handling in fast export/import | expand

Message

Elijah Newren May 13, 2019, 4:47 p.m. UTC
While stress testing `git filter-repo`, I noticed an issue with
encoding; further digging led to the fixes and features in this series.
See the individual commit messages for details.

Changes since v3 (full range-diff below):
  * YES/NO changes suggested by Torsten
  * more boolean synonyms as suggested by Junio
  * check for the exact expected special bytes, in addition to the size
    (Dscho pointed out that it was GitForWindows that munged bytes, not
     Windows, so while I need to be careful in what I pass to git, printf
     and grep can work directly with the special bytes)
  * also checked on gitgitgadget that it passes on the major platforms

[1] https://github.com/gitgitgadget/git/pull/191

Elijah Newren (5):
  t9350: fix encoding test to actually test reencoding
  fast-import: support 'encoding' commit header
  fast-export: avoid stripping encoding header if we cannot reencode
  fast-export: differentiate between explicitly utf-8 and implicitly
    utf-8
  fast-export: do automatic reencoding of commit messages only if
    requested

 Documentation/git-fast-import.txt            |  7 ++
 builtin/fast-export.c                        | 44 +++++++++--
 fast-import.c                                | 11 ++-
 t/t9300-fast-import.sh                       | 20 +++++
 t/t9350-fast-export.sh                       | 78 +++++++++++++++++---
 t/t9350/broken-iso-8859-7-commit-message.txt |  1 +
 t/t9350/simple-iso-8859-7-commit-message.txt |  1 +
 7 files changed, 145 insertions(+), 17 deletions(-)
 create mode 100644 t/t9350/broken-iso-8859-7-commit-message.txt
 create mode 100644 t/t9350/simple-iso-8859-7-commit-message.txt

Range-diff:
1:  2d7bb64acf ! 1:  37a68a0ffd t9350: fix encoding test to actually test reencoding
    @@ -39,18 +39,16 @@
      		 git fast-import &&
     +		 # The commit object, if not re-encoded, would be 240 bytes.
     +		 # Removing the "encoding iso-8859-7\n" header drops 20 bytes.
    -+		 # Re-encoding the Pi character from \xF0 in iso-8859-7 to
    -+		 # \xCF\x80 in utf-8 adds a byte.  Grepping for specific bytes
    -+		 # would be nice, but Windows apparently munges user data
    -+		 # in the form of bytes on the command line to force them to
    -+		 # be characters instead, so we are limited for portability
    -+		 # reasons in subsequent similar tests in this file to check
    -+		 # for size rather than what bytes are present.
    ++		 # Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
    ++		 # to \xCF\x80 (\317\200) in utf-8 adds a byte.  Check for
    ++		 # the expected size.
     +		 test 221 -eq "$(git cat-file -s i18n)" &&
    -+		 # Also make sure the commit does not have the "encoding" header
    ++		 # ...and for the expected translation of bytes.
      		 git cat-file commit i18n >actual &&
     -		 grep "Áéí óú" actual)
     -
    ++		 grep $(printf "\317\200") actual &&
    ++		 # Also make sure the commit does not have the "encoding" header
     +		 ! grep ^encoding actual)
      '
     +
2:  9fa5695017 = 2:  3d84f4613d fast-import: support 'encoding' commit header
3:  dfc76573e9 ! 3:  baa8394a3a fast-export: avoid stripping encoding header if we cannot reencode
    @@ -49,10 +49,14 @@
     +		(cd new &&
     +		 git fast-import &&
     +		 git cat-file commit i18n-invalid >actual &&
    ++		 # Make sure the commit still has the encoding header
     +		 grep ^encoding actual &&
    -+		 # Also verify that the commit has the expected size; i.e.
    ++		 # Verify that the commit has the expected size; i.e.
     +		 # that no bytes were re-encoded to a different encoding.
    -+		 test 252 -eq "$(git cat-file -s i18n-invalid)")
    ++		 test 252 -eq "$(git cat-file -s i18n-invalid)" &&
    ++		 # ...and check for the original special bytes
    ++		 grep $(printf "\360") actual &&
    ++		 grep $(printf "\377") actual)
     +'
     +
      test_expect_success 'import/export-marks' '
4:  83b3656b76 = 4:  49960164c6 fast-export: differentiate between explicitly utf-8 and implicitly utf-8
5:  2063122293 ! 5:  571613a09e fast-export: do automatic reencoding of commit messages only if requested
    @@ -20,7 +20,7 @@
      static int progress;
      static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
      static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
    -+static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER } reencode_mode = REENCODE_ABORT;
    ++static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
      static int fake_missing_tagger;
      static int use_done_feature;
      static int no_data;
    @@ -33,10 +33,10 @@
     +{
     +	if (unset || !strcmp(arg, "abort"))
     +		reencode_mode = REENCODE_ABORT;
    -+	else if (!strcmp(arg, "yes"))
    -+		reencode_mode = REENCODE_PLEASE;
    -+	else if (!strcmp(arg, "no"))
    -+		reencode_mode = REENCODE_NEVER;
    ++	else if (!strcmp(arg, "yes") || !strcmp(arg, "true") || !strcmp(arg, "on"))
    ++		reencode_mode = REENCODE_YES;
    ++	else if (!strcmp(arg, "no") || !strcmp(arg, "false") || !strcmp(arg, "off"))
    ++		reencode_mode = REENCODE_NO;
     +	else
     +		return error("Unknown reencoding mode: %s", arg);
     +	return 0;
    @@ -56,14 +56,14 @@
     -		reencoded = reencode_string(message, "UTF-8", encoding);
     +	} else if (encoding) {
     +		switch(reencode_mode) {
    -+		case REENCODE_PLEASE:
    ++		case REENCODE_YES:
     +			reencoded = reencode_string(message, "UTF-8", encoding);
     +			break;
    -+		case REENCODE_NEVER:
    ++		case REENCODE_NO:
     +			break;
     +		case REENCODE_ABORT:
     +			die("Encountered commit-specific encoding %s in commit "
    -+			    "%s; use --reencode=<mode> to handle it",
    ++			    "%s; use --reencode=[yes|no] to handle it",
     +			    encoding, oid_to_hex(&commit->object.oid));
     +		}
     +	}
    @@ -126,13 +126,14 @@
     +		 git fast-import &&
     +		 # The commit object, if not re-encoded, is 240 bytes.
     +		 # Removing the "encoding iso-8859-7\n" header would drops 20
    -+		 # bytes.  Re-encoding the Pi character from \xF0 in
    -+		 # iso-8859-7 to \xCF\x80 in utf-8 would add a byte.  I would
    -+		 # grep for the # specific bytes, but Windows lamely does not
    -+		 # allow that, so just search for the expected size.
    ++		 # bytes.  Re-encoding the Pi character from \xF0 (\360) in
    ++		 # iso-8859-7 to \xCF\x80 (\317\200) in utf-8 adds a byte.
    ++		 # Check for the expected size...
     +		 test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
    -+		 # Also make sure the commit has the "encoding" header
    ++		 # ...as well as the expected byte.
     +		 git cat-file commit i18n-no-recoding >actual &&
    ++		 grep $(printf "\360") actual &&
    ++		 # Also make sure the commit has the "encoding" header
     +		 grep ^encoding actual)
     +'
     +