diff mbox series

[v5,5/5] fast-export: do automatic reencoding of commit messages only if requested

Message ID 20190513231726.16218-6-newren@gmail.com (mailing list archive)
State New, archived
Headers show
Series Fix and extend encoding handling in fast export/import | expand

Commit Message

Elijah Newren May 13, 2019, 11:17 p.m. UTC
Automatic re-encoding of commit messages (and dropping of the encoding
header) hurts attempts to do reversible history rewrites (e.g. sha1sum
<-> sha256sum transitions, some subtree rewrites), and seems
inconsistent with the general principle followed elsewhere in
fast-export of requiring explicit user requests to modify the output
(e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite).  Add a
--reencode flag that the user can use to specify, and like other
fast-export flags, default it to 'abort'.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 Documentation/git-fast-export.txt |  7 +++++
 builtin/fast-export.c             | 46 +++++++++++++++++++++++++++++--
 t/t9350-fast-export.sh            | 38 +++++++++++++++++++++++--
 3 files changed, 85 insertions(+), 6 deletions(-)

Comments

Eric Sunshine May 14, 2019, 12:19 a.m. UTC | #1
On Mon, May 13, 2019 at 7:17 PM Elijah Newren <newren@gmail.com> wrote:
> Automatic re-encoding of commit messages (and dropping of the encoding
> header) hurts attempts to do reversible history rewrites (e.g. sha1sum
> <-> sha256sum transitions, some subtree rewrites), and seems
> inconsistent with the general principle followed elsewhere in
> fast-export of requiring explicit user requests to modify the output
> (e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite).  Add a
> --reencode flag that the user can use to specify, and like other
> fast-export flags, default it to 'abort'.
>
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
> diff --git a/builtin/fast-export.c b/builtin/fast-export.c
> @@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
> +static int parse_opt_reencode_mode(const struct option *opt,
> +                                  const char *arg, int unset)
> +{
> +       if (unset) {
> +               reencode_mode = REENCODE_ABORT;
> +               return 0;
> +       }
> +
> +       switch (git_parse_maybe_bool(arg)) {
> +       case 0:
> +               reencode_mode = REENCODE_NO;
> +               break;
> +       case 1:
> +               reencode_mode = REENCODE_YES;
> +               break;
> +       default:
> +               if (arg && !strcasecmp(arg, "abort"))

If 'arg' is NULL, git_parse_maybe_bool() will have returned 1, which
is already handled above, so no need to check it here. So, dropping
the "arg &&" from the conditional...

> +                       reencode_mode = REENCODE_ABORT;
> +               else
> +                       return error("Unknown reencoding mode: %s", arg);

...makes it clear that you won't ever be passing NULL to "%s", which
is undefined behavior.

> +       }
> +
> +       return 0;
> +}
diff mbox series

Patch

diff --git a/Documentation/git-fast-export.txt b/Documentation/git-fast-export.txt
index 64c01ba918..11427acdde 100644
--- a/Documentation/git-fast-export.txt
+++ b/Documentation/git-fast-export.txt
@@ -129,6 +129,13 @@  marks the same across runs.
 	for intermediary filters (e.g. for rewriting commit messages
 	which refer to older commits, or for stripping blobs by id).
 
+--reencode=(yes|no|abort)::
+	Specify how to handle `encoding` header in commit objects.  When
+	asking to 'abort' (which is the default), this program will die
+	when encountering such a commit object.  With 'yes', the commit
+	message will be reencoded into UTF-8.  With 'no', the original
+	encoding will be preserved.
+
 --refspec::
 	Apply the specified refspec to each ref exported. Multiple of them can
 	be specified.
diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 66331fa401..0bb65b3886 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -33,6 +33,7 @@  static const char *fast_export_usage[] = {
 static int progress;
 static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
 static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
+static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
 static int fake_missing_tagger;
 static int use_done_feature;
 static int no_data;
@@ -77,6 +78,31 @@  static int parse_opt_tag_of_filtered_mode(const struct option *opt,
 	return 0;
 }
 
+static int parse_opt_reencode_mode(const struct option *opt,
+				   const char *arg, int unset)
+{
+	if (unset) {
+		reencode_mode = REENCODE_ABORT;
+		return 0;
+	}
+
+	switch (git_parse_maybe_bool(arg)) {
+	case 0:
+		reencode_mode = REENCODE_NO;
+		break;
+	case 1:
+		reencode_mode = REENCODE_YES;
+		break;
+	default:
+		if (arg && !strcasecmp(arg, "abort"))
+			reencode_mode = REENCODE_ABORT;
+		else
+			return error("Unknown reencoding mode: %s", arg);
+	}
+
+	return 0;
+}
+
 static struct decoration idnums;
 static uint32_t last_idnum;
 
@@ -633,10 +659,21 @@  static void handle_commit(struct commit *commit, struct rev_info *rev,
 	}
 
 	mark_next_object(&commit->object);
-	if (anonymize)
+	if (anonymize) {
 		reencoded = anonymize_commit_message(message);
-	else if (!is_encoding_utf8(encoding))
-		reencoded = reencode_string(message, "UTF-8", encoding);
+	} else if (encoding) {
+		switch(reencode_mode) {
+		case REENCODE_YES:
+			reencoded = reencode_string(message, "UTF-8", encoding);
+			break;
+		case REENCODE_NO:
+			break;
+		case REENCODE_ABORT:
+			die("Encountered commit-specific encoding %s in commit "
+			    "%s; use --reencode=[yes|no] to handle it",
+			    encoding, oid_to_hex(&commit->object.oid));
+		}
+	}
 	if (!commit->parents)
 		printf("reset %s\n", refname);
 	printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
@@ -1091,6 +1128,9 @@  int cmd_fast_export(int argc, const char **argv, const char *prefix)
 		OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
 			     N_("select handling of tags that tag filtered objects"),
 			     parse_opt_tag_of_filtered_mode),
+		OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
+			     N_("select handling of commit messages in an alternate encoding"),
+			     parse_opt_reencode_mode),
 		OPT_STRING(0, "export-marks", &export_filename, N_("file"),
 			     N_("Dump marks to this file")),
 		OPT_STRING(0, "import-marks", &import_filename, N_("file"),
diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh
index 4fd637312a..d21d7bf9a9 100755
--- a/t/t9350-fast-export.sh
+++ b/t/t9350-fast-export.sh
@@ -94,14 +94,14 @@  test_expect_success 'fast-export --show-original-ids | git fast-import' '
 	test $MUSS = $(git rev-parse --verify refs/tags/muss)
 '
 
-test_expect_success 'iso-8859-7' '
+test_expect_success 'reencoding iso-8859-7' '
 
 	test_when_finished "git reset --hard HEAD~1" &&
 	test_config i18n.commitencoding iso-8859-7 &&
 	test_tick &&
 	echo rosten >file &&
 	git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
-	git fast-export wer^..wer >iso-8859-7.fi &&
+	git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
 	sed "s/wer/i18n/" iso-8859-7.fi |
 		(cd new &&
 		 git fast-import &&
@@ -118,13 +118,45 @@  test_expect_success 'iso-8859-7' '
 		 ! grep ^encoding actual)
 '
 
+test_expect_success 'aborting on iso-8859-7' '
+
+	test_when_finished "git reset --hard HEAD~1" &&
+	test_config i18n.commitencoding iso-8859-7 &&
+	echo rosten >file &&
+	git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+	test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
+'
+
+test_expect_success 'preserving iso-8859-7' '
+
+	test_when_finished "git reset --hard HEAD~1" &&
+	test_config i18n.commitencoding iso-8859-7 &&
+	echo rosten >file &&
+	git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+	git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
+	sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
+		(cd new &&
+		 git fast-import &&
+		 # The commit object, if not re-encoded, is 240 bytes.
+		 # Removing the "encoding iso-8859-7\n" header would drops 20
+		 # bytes.  Re-encoding the Pi character from \xF0 (\360) in
+		 # iso-8859-7 to \xCF\x80 (\317\200) in utf-8 adds a byte.
+		 # Check for the expected size...
+		 test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
+		 # ...as well as the expected byte.
+		 git cat-file commit i18n-no-recoding >actual &&
+		 grep $(printf "\360") actual &&
+		 # Also make sure the commit has the "encoding" header
+		 grep ^encoding actual)
+'
+
 test_expect_success 'encoding preserved if reencoding fails' '
 
 	test_when_finished "git reset --hard HEAD~1" &&
 	test_config i18n.commitencoding iso-8859-7 &&
 	echo rosten >file &&
 	git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
-	git fast-export wer^..wer >iso-8859-7.fi &&
+	git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
 	sed "s/wer/i18n-invalid/" iso-8859-7.fi |
 		(cd new &&
 		 git fast-import &&