diff mbox series

[2/5] fast-import: support 'encoding' commit header

Message ID 20190425155118.7918-3-newren@gmail.com (mailing list archive)
State New, archived
Headers show
Series Fix and extend encoding handling in fast export/import | expand

Commit Message

Elijah Newren April 25, 2019, 3:51 p.m. UTC
Since git supports commit messages with an encoding other than utf-8,
allow fast-import to import such commits.  This may be useful for folks
who do not want to reencode commit messages from an external system, and
may also be useful to achieve reversible history rewrites (e.g. sha1sum
<-> sha256sum transitions or subtree work) with git repositories that
have used specialized encodings in their commit history.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 Documentation/git-fast-import.txt |  7 +++++++
 fast-import.c                     | 12 ++++++++++--
 t/t9300-fast-import.sh            | 20 ++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

Comments

Eric Sunshine April 25, 2019, 7:36 p.m. UTC | #1
On Thu, Apr 25, 2019 at 11:51 AM Elijah Newren <newren@gmail.com> wrote:
> Since git supports commit messages with an encoding other than utf-8,
> allow fast-import to import such commits.  This may be useful for folks
> who do not want to reencode commit messages from an external system, and
> may also be useful to achieve reversible history rewrites (e.g. sha1sum
> <-> sha256sum transitions or subtree work) with git repositories that
> have used specialized encodings in their commit history.
>
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
> diff --git a/fast-import.c b/fast-import.c
> @@ -2607,6 +2608,9 @@ static void parse_new_commit(const char *arg)
>         if (!committer)
>                 die("Expected committer but didn't get one");
> +       if (skip_prefix(command_buf.buf, "encoding ", &encoding)) {
> +               read_next_command();
> +       }

Style nit: unnecessary braces

> @@ -2670,9 +2674,13 @@ static void parse_new_commit(const char *arg)
>         strbuf_addf(&new_data,
>                 "author %s\n"
> -               "committer %s\n"
> -               "\n",
> +               "committer %s\n",
>                 author ? author : committer, committer);
> +       if (encoding)
> +               strbuf_addf(&new_data,
> +                       "encoding %s\n",
> +                       encoding);
> +       strbuf_addf(&new_data, "\n");

Alternately:

    strbuf_addch(&new_data, '\n');

> diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
> @@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
> +test_expect_success 'X: handling encoding' '
> +       test_tick &&
> +       [...]
> +       git cat-file -p encoding | grep $(printf "\360") &&
> +       git log -1 --format=%B encoding | grep $(printf "\317\200")

This script is already full of instances of Git commands upstream of
pipes, so this usage is consistent (despite recent work to eliminate
such situations). Okay.
Elijah Newren April 26, 2019, 11:39 a.m. UTC | #2
Hi Eric,

On Thu, Apr 25, 2019 at 1:37 PM Eric Sunshine <sunshine@sunshineco.com> wrote:
>
> On Thu, Apr 25, 2019 at 11:51 AM Elijah Newren <newren@gmail.com> wrote:
> > Since git supports commit messages with an encoding other than utf-8,
> > allow fast-import to import such commits.  This may be useful for folks
> > who do not want to reencode commit messages from an external system, and
> > may also be useful to achieve reversible history rewrites (e.g. sha1sum
> > <-> sha256sum transitions or subtree work) with git repositories that
> > have used specialized encodings in their commit history.
> >
> > Signed-off-by: Elijah Newren <newren@gmail.com>
> > ---
> > diff --git a/fast-import.c b/fast-import.c
> > @@ -2607,6 +2608,9 @@ static void parse_new_commit(const char *arg)
> >         if (!committer)
> >                 die("Expected committer but didn't get one");
> > +       if (skip_prefix(command_buf.buf, "encoding ", &encoding)) {
> > +               read_next_command();
> > +       }
>
> Style nit: unnecessary braces
>
> > @@ -2670,9 +2674,13 @@ static void parse_new_commit(const char *arg)
> >         strbuf_addf(&new_data,
> >                 "author %s\n"
> > -               "committer %s\n"
> > -               "\n",
> > +               "committer %s\n",
> >                 author ? author : committer, committer);
> > +       if (encoding)
> > +               strbuf_addf(&new_data,
> > +                       "encoding %s\n",
> > +                       encoding);
> > +       strbuf_addf(&new_data, "\n");
>
> Alternately:
>
>     strbuf_addch(&new_data, '\n');

Thanks for taking a look.  I'll fix both of these items you
highlighted and the test_config item you pointed out in 1/5 in the
next re-roll.

> > diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
> > @@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
> > +test_expect_success 'X: handling encoding' '
> > +       test_tick &&
> > +       [...]
> > +       git cat-file -p encoding | grep $(printf "\360") &&
> > +       git log -1 --format=%B encoding | grep $(printf "\317\200")
>
> This script is already full of instances of Git commands upstream of
> pipes, so this usage is consistent (despite recent work to eliminate
> such situations). Okay.
diff mbox series

Patch

diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt
index d65cdb3d08..7baf9e47b5 100644
--- a/Documentation/git-fast-import.txt
+++ b/Documentation/git-fast-import.txt
@@ -388,6 +388,7 @@  change to the project.
 	original-oid?
 	('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
 	'committer' (SP <name>)? SP LT <email> GT SP <when> LF
+	('encoding' SP <encoding> LF)?
 	data
 	('from' SP <commit-ish> LF)?
 	('merge' SP <commit-ish> LF)?
@@ -455,6 +456,12 @@  that was selected by the --date-format=<fmt> command-line option.
 See ``Date Formats'' above for the set of supported formats, and
 their syntax.
 
+`encoding`
+^^^^^^^^^^
+The optional `encoding` command indicates the encoding of the commit
+message.  Most commits are UTF-8 and the encoding is omitted, but this
+allows importing commit messages into git without first reencoding them.
+
 `from`
 ^^^^^^
 The `from` command is used to specify the commit to initialize
diff --git a/fast-import.c b/fast-import.c
index f38d04fa58..25026c068a 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -2585,6 +2585,7 @@  static void parse_new_commit(const char *arg)
 	struct branch *b;
 	char *author = NULL;
 	char *committer = NULL;
+	const char *encoding = NULL;
 	struct hash_list *merge_list = NULL;
 	unsigned int merge_count;
 	unsigned char prev_fanout, new_fanout;
@@ -2607,6 +2608,9 @@  static void parse_new_commit(const char *arg)
 	}
 	if (!committer)
 		die("Expected committer but didn't get one");
+	if (skip_prefix(command_buf.buf, "encoding ", &encoding)) {
+		read_next_command();
+	}
 	parse_data(&msg, 0, NULL);
 	read_next_command();
 	parse_from(b);
@@ -2670,9 +2674,13 @@  static void parse_new_commit(const char *arg)
 	}
 	strbuf_addf(&new_data,
 		"author %s\n"
-		"committer %s\n"
-		"\n",
+		"committer %s\n",
 		author ? author : committer, committer);
+	if (encoding)
+		strbuf_addf(&new_data,
+			"encoding %s\n",
+			encoding);
+	strbuf_addf(&new_data, "\n");
 	strbuf_addbuf(&new_data, &msg);
 	free(author);
 	free(committer);
diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
index 3668263c40..141b7fa35e 100755
--- a/t/t9300-fast-import.sh
+++ b/t/t9300-fast-import.sh
@@ -3299,4 +3299,24 @@  test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
 	sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
 '
 
+###
+### series X (other new features)
+###
+
+test_expect_success 'X: handling encoding' '
+	test_tick &&
+	cat >input <<-INPUT_END &&
+	commit refs/heads/encoding
+	committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+	encoding iso-8859-7
+	data <<COMMIT
+	INPUT_END
+
+	printf "Pi: \360\nCOMMIT\n" >>input &&
+
+	git fast-import <input &&
+	git cat-file -p encoding | grep $(printf "\360") &&
+	git log -1 --format=%B encoding | grep $(printf "\317\200")
+'
+
 test_done