diff mbox series

[v4,4/5] fast-export: do not modify memory from get_commit_buffer

Message ID 20210430232537.1131641-5-lukeshu@lukeshu.com (mailing list archive)
State New, archived
Headers show
Series fast-export, fast-import: add support for signed-commits | expand

Commit Message

Luke Shumaker April 30, 2021, 11:25 p.m. UTC
From: Luke Shumaker <lukeshu@datawire.io>

fast-export's helper function find_encoding() takes a `const char *`, but
modifies that memory despite the `const`.  Ultimately, this memory came
from get_commit_buffer(), and you're not supposed to modify the memory
that you get from get_commit_buffer().

So, get rid of find_encoding() in favor of commit.h:find_commit_header(),
which gives back a string length, rather than mutating the memory to
insert a '\0' terminator.

Because find_commit_header() detects the "\n\n" string that separates the
headers and the commit message, move the call to be above the
`message = strstr(..., "\n\n")` call.  This helps readability, and allows
for the value of `encoding` to be used for a better value of "..." so that
the same memory doesn't need to be checked twice.  Introduce a
`commit_buffer_cursor` variable to avoid writing an awkward
`encoding ? encoding + encoding_len : committer_end` expression.

Signed-off-by: Luke Shumaker <lukeshu@datawire.io>
---

Notes:
    v4: This commit is new in v4.

 builtin/fast-export.c | 65 ++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

Comments

Junio C Hamano May 3, 2021, 4:41 a.m. UTC | #1
Luke Shumaker <lukeshu@lukeshu.com> writes:

> +static char *reencode_message(const char *in_msg,
> +			      const char *in_encoding, size_t in_encoding_len)
> +{
> +	static struct strbuf in_encoding_buf = STRBUF_INIT;
> +
> +	strbuf_reset(&in_encoding_buf);
> +	strbuf_add(&in_encoding_buf, in_encoding, in_encoding_len);
> +
> +	return reencode_string(in_msg, "UTF-8", in_encoding_buf.buf);
> +}

There is only a single caller of this, so making it the caller's
responsibility to do the strbuf thing would allow us to make this
thread-safe quite easily (and at that point we might not even have
this helper function).

> +	committer = strstr(commit_buffer_cursor, "\ncommitter ");
>  	if (!committer)
>  		die("could not find committer in commit %s",
>  		    oid_to_hex(&commit->object.oid));
>  	committer++;
> -	committer_end = strchrnul(committer, '\n');
> -	message = strstr(committer_end, "\n\n");
> -	encoding = find_encoding(committer_end, message);
> +	commit_buffer_cursor = committer_end = strchrnul(committer, '\n');
> +
> +	/* find_commit_header() gets a `+ 1` because
> +	 * commit_buffer_cursor points at the trailing "\n" at the end
> +	 * of the previous line, but find_commit_header() wants a
> +	 * pointer to the beginning of the next line. */
> +	encoding = find_commit_header(commit_buffer_cursor + 1, "encoding", &encoding_len);

	/*
	 * Our multi-line comments have opening and closing
	 * slash-asterisk and asterisk-slash on their own
	 * lines.
	 */

What if strchrnul() returned a pointer to the terminating NUL
instead of the LF at the end of the line?  +1 will run past the end
of the buffer.

> +	if (encoding)
> +		commit_buffer_cursor = encoding + encoding_len;
> +
> +	message = strstr(commit_buffer_cursor, "\n\n");

Good.

> @@ -685,14 +693,15 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
>  	} else if (encoding) {
>  		switch(reencode_mode) {
>  		case REENCODE_YES:
> -			reencoded = reencode_string(message, "UTF-8", encoding);
> +			reencoded = reencode_message(message, encoding, encoding_len);
>  			break;

Here is where we can use a temporary strbuf to hold encoding[0,
encoding_len] and directly call reencode_string().

Other than that, this step looks good to me.

Thanks.
diff mbox series

Patch

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index d1cb8a3183..81f3fb1f05 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -499,21 +499,6 @@  static void show_filemodify(struct diff_queue_struct *q,
 	}
 }
 
-static const char *find_encoding(const char *begin, const char *end)
-{
-	const char *needle = "\nencoding ";
-	char *bol, *eol;
-
-	bol = memmem(begin, end ? end - begin : strlen(begin),
-		     needle, strlen(needle));
-	if (!bol)
-		return NULL;
-	bol += strlen(needle);
-	eol = strchrnul(bol, '\n');
-	*eol = '\0';
-	return bol;
-}
-
 static char *anonymize_ref_component(void *data)
 {
 	static int counter;
@@ -615,13 +600,26 @@  static void anonymize_ident_line(const char **beg, const char **end)
 	*end = out->buf + out->len;
 }
 
+static char *reencode_message(const char *in_msg,
+			      const char *in_encoding, size_t in_encoding_len)
+{
+	static struct strbuf in_encoding_buf = STRBUF_INIT;
+
+	strbuf_reset(&in_encoding_buf);
+	strbuf_add(&in_encoding_buf, in_encoding, in_encoding_len);
+
+	return reencode_string(in_msg, "UTF-8", in_encoding_buf.buf);
+}
+
 static void handle_commit(struct commit *commit, struct rev_info *rev,
 			  struct string_list *paths_of_changed_objects)
 {
 	int saved_output_format = rev->diffopt.output_format;
-	const char *commit_buffer;
+	const char *commit_buffer, *commit_buffer_cursor;
 	const char *author, *author_end, *committer, *committer_end;
-	const char *encoding, *message;
+	const char *encoding;
+	size_t encoding_len;
+	const char *message;
 	char *reencoded = NULL;
 	struct commit_list *p;
 	const char *refname;
@@ -630,21 +628,31 @@  static void handle_commit(struct commit *commit, struct rev_info *rev,
 	rev->diffopt.output_format = DIFF_FORMAT_CALLBACK;
 
 	parse_commit_or_die(commit);
-	commit_buffer = get_commit_buffer(commit, NULL);
-	author = strstr(commit_buffer, "\nauthor ");
+	commit_buffer_cursor = commit_buffer = get_commit_buffer(commit, NULL);
+
+	author = strstr(commit_buffer_cursor, "\nauthor ");
 	if (!author)
 		die("could not find author in commit %s",
 		    oid_to_hex(&commit->object.oid));
 	author++;
-	author_end = strchrnul(author, '\n');
-	committer = strstr(author_end, "\ncommitter ");
+	commit_buffer_cursor = author_end = strchrnul(author, '\n');
+
+	committer = strstr(commit_buffer_cursor, "\ncommitter ");
 	if (!committer)
 		die("could not find committer in commit %s",
 		    oid_to_hex(&commit->object.oid));
 	committer++;
-	committer_end = strchrnul(committer, '\n');
-	message = strstr(committer_end, "\n\n");
-	encoding = find_encoding(committer_end, message);
+	commit_buffer_cursor = committer_end = strchrnul(committer, '\n');
+
+	/* find_commit_header() gets a `+ 1` because
+	 * commit_buffer_cursor points at the trailing "\n" at the end
+	 * of the previous line, but find_commit_header() wants a
+	 * pointer to the beginning of the next line. */
+	encoding = find_commit_header(commit_buffer_cursor + 1, "encoding", &encoding_len);
+	if (encoding)
+		commit_buffer_cursor = encoding + encoding_len;
+
+	message = strstr(commit_buffer_cursor, "\n\n");
 	if (message)
 		message += 2;
 
@@ -685,14 +693,15 @@  static void handle_commit(struct commit *commit, struct rev_info *rev,
 	} else if (encoding) {
 		switch(reencode_mode) {
 		case REENCODE_YES:
-			reencoded = reencode_string(message, "UTF-8", encoding);
+			reencoded = reencode_message(message, encoding, encoding_len);
 			break;
 		case REENCODE_NO:
 			break;
 		case REENCODE_ABORT:
-			die("Encountered commit-specific encoding %s in commit "
+			die("Encountered commit-specific encoding %.*s in commit "
 			    "%s; use --reencode=[yes|no] to handle it",
-			    encoding, oid_to_hex(&commit->object.oid));
+			    (int)encoding_len, encoding,
+			    oid_to_hex(&commit->object.oid));
 		}
 	}
 	if (!commit->parents)
@@ -704,7 +713,7 @@  static void handle_commit(struct commit *commit, struct rev_info *rev,
 	       (int)(author_end - author), author,
 	       (int)(committer_end - committer), committer);
 	if (!reencoded && encoding)
-		printf("encoding %s\n", encoding);
+		printf("encoding %.*s\n", (int)encoding_len, encoding);
 	printf("data %u\n%s",
 	       (unsigned)(reencoded
 			  ? strlen(reencoded) : message