diff mbox series

[2/4] refs: trim newline from reflog message

Message ID dfb639373234a6b8d5f9110380a66ffccbe0b1d6.1637590855.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Inspect reflog data programmatically in more tests | expand

Commit Message

Han-Wen Nienhuys Nov. 22, 2021, 2:20 p.m. UTC
From: Han-Wen Nienhuys <hanwen@google.com>

Commit 523fa69c ("reflog: cleanse messages in the refs.c layer") standardizes
how write entries into the reflog. This commit standardizes how we get messages
out of the reflog. Before, the files backend implicitly added '\n' to the end of
reflog message on reading, which creates a subtle incompatibility with alternate
ref storage backends, such as reftable.

We address this by stripping LF from the message before we pass it to the
user-provided callback.

Signed-off-by: Han-Wen Nienhuys <hanwen@google.com>
---
 builtin/show-branch.c          |  5 -----
 reflog-walk.c                  |  6 ++----
 refs/files-backend.c           | 30 +++++++++++++++---------------
 t/t1405-main-ref-store.sh      |  5 ++---
 t/t1406-submodule-ref-store.sh |  4 ++--
 5 files changed, 21 insertions(+), 29 deletions(-)

Comments

Junio C Hamano Nov. 22, 2021, 10:27 p.m. UTC | #1
"Han-Wen Nienhuys via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Han-Wen Nienhuys <hanwen@google.com>
>
> Commit 523fa69c ("reflog: cleanse messages in the refs.c layer") standardizes
> how write entries into the reflog. This commit standardizes how we get messages
> out of the reflog. Before, the files backend implicitly added '\n' to the end of
> reflog message on reading, which creates a subtle incompatibility with alternate
> ref storage backends, such as reftable.
>
> We address this by stripping LF from the message before we pass it to the
> user-provided callback.

If this were truly "user-provided", then I'd argue that all backends
should follow whatever the files backend has been doing forever---if
the files added LF implicitly, others should, too, because that is
pretty much what these "user-provided" callbacks have been expecting
to see.

In other words, it would be a bug for newer backends to behave
differently.

But I _think_ these callbacks are all under our control, and if that
is the case, I am fine either way, even though I would have a strong
preference not to have to change the API without a good reason, even
if it is a purely internal one.

So, let's go and see if we can find a good reason in the changes we
can make to the callback functions ;-)

> -			end = strchr(logmsg, '\n');
> -			if (end)
> -				*end = '\0';
> -

We could argue that the lack of LF at the end from the API output
made this caller simpler, which may be a plus.

> diff --git a/reflog-walk.c b/reflog-walk.c
> index 8ac4b284b6b..3ee59a98d2f 100644
> --- a/reflog-walk.c
> +++ b/reflog-walk.c
> @@ -244,8 +244,6 @@ void get_reflog_message(struct strbuf *sb,
>  
>  	info = &commit_reflog->reflogs->items[commit_reflog->recno+1];
>  	len = strlen(info->message);
> -	if (len > 0)
> -		len--; /* strip away trailing newline */

Likewise.

> @@ -284,10 +282,10 @@ void show_reflog_message(struct reflog_walk_info *reflog_info, int oneline,
>  		info = &commit_reflog->reflogs->items[commit_reflog->recno+1];
>  		get_reflog_selector(&selector, reflog_info, dmode, force_date, 0);
>  		if (oneline) {
> -			printf("%s: %s", selector.buf, info->message);
> +			printf("%s: %s\n", selector.buf, info->message);
>  		}
>  		else {
> -			printf("Reflog: %s (%s)\nReflog message: %s",
> +			printf("Reflog: %s (%s)\nReflog message: %s\n",
>  			       selector.buf, info->email, info->message);
>  		}

This is Meh either way.

> diff --git a/refs/files-backend.c b/refs/files-backend.c
> index 151b0056fe5..583bbc5f8b9 100644
> --- a/refs/files-backend.c
> +++ b/refs/files-backend.c
> @@ -1936,17 +1936,15 @@ static int show_one_reflog_ent(struct strbuf *sb, each_reflog_ent_fn fn, void *c
>  	int tz;
>  	const char *p = sb->buf;
>  
> -	/* old SP new SP name <email> SP time TAB msg LF */
> -	if (!sb->len || sb->buf[sb->len - 1] != '\n' ||
> -	    parse_oid_hex(p, &ooid, &p) || *p++ != ' ' ||
> +	/* old SP new SP name <email> SP time TAB msg */
> +	if (!sb->len || parse_oid_hex(p, &ooid, &p) || *p++ != ' ' ||
>  	    parse_oid_hex(p, &noid, &p) || *p++ != ' ' ||
> -	    !(email_end = strchr(p, '>')) ||
> -	    email_end[1] != ' ' ||
> +	    !(email_end = strchr(p, '>')) || email_end[1] != ' ' ||
>  	    !(timestamp = parse_timestamp(email_end + 2, &message, 10)) ||
>  	    !message || message[0] != ' ' ||
> -	    (message[1] != '+' && message[1] != '-') ||
> -	    !isdigit(message[2]) || !isdigit(message[3]) ||
> -	    !isdigit(message[4]) || !isdigit(message[5]))
> +	    (message[1] != '+' && message[1] != '-') || !isdigit(message[2]) ||
> +	    !isdigit(message[3]) || !isdigit(message[4]) ||
> +	    !isdigit(message[5]))
>  		return 0; /* corrupt? */
>  	email_end[1] = '\0';
>  	tz = strtol(message + 1, NULL, 10);
> @@ -2038,6 +2036,7 @@ static int files_for_each_reflog_ent_reverse(struct ref_store *ref_store,
>  				strbuf_splice(&sb, 0, 0, bp + 1, endp - (bp + 1));
>  				scanp = bp;
>  				endp = bp + 1;
> +				strbuf_trim_trailing_newline(&sb);
>  				ret = show_one_reflog_ent(&sb, fn, cb_data);
>  				strbuf_reset(&sb);
>  				if (ret)
> @@ -2050,6 +2049,7 @@ static int files_for_each_reflog_ent_reverse(struct ref_store *ref_store,
>  				 * Process it, and we can end the loop.
>  				 */
>  				strbuf_splice(&sb, 0, 0, buf, endp - buf);
> +				strbuf_trim_trailing_newline(&sb);
>  				ret = show_one_reflog_ent(&sb, fn, cb_data);
>  				strbuf_reset(&sb);
>  				break;
> @@ -2099,7 +2099,7 @@ static int files_for_each_reflog_ent(struct ref_store *ref_store,
>  	if (!logfp)
>  		return -1;
>  
> -	while (!ret && !strbuf_getwholeline(&sb, logfp, '\n'))
> +	while (!ret && !strbuf_getline(&sb, logfp))
>  		ret = show_one_reflog_ent(&sb, fn, cb_data);
>  	fclose(logfp);
>  	strbuf_release(&sb);
> @@ -3059,18 +3059,18 @@ static int expire_reflog_ent(struct object_id *ooid, struct object_id *noid,
>  	if ((*cb->should_prune_fn)(ooid, noid, email, timestamp, tz,
>  				   message, policy_cb)) {
>  		if (!cb->newlog)
> -			printf("would prune %s", message);
> +			printf("would prune %s\n", message);
>  		else if (cb->flags & EXPIRE_REFLOGS_VERBOSE)
> -			printf("prune %s", message);
> +			printf("prune %s\n", message);
>  	} else {
>  		if (cb->newlog) {
> -			fprintf(cb->newlog, "%s %s %s %"PRItime" %+05d\t%s",
> -				oid_to_hex(ooid), oid_to_hex(noid),
> -				email, timestamp, tz, message);
> +			fprintf(cb->newlog, "%s %s %s %" PRItime " %+05d\t%s\n",
> +				oid_to_hex(ooid), oid_to_hex(noid), email,
> +				timestamp, tz, message);
>  			oidcpy(&cb->last_kept_oid, noid);
>  		}
>  		if (cb->flags & EXPIRE_REFLOGS_VERBOSE)
> -			printf("keep %s", message);
> +			printf("keep %s\n", message);
>  	}
>  	return 0;
>  }

Hmmmm.  I'll defer my judgment to the end.

> diff --git a/t/t1405-main-ref-store.sh b/t/t1405-main-ref-store.sh
> index 49718b7ea7f..a600bedf2cd 100755
> --- a/t/t1405-main-ref-store.sh
> +++ b/t/t1405-main-ref-store.sh
> @@ -89,13 +89,12 @@ test_expect_success 'for_each_reflog()' '
>  test_expect_success 'for_each_reflog_ent()' '
>  	$RUN for-each-reflog-ent HEAD >actual &&
>  	head -n1 actual | grep one &&
> -	tail -n2 actual | head -n1 | grep recreate-main
> +	tail -n1 actual | grep recreate-main
>  '
>  
>  test_expect_success 'for_each_reflog_ent_reverse()' '
>  	$RUN for-each-reflog-ent-reverse HEAD >actual &&
> -	head -n1 actual | grep recreate-main &&
> -	tail -n2 actual | head -n1 | grep one
> +	tail -n1 actual | grep one
>  '
>  
>  test_expect_success 'reflog_exists(HEAD)' '
> diff --git a/t/t1406-submodule-ref-store.sh b/t/t1406-submodule-ref-store.sh
> index 0a87058971e..b0365c1fee0 100755
> --- a/t/t1406-submodule-ref-store.sh
> +++ b/t/t1406-submodule-ref-store.sh
> @@ -74,13 +74,13 @@ test_expect_success 'for_each_reflog()' '
>  test_expect_success 'for_each_reflog_ent()' '
>  	$RUN for-each-reflog-ent HEAD >actual &&
>  	head -n1 actual | grep first &&
> -	tail -n2 actual | head -n1 | grep main.to.new
> +	tail -n1 actual | grep main.to.new
>  '
>  
>  test_expect_success 'for_each_reflog_ent_reverse()' '
>  	$RUN for-each-reflog-ent-reverse HEAD >actual &&
>  	head -n1 actual | grep main.to.new &&
> -	tail -n2 actual | head -n1 | grep first
> +	tail -n1 actual | grep first
>  '
>  
>  test_expect_success 'reflog_exists(HEAD)' '

I got an impression from the proposed log message and the changes to
the code (except for the refs/files-backend.c, which I only skimmed)
that the idea is that the refs API stops adding LF at the end, and
the callers got a matching change to compensate for the (now)
missing LF.  If that is the idea behind the change, why do we need
to change any existing test?  The only way any tests need to be
modified due to such a change I can think of is when we forget to or
failed to make compensating change to the callers of the API.

Puzzled...

Ah, the $RUN is hiding what is really going on; it is running the
"test-tool ref-store" helper, and we did not adjust that helper.  So
if we make a compensating change to the test-tool then we do not
have to have these changes at all?  But that point may be moot.

In any case, in order to lose 5 lines from show-branch.c, and 2
lines from reflog-walk.c, I see that we had to touch 30+ lines in
refs/files-backend.c.  I find it a bit hard to sell this as an
improvement to the API, to be honest.

Luckily, it looks to me that this step is mostly unreleated to the
main thrust of these patches in the series, which is "reading
.git/logs/ in the test would work only when testing files backend;
use for-each-ref test helper to recreate what would have been read
by such tests from the files backend's files and inspect that
instead, and that would allow us test other backends for free".

So I suspect that this step can be safely dropped?

Thanks.
Ævar Arnfjörð Bjarmason Nov. 23, 2021, 10:24 a.m. UTC | #2
On Mon, Nov 22 2021, Han-Wen Nienhuys via GitGitGadget wrote:

> From: Han-Wen Nienhuys <hanwen@google.com>
> [...]
> @@ -3059,18 +3059,18 @@ static int expire_reflog_ent(struct object_id *ooid, struct object_id *noid,
>  	if ((*cb->should_prune_fn)(ooid, noid, email, timestamp, tz,
>  				   message, policy_cb)) {
>  		if (!cb->newlog)
> -			printf("would prune %s", message);
> +			printf("would prune %s\n", message);
>  		else if (cb->flags & EXPIRE_REFLOGS_VERBOSE)
> -			printf("prune %s", message);
> +			printf("prune %s\n", message);
>  	} else {
>  		if (cb->newlog) {
> -			fprintf(cb->newlog, "%s %s %s %"PRItime" %+05d\t%s",
> -				oid_to_hex(ooid), oid_to_hex(noid),
> -				email, timestamp, tz, message);
> +			fprintf(cb->newlog, "%s %s %s %" PRItime " %+05d\t%s\n",
> +				oid_to_hex(ooid), oid_to_hex(noid), email,
> +				timestamp, tz, message);
>  			oidcpy(&cb->last_kept_oid, noid);
>  		}
>  		if (cb->flags & EXPIRE_REFLOGS_VERBOSE)
> -			printf("keep %s", message);
> +			printf("keep %s\n", message);
>  	}
>  	return 0;
>  }

I haven't looked deeply into this topic as a whole, but FWIW this
conflicts with a topic I've got locally and was going to submit sometime
after the current batch of my own ref fixes in "next".

That we've got verbose output at all in file-backend.c is a wart, and it
should just be moved out if it entirely, this commit (stacked on top of
various other fixes I've got in the area) does that:
https://github.com/avar/git/commit/eff40d2d81b

With that change the reftable code will need to handle far less of this,
as it'll be handled in builtin/reflog.c, it just needs to behave
properly in the appropriate "expire reflog?" callback. I.e. the backend
tells us "does this expire?", the builtin/reflog.c code consumes that
and does any verbose or non-verbose (or progress etc.) output.

Now, the merge conflict between that and this looks rather trivial, but
I can't help but wonder if we're going in the wrong direction here
API-wise, or maybe "wrong direction" is too strong, but "sticking with
the wrong patterN?".

I think your cleanup works, but wouldn't a better thing be to move this
to callbacks rather than tweaking the fprintf formats?

I.e. in my version the whole body of this function has become:
	
	static int expire_reflog_ent(struct object_id *ooid, struct object_id *noid,
				     const char *email, timestamp_t timestamp, int tz,
				     const char *message, void *cb_data)
	{
		struct expire_reflog_cb *cb = cb_data;
		reflog_expiry_should_prune_fn *fn = cb->should_prune_fn;
	
		if (cb->rewrite)
			ooid = &cb->last_kept_oid;
	
		if (fn(ooid, noid, email, timestamp, tz, message, cb->policy_cb))
			return 0;
	
		if (cb->dry_run)
			return 0; /* --dry-run */
	
		fprintf(cb->newlog, "%s %s %s %"PRItime" %+05d\t%s", oid_to_hex(ooid),
			oid_to_hex(noid), email, timestamp, tz, message);
		oidcpy(&cb->last_kept_oid, noid);
	
		return 0;
	}

The only file-backend specific part of it is that fprintf(). If we moved
towards making that part of it be:

	write_reflog_entry_cb(oid_to_hex(ooid), oid_to_hex(noid), email, timestamp, tz, message);

Then we could lift the whole of this API to a level that makes more
sense for a backed to implement.

The refs/files-backend.c shouldn't need to have one function calll the
"should_prune_fn" *and* write out the data. Instead some code common to
all backends should call the "should prune?", and then call the
backend's "here's an entry for you to write" callback.

But maybe I'm overthinking this whole thing. I'm just wondering if we
have say a sqlite reflog backend as opposed to the file/reftable backend
that wants to store this data in a schema. Such a a backend would need
to unpack the data, as we're sprintf() formatting function parameters
before it gets passed to the backends.
Han-Wen Nienhuys Nov. 23, 2021, 4:35 p.m. UTC | #3
On Mon, Nov 22, 2021 at 11:27 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> "Han-Wen Nienhuys via GitGitGadget" <gitgitgadget@gmail.com> writes:
>
> > From: Han-Wen Nienhuys <hanwen@google.com>
> >
> > Commit 523fa69c ("reflog: cleanse messages in the refs.c layer") standardizes
> > how write entries into the reflog. This commit standardizes how we get messages
> > out of the reflog. Before, the files backend implicitly added '\n' to the end of
> > reflog message on reading, which creates a subtle incompatibility with alternate
> > ref storage backends, such as reftable.
> >
> > We address this by stripping LF from the message before we pass it to the
> > user-provided callback.
>
> If this were truly "user-provided", then I'd argue that all backends
> should follow whatever the files backend has been doing forever---if
> the files added LF implicitly, others should, too, because that is
> pretty much what these "user-provided" callbacks have been expecting
> to see.

I think it's just wrong. If you pass `msg` to a storage API, you
should get `msg` when you read it back, not (msg + "\n").

> Ah, the $RUN is hiding what is really going on; it is running the
> "test-tool ref-store" helper, and we did not adjust that helper.  So
> if we make a compensating change to the test-tool then we do not
> have to have these changes at all?  But that point may be moot.
>
> In any case, in order to lose 5 lines from show-branch.c, and 2
> lines from reflog-walk.c, I see that we had to touch 30+ lines in
> refs/files-backend.c.  I find it a bit hard to sell this as an
> improvement to the API, to be honest.

The test-tool ref-store adds its own '\n', so you always get a blank
line in the output. That serves no purpose, and leads to the

  tail-n2  | head -n1

in order to read the last log line. I think it's silly, and should be dropped.
Han-Wen Nienhuys Nov. 23, 2021, 4:44 p.m. UTC | #4
On Tue, Nov 23, 2021 at 11:40 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> I think your cleanup works, but wouldn't a better thing be to move this
> to callbacks rather than tweaking the fprintf formats?

sure. In the reftable glue, I have

        if (!(flags & EXPIRE_REFLOGS_DRY_RUN)) {
                /* XXX - skip writing records that were not changed. */
                err = reftable_addition_commit(add);
        } else {
                /* XXX - print something */
        }

letting the callbacks do the printing means less work for reftable.

> The refs/files-backend.c shouldn't need to have one function calll the
> "should_prune_fn" *and* write out the data. Instead some code common to
> all backends should call the "should prune?", and then call the
> backend's "here's an entry for you to write" callback.

not sure if that will work. For reftable, you have to write something
(a tombstone) if you _do_ want to prune the entry.

> But maybe I'm overthinking this whole thing. I'm just wondering if we
> have say a sqlite reflog backend as opposed to the file/reftable backend
> that wants to store this data in a schema. Such a a backend would need

reftable also stores this in a schema: there are separate fields for
e-mail, timezone, timestamp etc.
Junio C Hamano Nov. 23, 2021, 5:09 p.m. UTC | #5
Han-Wen Nienhuys <hanwen@google.com> writes:

> On Mon, Nov 22, 2021 at 11:27 PM Junio C Hamano <gitster@pobox.com> wrote:
>>
>> If this were truly "user-provided", then I'd argue that all backends
>> should follow whatever the files backend has been doing forever---if
>> the files added LF implicitly, others should, too, because that is
>> pretty much what these "user-provided" callbacks have been expecting
>> to see.
>
> I think it's just wrong. If you pass `msg` to a storage API, you
> should get `msg` when you read it back, not (msg + "\n").

If you give a log message "git commit -m 'single line'", you get LF
at the end of the commit message for free.  This is no different.
And you know that this is not a "storage API" that stores the input
in verbatim after looking at refs.c::copy_reflog_msg().

>> Ah, the $RUN is hiding what is really going on; it is running the
>> "test-tool ref-store" helper, and we did not adjust that helper.  So
>> if we make a compensating change to the test-tool then we do not
>> have to have these changes at all?  But that point may be moot.
>>
>> In any case, in order to lose 5 lines from show-branch.c, and 2
>> lines from reflog-walk.c, I see that we had to touch 30+ lines in
>> refs/files-backend.c.  I find it a bit hard to sell this as an
>> improvement to the API, to be honest.
>
> The test-tool ref-store adds its own '\n', so you always get a blank
> line in the output. That serves no purpose, and leads to the

But that is only because test-tool is wrong, no?  If we know that
the API gives the trailing LF, what purpose does it serve to add
another one?

>   tail-n2  | head -n1
>
> in order to read the last log line. I think it's silly, and should be dropped.

Yes, it is silly and should be dropped, but if you drop it on the
tool side, then it may become even easier to do the "instead of
reading from .git/refs/logs files, have the tool dump and inspect
that" step, no?

This being test-tool, I do not mind losing backward compatibility
that forces us a silly "tail -n 2 | head -n 1" pipeline at all.  But
if silliness comes from the test-tool thta does not work well with
the internal API, that is what should be fixed.  Surely you can
change the API and its current callers to compensate for its silliness,
but I do not think that is a good direction to go in.

Thanks.
Han-Wen Nienhuys Nov. 23, 2021, 5:28 p.m. UTC | #6
On Tue, Nov 23, 2021 at 6:10 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Han-Wen Nienhuys <hanwen@google.com> writes:
>
> > On Mon, Nov 22, 2021 at 11:27 PM Junio C Hamano <gitster@pobox.com> wrote:
> >>
> >> If this were truly "user-provided", then I'd argue that all backends
> >> should follow whatever the files backend has been doing forever---if
> >> the files added LF implicitly, others should, too, because that is
> >> pretty much what these "user-provided" callbacks have been expecting
> >> to see.
> >
> > I think it's just wrong. If you pass `msg` to a storage API, you
> > should get `msg` when you read it back, not (msg + "\n").
>
> If you give a log message "git commit -m 'single line'", you get LF
> at the end of the commit message for free.  This is no different.
> And you know that this is not a "storage API" that stores the input
> in verbatim after looking at refs.c::copy_reflog_msg().

I'm talking about refs/refs-internal.h. It seems you want to add something like

/* The ref backend should add a '\n' relative to the message supplied
to the delete/symref/update functions. */
typedef int for_each_reflog_ent_fn(struct ref_store *ref_store,
                                   const char *refname,
                                   each_reflog_ent_fn fn,
                                   void *cb_data);

?

--
Han-Wen Nienhuys - Google Munich
I work 80%. Don't expect answers from me on Fridays.
--
Google Germany GmbH, Erika-Mann-Strasse 33, 80636 Munich
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
--

Google Germany GmbH, Erika-Mann-Strasse 33, 80636 Munich

Registergericht und -nummer: Hamburg, HRB 86891

Sitz der Gesellschaft: Hamburg

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Junio C Hamano Nov. 23, 2021, 8:34 p.m. UTC | #7
Han-Wen Nienhuys <hanwen@google.com> writes:

> I'm talking about refs/refs-internal.h. It seems you want to add something like
>
> /* The ref backend should add a '\n' relative to the message supplied
> to the delete/symref/update functions. */
> typedef int for_each_reflog_ent_fn(struct ref_store *ref_store,
>                                    const char *refname,
>                                    each_reflog_ent_fn fn,
>                                    void *cb_data);
>
> ?

Sorry, I do not follow.  Doesn't the ref backend already ensure that
the message is not an incomplete line?  If you feed a single
complete line when updating, I do not think the backend should add
any extra LF relative to the given message:

    $ git update-ref -m 'creating a new branch manually
    ' refs/heads/newtest master
    $ git update-ref -m 'updating a new branch manually
    ' refs/heads/newtest master~1
    $ git reflog refs/heads/newtest
    4150a1677b refs/heads/newtest@{0}: updating a new branch manually
    5f439a0ecf refs/heads/newtest@{1}: updating the reference

I think what deserves such a comment more is the prototype for
each_reflog_ent_fn describing what the parameters to that callback
function, to help the callers of the iterator what to expect.  That
is the end-user facing part.

/*
 * Callback to process a reflog entry found by the iteration functions (see
 * below)
 */
typedef int each_reflog_ent_fn(
		struct object_id *old_oid, struct object_id *new_oid,
		const char *committer, timestamp_t timestamp,
		int tz, const char *msg, void *cb_data);

Currently it only says "Callback (see below)" but "below" has only
comments about the difference between refs_for_each_reflog_ent() and
refs_for_each_reflog_entreverse() functions, and does not talk about
what "committer" looks like (i.e. does it give user.name equivalent,
user.name plus <user.email>, or something else?), and what "msg"
looks like (i.e. if a multi-line message was given when a ref was
updated or created, do we get these lines intact? if it gets
mangled, how? if the original was a single-liner incomplete line, do
we lack the final LF?), and how tz is encoded.

I think the rule for "msg" is that:

   a multi-line message, or a message on a single incomplete-line,
   are normalized into a single complete line, and callback gets a
   single complete line.

Once these rules get specified tightly and fully, it is up to the
ref(log) backends how to implement the interface.  If files-backend
wants to keep LF at the end of the message when storing (so that it
can easily use it as record delimiter), it can do so and reuse the
LF at the end of the message as part of the msg parameter to the
callback.  If another backend wants to drop LF at the end of the
message when storing (to save space), it can do so as long as the
callback function gets the LF just like the other backend.
Han-Wen Nienhuys Nov. 24, 2021, 11:17 a.m. UTC | #8
On Tue, Nov 23, 2021 at 9:34 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Han-Wen Nienhuys <hanwen@google.com> writes:
>
> > I'm talking about refs/refs-internal.h. It seems you want to add something like
> >
> > /* The ref backend should add a '\n' relative to the message supplied
> > to the delete/symref/update functions. */
> > typedef int for_each_reflog_ent_fn(struct ref_store *ref_store,
> >                                    const char *refname,
> >                                    each_reflog_ent_fn fn,
> >                                    void *cb_data);
> >
> > ?
>
> Sorry, I do not follow.  Doesn't the ref backend already ensure that
> the message is not an incomplete line?  If you feed a single
> complete line when updating, I do not think the backend should add
> any extra LF relative to the given message:

But it does, currently. As of

  commit 523fa69c36744ae6779e38614cb9bfb2be552923
  Author: Junio C Hamano <gitster@pobox.com>
  ..
  - We expect that a reflog message consists of a single line.  The
       file format used by the files backend may add a LF after the

it is the job of the files ref backend to add a LF, and the input to
the ref backend is a string without trailing LF. But the files backend
then produces that message with a LF, when reading back the data, eg.

$ git --version
git version 2.34.0.rc2.393.gf8c9666880-goog

$ GIT_TRACE_REFS="1" git branch -m bla blub
..
12:03:59.408705 refs/debug.c:162        rename_ref: refs/heads/bla ->
refs/heads/blub "Branch: renamed refs/heads/bla to refs/heads/blub": 0

$ GIT_TRACE_REFS=1 git reflog show refs/heads/blub
12:04:23.277805 refs/debug.c:294        reflog_ent refs/heads/blub
(ret 0): cd3e606211bb1cf8bc57f7d76bab98cc17a150bc ->
cd3e606211bb1cf8bc57f7d76bab98cc17a150bc, Han-Wen Nienhuys
<hanwen@google.com> 1637751839 "Branch: renamed refs/heads/bla to
refs/heads/blub
"

> I think the rule for "msg" is that:
>
>    a multi-line message, or a message on a single incomplete-line,
>    are normalized into a single complete line, and callback gets a
>    single complete line.
>

That is not how it works today. The files backend verbatimly dumps the
message supplied to it. (Maybe it should crash if there is a '\n' in
the message).
Junio C Hamano Nov. 24, 2021, 6:53 p.m. UTC | #9
Han-Wen Nienhuys <hanwen@google.com> writes:

>> ...  Doesn't the ref backend already ensure that
>> the message is not an incomplete line?  If you feed a single
>> complete line when updating, I do not think the backend should add
>> any extra LF relative to the given message:
>
> But it does, currently. As of
>
>   commit 523fa69c36744ae6779e38614cb9bfb2be552923
>   Author: Junio C Hamano <gitster@pobox.com>
>   ..
>   - We expect that a reflog message consists of a single line.  The
>        file format used by the files backend may add a LF after the
>
> it is the job of the files ref backend to add a LF, and the input to
> the ref backend is a string without trailing LF. But the files backend
> then produces that message with a LF, when reading back the data, eg.

Ah, perhaps our confusion comes from the fact that I view the ref
API as a whole and do not draw a fine line of distinction between
the "ref API implementation common across backends" vs "what ref
files backend does".  But as the implementor of one backend, you
obviously have to care.

In any case, when I did that commit, I clearly meant that the
normalization implemented by the patch belong to the common part to
be used by all backends for uniform handling of refs.  If a call to
refs API in a repository that is configured to use reftable backend
bypasses the normalization function introduced in that commit, that
is a bug.

So, yes "ref API ensures that the message is not an incomplete line
to turn 0, 1, or multiple lines input into a single line", which is
why the experiments you omitted from your quote (reproduced here)

    $ git update-ref -m 'creating a new branch manually
    ' refs/heads/newtest master
    $ git update-ref -m 'updating a new branch manually
    ' refs/heads/newtest master~1
    $ git reflog refs/heads/newtest
    4150a1677b refs/heads/newtest@{0}: updating a new branch manually
    5f439a0ecf refs/heads/newtest@{1}: updating the reference

to give a complete line when recording the reflog entries does not
result in an extra LF shown on the output.

>> I think the rule for "msg" is that:
>>
>>    a multi-line message, or a message on a single incomplete-line,
>>    are normalized into a single complete line, and callback gets a
>>    single complete line.
>
> That is not how it works today. The files backend verbatimly dumps the
> message supplied to it. (Maybe it should crash if there is a '\n' in
> the message).

As I said, if parts of refs API implementation forgets to call
normalization, it is a bug.  Is there different codepath other than
the one that is exercised with the "git update -m ''" experiment
above and you found such a bug there?  It needs to be fixed.

Thanks.
Han-Wen Nienhuys Nov. 24, 2021, 7:06 p.m. UTC | #10
On Wed, Nov 24, 2021 at 7:53 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Han-Wen Nienhuys <hanwen@google.com> writes:
>
> >> ...  Doesn't the ref backend already ensure that
> >> the message is not an incomplete line?  If you feed a single
> >> complete line when updating, I do not think the backend should add
> >> any extra LF relative to the given message:
> >
> > But it does, currently. As of
> >
> >   commit 523fa69c36744ae6779e38614cb9bfb2be552923
> >   Author: Junio C Hamano <gitster@pobox.com>
> >   ..
> >   - We expect that a reflog message consists of a single line.  The
> >        file format used by the files backend may add a LF after the
> >
> > it is the job of the files ref backend to add a LF, and the input to
> > the ref backend is a string without trailing LF. But the files backend
> > then produces that message with a LF, when reading back the data, eg.
>
> Ah, perhaps our confusion comes from the fact that I view the ref
> API as a whole and do not draw a fine line of distinction between
> the "ref API implementation common across backends" vs "what ref
> files backend does".  But as the implementor of one backend, you
> obviously have to care.

Having the read function return something different than what the
write gets still seems odd to me.
How about having copy_reflog_msg() trim trailing space and then always add LF?

The files backend can assert that the string ends in LF, and doesn't
need to add LF itself.
Junio C Hamano Nov. 24, 2021, 7:26 p.m. UTC | #11
Han-Wen Nienhuys <hanwen@google.com> writes:

> $ GIT_TRACE_REFS="1" git branch -m bla blub
> ..
> 12:03:59.408705 refs/debug.c:162        rename_ref: refs/heads/bla ->
> refs/heads/blub "Branch: renamed refs/heads/bla to refs/heads/blub": 0
>
> $ GIT_TRACE_REFS=1 git reflog show refs/heads/blub
> 12:04:23.277805 refs/debug.c:294        reflog_ent refs/heads/blub
> (ret 0): cd3e606211bb1cf8bc57f7d76bab98cc17a150bc ->
> cd3e606211bb1cf8bc57f7d76bab98cc17a150bc, Han-Wen Nienhuys
> <hanwen@google.com> 1637751839 "Branch: renamed refs/heads/bla to
> refs/heads/blub
> "
>
>> I think the rule for "msg" is that:
>>
>>    a multi-line message, or a message on a single incomplete-line,
>>    are normalized into a single complete line, and callback gets a
>>    single complete line.
>>
>
> That is not how it works today. The files backend verbatimly dumps the
> message supplied to it. (Maybe it should crash if there is a '\n' in
> the message).

I still am puzzled what you wanted to illustrate with the "git
branch -m bla" trace.  The call graph into the refs API looks like
this:

builtin/branch.c::cmd_branch()
 -> branch.c::create_branch()
    -> refs.c::ref_transaction_update()
       -> refs.c::ref_transaction_add_update()
    -> refs.c::ref_transaction_commit()

and the message the user gave is passed through in "msg" variable
without modification, when calling ref_transaction_update(), which
in turn makes a call to ref_transaction_add_update().  It does this:

    struct ref_update *ref_transaction_add_update(
                    struct ref_transaction *transaction,
                    const char *refname, unsigned int flags,
                    const struct object_id *new_oid,
                    const struct object_id *old_oid,
                    const char *msg)
    {
            struct ref_update *update;

            if (transaction->state != REF_TRANSACTION_OPEN)
                    BUG("update called for transaction that is not open");

            FLEX_ALLOC_STR(update, refname, refname);
            ALLOC_GROW(transaction->updates, transaction->nr + 1, transaction->alloc);
            transaction->updates[transaction->nr++] = update;

            update->flags = flags;

            if (flags & REF_HAVE_NEW)
                    oidcpy(&update->new_oid, new_oid);
            if (flags & REF_HAVE_OLD)
                    oidcpy(&update->old_oid, old_oid);
            update->msg = normalize_reflog_message(msg);
            return update;
    }

And normalize_reflog_message() calls copy_reflog_msg() to squash
runs of isspace() bytes (note: that class includes LF) into a single
space, and runs rtrim(), so update->msg will get a single incomplete
line.  As I suspected in a separate message, I think my notion of
what happens in the ref API implementation common to all backends
and what happens in each backend, and hence my statements about the
distinction, were much fuzzier than yours, so I should say that I
consider that all of the above is happening in the common part
across all backends.  If normalize happens in ref-files, it should
happen in reftable backend, too, automatically.

The files-backend has no chance to even see an embedded LF in the
message when the transaction gets committed and the backend is
called, does it?  So I am not sure why we should crash upon seeing a
LF in the message.

In any case, it seems that as a comment to clarify the end-user
facing each_reflog_ent_fn() parameters, what you quoted above from
my message seems correct to me, after following the callgraph like
the above.  A 0-line (i.e. incomplete, like your 'bla' given to "git
branch"), 1-line (i.e. a single complete line, like the message I
gave to "update-ref -m" in my earlier illustration), or multi-line
message given when a reflog entry is created, is normalized and when
the each_reflog_ent_fn() callback is called, it is shown to the
callback function as a single complete line, with a LF at the end.

Phrased without the explanation specific to this discussion, but
with a bit more detail:

  The message given when a reflog entry was created, is normalized
  into a single line by turning each LF into a space, squeezing a
  run of multiple whitespaces into one space, and removing trailing
  whitespaces. The callback gets a single complete line in the 'msg'
  parameter.

perhaps?
Han-Wen Nienhuys Nov. 24, 2021, 7:39 p.m. UTC | #12
On Wed, Nov 24, 2021 at 8:27 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Han-Wen Nienhuys <hanwen@google.com> writes:
>
> > $ GIT_TRACE_REFS="1" git branch -m bla blub
> > ..
> > 12:03:59.408705 refs/debug.c:162        rename_ref: refs/heads/bla ->
> > refs/heads/blub "Branch: renamed refs/heads/bla to refs/heads/blub": 0
> >
> > $ GIT_TRACE_REFS=1 git reflog show refs/heads/blub
> > 12:04:23.277805 refs/debug.c:294        reflog_ent refs/heads/blub
> > (ret 0): cd3e606211bb1cf8bc57f7d76bab98cc17a150bc ->
> > cd3e606211bb1cf8bc57f7d76bab98cc17a150bc, Han-Wen Nienhuys
> > <hanwen@google.com> 1637751839 "Branch: renamed refs/heads/bla to
> > refs/heads/blub
> > "
> >
> >> I think the rule for "msg" is that:
> >>
> >>    a multi-line message, or a message on a single incomplete-line,
> >>    are normalized into a single complete line, and callback gets a
> >>    single complete line.
> >>
> >
> > That is not how it works today. The files backend verbatimly dumps the
> > message supplied to it. (Maybe it should crash if there is a '\n' in
> > the message).
>
> I still am puzzled what you wanted to illustrate with the "git
> branch -m bla" trace.

I'm trying to illustrate that (from the perspective of the ref backend
API) one call inserts something without a '\n', but the call that
reads the info back, gets the same data back with a '\n'. It looks
confusing and inconsistent to me.

It seems fine to decide that the message should always end in a LF,
but then why not do that in the normalization routine, so it is shared
across all backends?

For the purpose of the debug support (GIT_TRACE_REFS=1), I would
rather prefer if the message was always without LF, because the LF
ruins the visual of the debug output, but that is a minor concern.
Junio C Hamano Nov. 24, 2021, 8:55 p.m. UTC | #13
Han-Wen Nienhuys <hanwen@google.com> writes:

>> Ah, perhaps our confusion comes from the fact that I view the ref
>> API as a whole and do not draw a fine line of distinction between
>> the "ref API implementation common across backends" vs "what ref
>> files backend does".  But as the implementor of one backend, you
>> obviously have to care.
>
> Having the read function return something different than what the
> write gets still seems odd to me.
> How about having copy_reflog_msg() trim trailing space and then always add LF?
>
> The files backend can assert that the string ends in LF, and doesn't
> need to add LF itself.

Ehh, let's step back a bit.

Is there anything in the common part and files backend in ref API
that needs to be changed to fix some bug?  I do not see anything
broken that needs to be fixed by "assert that the string ends in LF
and doesn't need to add LF itself" or by "always add LF".
Han-Wen Nienhuys Nov. 25, 2021, 4 p.m. UTC | #14
On Wed, Nov 24, 2021 at 9:55 PM Junio C Hamano <gitster@pobox.com> wrote:
> Ehh, let's step back a bit.
>
> Is there anything in the common part and files backend in ref API
> that needs to be changed to fix some bug?  I do not see anything
> broken that needs to be fixed by "assert that the string ends in LF
> and doesn't need to add LF itself" or by "always add LF".

Clearly, you and I disagree about what is logical behavior here.

I've reworked the series to fit with your opinion on the matter. PTAL.
Junio C Hamano Nov. 29, 2021, 2:30 a.m. UTC | #15
Han-Wen Nienhuys <hanwen@google.com> writes:

> On Wed, Nov 24, 2021 at 9:55 PM Junio C Hamano <gitster@pobox.com> wrote:
>> Ehh, let's step back a bit.
>>
>> Is there anything in the common part and files backend in ref API
>> that needs to be changed to fix some bug?  I do not see anything
>> broken that needs to be fixed by "assert that the string ends in LF
>> and doesn't need to add LF itself" or by "always add LF".
>
> Clearly, you and I disagree about what is logical behavior here.
>
> I've reworked the series to fit with your opinion on the matter. PTAL.

Thanks.

I find both are logical and internally consistent, so I prefer a
choice that requires fewer changes to what is already working.
diff mbox series

Patch

diff --git a/builtin/show-branch.c b/builtin/show-branch.c
index f1e8318592c..016ccdafd0f 100644
--- a/builtin/show-branch.c
+++ b/builtin/show-branch.c
@@ -761,7 +761,6 @@  int cmd_show_branch(int ac, const char **av, const char *prefix)
 			char *logmsg;
 			char *nth_desc;
 			const char *msg;
-			char *end;
 			timestamp_t timestamp;
 			int tz;
 
@@ -772,10 +771,6 @@  int cmd_show_branch(int ac, const char **av, const char *prefix)
 				break;
 			}
 
-			end = strchr(logmsg, '\n');
-			if (end)
-				*end = '\0';
-
 			msg = (*logmsg == '\0') ? "(none)" : logmsg;
 			reflog_msg[i] = xstrfmt("(%s) %s",
 						show_date(timestamp, tz,
diff --git a/reflog-walk.c b/reflog-walk.c
index 8ac4b284b6b..3ee59a98d2f 100644
--- a/reflog-walk.c
+++ b/reflog-walk.c
@@ -244,8 +244,6 @@  void get_reflog_message(struct strbuf *sb,
 
 	info = &commit_reflog->reflogs->items[commit_reflog->recno+1];
 	len = strlen(info->message);
-	if (len > 0)
-		len--; /* strip away trailing newline */
 	strbuf_add(sb, info->message, len);
 }
 
@@ -284,10 +282,10 @@  void show_reflog_message(struct reflog_walk_info *reflog_info, int oneline,
 		info = &commit_reflog->reflogs->items[commit_reflog->recno+1];
 		get_reflog_selector(&selector, reflog_info, dmode, force_date, 0);
 		if (oneline) {
-			printf("%s: %s", selector.buf, info->message);
+			printf("%s: %s\n", selector.buf, info->message);
 		}
 		else {
-			printf("Reflog: %s (%s)\nReflog message: %s",
+			printf("Reflog: %s (%s)\nReflog message: %s\n",
 			       selector.buf, info->email, info->message);
 		}
 
diff --git a/refs/files-backend.c b/refs/files-backend.c
index 151b0056fe5..583bbc5f8b9 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -1936,17 +1936,15 @@  static int show_one_reflog_ent(struct strbuf *sb, each_reflog_ent_fn fn, void *c
 	int tz;
 	const char *p = sb->buf;
 
-	/* old SP new SP name <email> SP time TAB msg LF */
-	if (!sb->len || sb->buf[sb->len - 1] != '\n' ||
-	    parse_oid_hex(p, &ooid, &p) || *p++ != ' ' ||
+	/* old SP new SP name <email> SP time TAB msg */
+	if (!sb->len || parse_oid_hex(p, &ooid, &p) || *p++ != ' ' ||
 	    parse_oid_hex(p, &noid, &p) || *p++ != ' ' ||
-	    !(email_end = strchr(p, '>')) ||
-	    email_end[1] != ' ' ||
+	    !(email_end = strchr(p, '>')) || email_end[1] != ' ' ||
 	    !(timestamp = parse_timestamp(email_end + 2, &message, 10)) ||
 	    !message || message[0] != ' ' ||
-	    (message[1] != '+' && message[1] != '-') ||
-	    !isdigit(message[2]) || !isdigit(message[3]) ||
-	    !isdigit(message[4]) || !isdigit(message[5]))
+	    (message[1] != '+' && message[1] != '-') || !isdigit(message[2]) ||
+	    !isdigit(message[3]) || !isdigit(message[4]) ||
+	    !isdigit(message[5]))
 		return 0; /* corrupt? */
 	email_end[1] = '\0';
 	tz = strtol(message + 1, NULL, 10);
@@ -2038,6 +2036,7 @@  static int files_for_each_reflog_ent_reverse(struct ref_store *ref_store,
 				strbuf_splice(&sb, 0, 0, bp + 1, endp - (bp + 1));
 				scanp = bp;
 				endp = bp + 1;
+				strbuf_trim_trailing_newline(&sb);
 				ret = show_one_reflog_ent(&sb, fn, cb_data);
 				strbuf_reset(&sb);
 				if (ret)
@@ -2050,6 +2049,7 @@  static int files_for_each_reflog_ent_reverse(struct ref_store *ref_store,
 				 * Process it, and we can end the loop.
 				 */
 				strbuf_splice(&sb, 0, 0, buf, endp - buf);
+				strbuf_trim_trailing_newline(&sb);
 				ret = show_one_reflog_ent(&sb, fn, cb_data);
 				strbuf_reset(&sb);
 				break;
@@ -2099,7 +2099,7 @@  static int files_for_each_reflog_ent(struct ref_store *ref_store,
 	if (!logfp)
 		return -1;
 
-	while (!ret && !strbuf_getwholeline(&sb, logfp, '\n'))
+	while (!ret && !strbuf_getline(&sb, logfp))
 		ret = show_one_reflog_ent(&sb, fn, cb_data);
 	fclose(logfp);
 	strbuf_release(&sb);
@@ -3059,18 +3059,18 @@  static int expire_reflog_ent(struct object_id *ooid, struct object_id *noid,
 	if ((*cb->should_prune_fn)(ooid, noid, email, timestamp, tz,
 				   message, policy_cb)) {
 		if (!cb->newlog)
-			printf("would prune %s", message);
+			printf("would prune %s\n", message);
 		else if (cb->flags & EXPIRE_REFLOGS_VERBOSE)
-			printf("prune %s", message);
+			printf("prune %s\n", message);
 	} else {
 		if (cb->newlog) {
-			fprintf(cb->newlog, "%s %s %s %"PRItime" %+05d\t%s",
-				oid_to_hex(ooid), oid_to_hex(noid),
-				email, timestamp, tz, message);
+			fprintf(cb->newlog, "%s %s %s %" PRItime " %+05d\t%s\n",
+				oid_to_hex(ooid), oid_to_hex(noid), email,
+				timestamp, tz, message);
 			oidcpy(&cb->last_kept_oid, noid);
 		}
 		if (cb->flags & EXPIRE_REFLOGS_VERBOSE)
-			printf("keep %s", message);
+			printf("keep %s\n", message);
 	}
 	return 0;
 }
diff --git a/t/t1405-main-ref-store.sh b/t/t1405-main-ref-store.sh
index 49718b7ea7f..a600bedf2cd 100755
--- a/t/t1405-main-ref-store.sh
+++ b/t/t1405-main-ref-store.sh
@@ -89,13 +89,12 @@  test_expect_success 'for_each_reflog()' '
 test_expect_success 'for_each_reflog_ent()' '
 	$RUN for-each-reflog-ent HEAD >actual &&
 	head -n1 actual | grep one &&
-	tail -n2 actual | head -n1 | grep recreate-main
+	tail -n1 actual | grep recreate-main
 '
 
 test_expect_success 'for_each_reflog_ent_reverse()' '
 	$RUN for-each-reflog-ent-reverse HEAD >actual &&
-	head -n1 actual | grep recreate-main &&
-	tail -n2 actual | head -n1 | grep one
+	tail -n1 actual | grep one
 '
 
 test_expect_success 'reflog_exists(HEAD)' '
diff --git a/t/t1406-submodule-ref-store.sh b/t/t1406-submodule-ref-store.sh
index 0a87058971e..b0365c1fee0 100755
--- a/t/t1406-submodule-ref-store.sh
+++ b/t/t1406-submodule-ref-store.sh
@@ -74,13 +74,13 @@  test_expect_success 'for_each_reflog()' '
 test_expect_success 'for_each_reflog_ent()' '
 	$RUN for-each-reflog-ent HEAD >actual &&
 	head -n1 actual | grep first &&
-	tail -n2 actual | head -n1 | grep main.to.new
+	tail -n1 actual | grep main.to.new
 '
 
 test_expect_success 'for_each_reflog_ent_reverse()' '
 	$RUN for-each-reflog-ent-reverse HEAD >actual &&
 	head -n1 actual | grep main.to.new &&
-	tail -n2 actual | head -n1 | grep first
+	tail -n1 actual | grep first
 '
 
 test_expect_success 'reflog_exists(HEAD)' '