diff mbox series

[6/7] refs: allow multiple reflog entries for the same refname

Message ID 20241209-320-git-refs-migrate-reflogs-v1-6-d4bc37ee860f@gmail.com (mailing list archive)
State Superseded
Headers show
Series refs: add reflog support to `git refs migrate` | expand

Commit Message

Karthik Nayak Dec. 9, 2024, 11:07 a.m. UTC
The reference transaction only allows a update for a given reference to
avoid conflicts. This, however, isn't an issue for reflogs. There are no
conflicts to be resolved in reflogs and when migrating reflogs between
backends we'd have multiple reflog entries for the same refname.

So allow multiple reflog updates within a single transaction. Also the
reflog creation logic isn't exposed to the end user. While this might
change in the future, currently, this reduces the scope of issues to
think about.

This is required to add reflog migration support to `git refs migrate`
which currently doesn't support it.

Signed-off-by: Karthik Nayak <karthik.188@gmail.com>
---
 refs/files-backend.c    | 15 +++++++++++----
 refs/reftable-backend.c | 16 +++++++++++++---
 2 files changed, 24 insertions(+), 7 deletions(-)

Comments

Christian Couder Dec. 11, 2024, 10:44 a.m. UTC | #1
On Mon, Dec 9, 2024 at 12:11 PM Karthik Nayak <karthik.188@gmail.com> wrote:
>
> The reference transaction only allows a update for a given reference to

s/a update/an update/

or: s/a update/a single update/

> avoid conflicts. This, however, isn't an issue for reflogs. There are no
> conflicts to be resolved in reflogs and when migrating reflogs between
> backends we'd have multiple reflog entries for the same refname.


> @@ -1302,6 +1303,7 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>         struct ident_split committer_ident = {0};
>         size_t logs_nr = 0, logs_alloc = 0, i;
>         const char *committer_info;
> +       struct strintmap logs_ts;

Here a comment might help explain what logs_ts is used for.

>         int ret = 0;
>
>         committer_info = git_committer_info(0);
> @@ -1310,6 +1312,8 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>
>         QSORT(arg->updates, arg->updates_nr, transaction_update_cmp);
>
> +       strintmap_init(&logs_ts, ts);

I am not sure I understand what logs_ts is used for and why its
default value is set to ts.

Also ts is an uint64_t while the second argument to strintmap_init()
is an int. I wonder if it could be an issue especially on 32 bits
platforms.

>         reftable_writer_set_limits(writer, ts, ts);
>
>         for (i = 0; i < arg->updates_nr; i++) {
> @@ -1391,6 +1395,7 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>
>                         if (create_reflog) {
>                                 struct ident_split c;
> +                               uint64_t update_index;
>
>                                 ALLOC_GROW(logs, logs_nr + 1, logs_alloc);
>                                 log = &logs[logs_nr++];
> @@ -1405,7 +1410,11 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>                                 }
>
>                                 fill_reftable_log_record(log, &c);
> -                               log->update_index = ts;
> +
> +                               update_index = strintmap_get(&logs_ts, u->refname);
> +                               log->update_index = update_index;
> +                               strintmap_set(&logs_ts, u->refname, update_index+1);

s/update_index+1/update_index + 1/

Also is the 'update_index' var really needed or could we just do:

                               log->update_index =
strintmap_get(&logs_ts, u->refname);
                               strintmap_set(&logs_ts, u->refname,
log->update_index + 1);

?

>                                 log->refname = xstrdup(u->refname);
>                                 memcpy(log->value.update.new_hash,
>                                        u->new_oid.hash, GIT_MAX_RAWSZ);
Patrick Steinhardt Dec. 11, 2024, 2:26 p.m. UTC | #2
On Mon, Dec 09, 2024 at 12:07:20PM +0100, Karthik Nayak wrote:
> The reference transaction only allows a update for a given reference to
> avoid conflicts. This, however, isn't an issue for reflogs. There are no
> conflicts to be resolved in reflogs and when migrating reflogs between
> backends we'd have multiple reflog entries for the same refname.
> 
> So allow multiple reflog updates within a single transaction. Also the
> reflog creation logic isn't exposed to the end user. While this might
> change in the future, currently, this reduces the scope of issues to
> think about.
> 
> This is required to add reflog migration support to `git refs migrate`
> which currently doesn't support it.

Nit: the second half of this sentence starting with "which currently..."
feels rather pointless, as it's implicit in the first half. I'd just
drop it.

> Signed-off-by: Karthik Nayak <karthik.188@gmail.com>
> ---
>  refs/files-backend.c    | 15 +++++++++++----
>  refs/reftable-backend.c | 16 +++++++++++++---
>  2 files changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/refs/files-backend.c b/refs/files-backend.c
> index 32975e0fd7a03ab8ddf99c0a68af99921d3f5090..10fba1e97b967fbc04c62a0a6d7d9648ce1c51fb 100644
> --- a/refs/files-backend.c
> +++ b/refs/files-backend.c
> @@ -2612,6 +2612,9 @@ static int lock_ref_for_update(struct files_ref_store *refs,
>  
>  	update->backend_data = lock;
>  
> +	if (update->flags & REF_LOG_ONLY)
> +		goto out;
> +
>  	if (update->type & REF_ISSYMREF) {
>  		if (update->flags & REF_NO_DEREF) {
>  			/*

Hm. Does this mean that we don't lock at all for REF_LOG_ONLY updates?
Reflogs themselves have no lockfile, so isn't it mandatory that we lock
the corresponding ref like we used to do? Otherwise I cannot see how we
avoid two concurrent writers to the same reflog.

> @@ -3036,8 +3042,9 @@ static int files_transaction_finish_initial(struct files_ref_store *refs,
>  
>  	/* Fail if a refname appears more than once in the transaction: */
>  	for (i = 0; i < transaction->nr; i++)
> -		string_list_append(&affected_refnames,
> -				   transaction->updates[i]->refname);
> +		if (!(transaction->updates[i]->flags & REF_LOG_ONLY))
> +			string_list_append(&affected_refnames,
> +					   transaction->updates[i]->refname);
>  	string_list_sort(&affected_refnames);
>  	if (ref_update_reject_duplicates(&affected_refnames, err)) {
>  		ret = TRANSACTION_GENERIC_ERROR;

This on the other hand is sensible -- having multiple REF_LOG_ONLY
transactions queued is fine.

> diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c
> index b2e3ba877de9e59fea5a4d066eb13e60ef22a32b..d9d2e28122a00ddd7f835c35a5851e390761885b 100644
> --- a/refs/reftable-backend.c
> +++ b/refs/reftable-backend.c
> @@ -1405,7 +1410,11 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>  				}
>  
>  				fill_reftable_log_record(log, &c);
> -				log->update_index = ts;
> +
> +				update_index = strintmap_get(&logs_ts, u->refname);
> +				log->update_index = update_index;
> +				strintmap_set(&logs_ts, u->refname, update_index+1);

So we're now tracking update indices via another map in order to ensure
that the update index will be increased if we have multiple reflog
entries for the same refname. Can we avoid that overhead by instead just
having a global update index counter that increases for every single
reflog entry, regardless of whether we have multiple ones queued up for
the same reference?

I guess the result would be kind of weird as a single transaction with
multiple ref updates would now always contain N different update
indices. Maybe there's an alternative that allows us to reduce the cost,
like only doing this for REF_LOG_ONLY updates?

I'm mostly being careful because this here is the hot loop of writing
refs, so I don't want to regress performance.

Patrick
Karthik Nayak Dec. 12, 2024, 2:47 p.m. UTC | #3
Patrick Steinhardt <ps@pks.im> writes:

> On Mon, Dec 09, 2024 at 12:07:20PM +0100, Karthik Nayak wrote:
>> The reference transaction only allows a update for a given reference to
>> avoid conflicts. This, however, isn't an issue for reflogs. There are no
>> conflicts to be resolved in reflogs and when migrating reflogs between
>> backends we'd have multiple reflog entries for the same refname.
>>
>> So allow multiple reflog updates within a single transaction. Also the
>> reflog creation logic isn't exposed to the end user. While this might
>> change in the future, currently, this reduces the scope of issues to
>> think about.
>>
>> This is required to add reflog migration support to `git refs migrate`
>> which currently doesn't support it.
>
> Nit: the second half of this sentence starting with "which currently..."
> feels rather pointless, as it's implicit in the first half. I'd just
> drop it.
>

Will do.

>> Signed-off-by: Karthik Nayak <karthik.188@gmail.com>
>> ---
>>  refs/files-backend.c    | 15 +++++++++++----
>>  refs/reftable-backend.c | 16 +++++++++++++---
>>  2 files changed, 24 insertions(+), 7 deletions(-)
>>
>> diff --git a/refs/files-backend.c b/refs/files-backend.c
>> index 32975e0fd7a03ab8ddf99c0a68af99921d3f5090..10fba1e97b967fbc04c62a0a6d7d9648ce1c51fb 100644
>> --- a/refs/files-backend.c
>> +++ b/refs/files-backend.c
>> @@ -2612,6 +2612,9 @@ static int lock_ref_for_update(struct files_ref_store *refs,
>>
>>  	update->backend_data = lock;
>>
>> +	if (update->flags & REF_LOG_ONLY)
>> +		goto out;
>> +
>>  	if (update->type & REF_ISSYMREF) {
>>  		if (update->flags & REF_NO_DEREF) {
>>  			/*
>
> Hm. Does this mean that we don't lock at all for REF_LOG_ONLY updates?
> Reflogs themselves have no lockfile, so isn't it mandatory that we lock
> the corresponding ref like we used to do? Otherwise I cannot see how we
> avoid two concurrent writers to the same reflog.
>

No it doesn't, this is after the lock is obtained. We simply exit early
since for reflog only updates, there is nothing further to do.

>> @@ -3036,8 +3042,9 @@ static int files_transaction_finish_initial(struct files_ref_store *refs,
>>
>>  	/* Fail if a refname appears more than once in the transaction: */
>>  	for (i = 0; i < transaction->nr; i++)
>> -		string_list_append(&affected_refnames,
>> -				   transaction->updates[i]->refname);
>> +		if (!(transaction->updates[i]->flags & REF_LOG_ONLY))
>> +			string_list_append(&affected_refnames,
>> +					   transaction->updates[i]->refname);
>>  	string_list_sort(&affected_refnames);
>>  	if (ref_update_reject_duplicates(&affected_refnames, err)) {
>>  		ret = TRANSACTION_GENERIC_ERROR;
>
> This on the other hand is sensible -- having multiple REF_LOG_ONLY
> transactions queued is fine.
>
>> diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c
>> index b2e3ba877de9e59fea5a4d066eb13e60ef22a32b..d9d2e28122a00ddd7f835c35a5851e390761885b 100644
>> --- a/refs/reftable-backend.c
>> +++ b/refs/reftable-backend.c
>> @@ -1405,7 +1410,11 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>>  				}
>>
>>  				fill_reftable_log_record(log, &c);
>> -				log->update_index = ts;
>> +
>> +				update_index = strintmap_get(&logs_ts, u->refname);
>> +				log->update_index = update_index;
>> +				strintmap_set(&logs_ts, u->refname, update_index+1);
>
> So we're now tracking update indices via another map in order to ensure
> that the update index will be increased if we have multiple reflog
> entries for the same refname. Can we avoid that overhead by instead just
> having a global update index counter that increases for every single
> reflog entry, regardless of whether we have multiple ones queued up for
> the same reference?
>
> I guess the result would be kind of weird as a single transaction with
> multiple ref updates would now always contain N different update
> indices. Maybe there's an alternative that allows us to reduce the cost,
> like only doing this for REF_LOG_ONLY updates?
>
> I'm mostly being careful because this here is the hot loop of writing
> refs, so I don't want to regress performance.

Thanks for bringing this up. I was thinking hard about this for a while.
I also did some local benchmarking, for 10000 atomic writes, I couldn't
find a note-worthy regression.

But I really like the point you made about how this could probably use a
counter. I think we can use `u->index`. The index field was added to
ensure that the logs stay in order when we sort inside
`write_transaction_table()`. But we can use the same here. We can simply
do

    log->update_index = ts + u->index;

This would increment the update_index as needed and also only kick in
when the `index` itself is set. Which would be only for reflog migration
between backends.

I think this would work well.

Karthik
Karthik Nayak Dec. 12, 2024, 2:52 p.m. UTC | #4
Christian Couder <christian.couder@gmail.com> writes:

> On Mon, Dec 9, 2024 at 12:11 PM Karthik Nayak <karthik.188@gmail.com> wrote:
>>
>> The reference transaction only allows a update for a given reference to
>
> s/a update/an update/
>
> or: s/a update/a single update/

'a single update' sounds the best here, will add.

>> avoid conflicts. This, however, isn't an issue for reflogs. There are no
>> conflicts to be resolved in reflogs and when migrating reflogs between
>> backends we'd have multiple reflog entries for the same refname.
>
>
>> @@ -1302,6 +1303,7 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>>         struct ident_split committer_ident = {0};
>>         size_t logs_nr = 0, logs_alloc = 0, i;
>>         const char *committer_info;
>> +       struct strintmap logs_ts;
>
> Here a comment might help explain what logs_ts is used for.
>

I think with Patricks comment, this whole code will be removed for
something simpler.

>>         int ret = 0;
>>
>>         committer_info = git_committer_info(0);
>> @@ -1310,6 +1312,8 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>>
>>         QSORT(arg->updates, arg->updates_nr, transaction_update_cmp);
>>
>> +       strintmap_init(&logs_ts, ts);
>
> I am not sure I understand what logs_ts is used for and why its
> default value is set to ts.
>

The reason I added this was because in the reftable backend, the writer
sorts logs before writing. So if the multiple reflogs contained the same
update_index, their order might be changed. But for migrating reflogs,
we need to ensure we maintain the order. Using a map here, allowed us to
increment the update_index for reflogs for a given refname.

> Also ts is an uint64_t while the second argument to strintmap_init()
> is an int. I wonder if it could be an issue especially on 32 bits
> platforms.
>

This is fair point, I decided to scrap this ultimately and simply append
`u->index` to the update_index. Which would provide the same desired
effect.

>>         reftable_writer_set_limits(writer, ts, ts);
>>
>>         for (i = 0; i < arg->updates_nr; i++) {
>> @@ -1391,6 +1395,7 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>>
>>                         if (create_reflog) {
>>                                 struct ident_split c;
>> +                               uint64_t update_index;
>>
>>                                 ALLOC_GROW(logs, logs_nr + 1, logs_alloc);
>>                                 log = &logs[logs_nr++];
>> @@ -1405,7 +1410,11 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data
>>                                 }
>>
>>                                 fill_reftable_log_record(log, &c);
>> -                               log->update_index = ts;
>> +
>> +                               update_index = strintmap_get(&logs_ts, u->refname);
>> +                               log->update_index = update_index;
>> +                               strintmap_set(&logs_ts, u->refname, update_index+1);
>
> s/update_index+1/update_index + 1/
>
> Also is the 'update_index' var really needed or could we just do:
>
>                                log->update_index =
> strintmap_get(&logs_ts, u->refname);
>                                strintmap_set(&logs_ts, u->refname,
> log->update_index + 1);
>
> ?
>

The temp variable can be removed here indeed. But I'll remove all of
this in the next version. Thanks

>>                                 log->refname = xstrdup(u->refname);
>>                                 memcpy(log->value.update.new_hash,
>>                                        u->new_oid.hash, GIT_MAX_RAWSZ);
diff mbox series

Patch

diff --git a/refs/files-backend.c b/refs/files-backend.c
index 32975e0fd7a03ab8ddf99c0a68af99921d3f5090..10fba1e97b967fbc04c62a0a6d7d9648ce1c51fb 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -2612,6 +2612,9 @@  static int lock_ref_for_update(struct files_ref_store *refs,
 
 	update->backend_data = lock;
 
+	if (update->flags & REF_LOG_ONLY)
+		goto out;
+
 	if (update->type & REF_ISSYMREF) {
 		if (update->flags & REF_NO_DEREF) {
 			/*
@@ -2830,13 +2833,16 @@  static int files_transaction_prepare(struct ref_store *ref_store,
 	 */
 	for (i = 0; i < transaction->nr; i++) {
 		struct ref_update *update = transaction->updates[i];
-		struct string_list_item *item =
-			string_list_append(&affected_refnames, update->refname);
+		struct string_list_item *item;
 
 		if ((update->flags & REF_IS_PRUNING) &&
 		    !(update->flags & REF_NO_DEREF))
 			BUG("REF_IS_PRUNING set without REF_NO_DEREF");
 
+		if (update->flags & REF_LOG_ONLY)
+			continue;
+
+		item = string_list_append(&affected_refnames, update->refname);
 		/*
 		 * We store a pointer to update in item->util, but at
 		 * the moment we never use the value of this field
@@ -3036,8 +3042,9 @@  static int files_transaction_finish_initial(struct files_ref_store *refs,
 
 	/* Fail if a refname appears more than once in the transaction: */
 	for (i = 0; i < transaction->nr; i++)
-		string_list_append(&affected_refnames,
-				   transaction->updates[i]->refname);
+		if (!(transaction->updates[i]->flags & REF_LOG_ONLY))
+			string_list_append(&affected_refnames,
+					   transaction->updates[i]->refname);
 	string_list_sort(&affected_refnames);
 	if (ref_update_reject_duplicates(&affected_refnames, err)) {
 		ret = TRANSACTION_GENERIC_ERROR;
diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c
index b2e3ba877de9e59fea5a4d066eb13e60ef22a32b..d9d2e28122a00ddd7f835c35a5851e390761885b 100644
--- a/refs/reftable-backend.c
+++ b/refs/reftable-backend.c
@@ -990,8 +990,9 @@  static int reftable_be_transaction_prepare(struct ref_store *ref_store,
 		if (ret)
 			goto done;
 
-		string_list_append(&affected_refnames,
-				   transaction->updates[i]->refname);
+		if (!(transaction->updates[i]->flags & REF_LOG_ONLY))
+			string_list_append(&affected_refnames,
+					   transaction->updates[i]->refname);
 	}
 
 	/*
@@ -1302,6 +1303,7 @@  static int write_transaction_table(struct reftable_writer *writer, void *cb_data
 	struct ident_split committer_ident = {0};
 	size_t logs_nr = 0, logs_alloc = 0, i;
 	const char *committer_info;
+	struct strintmap logs_ts;
 	int ret = 0;
 
 	committer_info = git_committer_info(0);
@@ -1310,6 +1312,8 @@  static int write_transaction_table(struct reftable_writer *writer, void *cb_data
 
 	QSORT(arg->updates, arg->updates_nr, transaction_update_cmp);
 
+	strintmap_init(&logs_ts, ts);
+
 	reftable_writer_set_limits(writer, ts, ts);
 
 	for (i = 0; i < arg->updates_nr; i++) {
@@ -1391,6 +1395,7 @@  static int write_transaction_table(struct reftable_writer *writer, void *cb_data
 
 			if (create_reflog) {
 				struct ident_split c;
+				uint64_t update_index;
 
 				ALLOC_GROW(logs, logs_nr + 1, logs_alloc);
 				log = &logs[logs_nr++];
@@ -1405,7 +1410,11 @@  static int write_transaction_table(struct reftable_writer *writer, void *cb_data
 				}
 
 				fill_reftable_log_record(log, &c);
-				log->update_index = ts;
+
+				update_index = strintmap_get(&logs_ts, u->refname);
+				log->update_index = update_index;
+				strintmap_set(&logs_ts, u->refname, update_index+1);
+
 				log->refname = xstrdup(u->refname);
 				memcpy(log->value.update.new_hash,
 				       u->new_oid.hash, GIT_MAX_RAWSZ);
@@ -1476,6 +1485,7 @@  static int write_transaction_table(struct reftable_writer *writer, void *cb_data
 
 done:
 	assert(ret != REFTABLE_API_ERROR);
+	strintmap_clear(&logs_ts);
 	for (i = 0; i < logs_nr; i++)
 		reftable_log_record_release(&logs[i]);
 	free(logs);