diff mbox series

[10/16] mktree: overwrite duplicate entries

Message ID b59a4ad8ab4b0e47373f811700eba59141fdc6c6.1718130288.git.gitgitgadget@gmail.com (mailing list archive)
State New
Headers show
Series mktree: support more flexible usage | expand

Commit Message

Victoria Dye June 11, 2024, 6:24 p.m. UTC
From: Victoria Dye <vdye@github.com>

If multiple tree entries with the same name are provided as input to
'mktree', only write the last one to the tree. Entries are considered
duplicates if they have identical names (*not* considering mode); if a blob
and a tree with the same name are provided, only the last one will be
written to the tree. A tree with duplicate entries is invalid (per 'git
fsck'), so that condition should be avoided wherever possible.

Signed-off-by: Victoria Dye <vdye@github.com>
---
 Documentation/git-mktree.txt |  8 ++++---
 builtin/mktree.c             | 45 ++++++++++++++++++++++++++++++++----
 t/t1010-mktree.sh            | 36 +++++++++++++++++++++++++++--
 3 files changed, 80 insertions(+), 9 deletions(-)

Comments

Patrick Steinhardt June 12, 2024, 9:40 a.m. UTC | #1
On Tue, Jun 11, 2024 at 06:24:42PM +0000, Victoria Dye via GitGitGadget wrote:
> From: Victoria Dye <vdye@github.com>
> 
> If multiple tree entries with the same name are provided as input to
> 'mktree', only write the last one to the tree. Entries are considered
> duplicates if they have identical names (*not* considering mode); if a blob
> and a tree with the same name are provided, only the last one will be
> written to the tree. A tree with duplicate entries is invalid (per 'git
> fsck'), so that condition should be avoided wherever possible.
> 
> Signed-off-by: Victoria Dye <vdye@github.com>
> ---
>  Documentation/git-mktree.txt |  8 ++++---
>  builtin/mktree.c             | 45 ++++++++++++++++++++++++++++++++----
>  t/t1010-mktree.sh            | 36 +++++++++++++++++++++++++++--
>  3 files changed, 80 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/git-mktree.txt b/Documentation/git-mktree.txt
> index fb07e40cef0..afbc846d077 100644
> --- a/Documentation/git-mktree.txt
> +++ b/Documentation/git-mktree.txt
> @@ -43,9 +43,11 @@ OPTIONS
>  INPUT FORMAT
>  ------------
>  Tree entries may be specified in any of the formats compatible with the
> -`--index-info` option to linkgit:git-update-index[1]. The order of the tree
> -entries is normalized by `mktree` so pre-sorting the input by path is not
> -required.
> +`--index-info` option to linkgit:git-update-index[1].
> +
> +The order of the tree entries is normalized by `mktree` so pre-sorting the input
> +by path is not required. Multiple entries provided with the same path are
> +deduplicated, with only the last one specified added to the tree.

Hm. I'm not sure whether this is a good idea. With git-mktree(1) being
part of our plumbing layer, you can expect that it's mostly going to be
fed input from scripts. And any script that generates duplicate tree
entries is broken, but we now start to paper over such brokenness
without giving the user any indicator of this. As user of git-mktree(1)
in Gitaly I can certainly say that I'd rather want to see it die instead
of silently fixing my inputs so that I start to notice my own bugs.

So without seeing a strong motivating usecase for this feature I'd think
that git-mktree(1) should reject such inputs and return an error such
that the user can fix their tooling.

Patrick
Victoria Dye June 12, 2024, 6:48 p.m. UTC | #2
Patrick Steinhardt wrote:
> On Tue, Jun 11, 2024 at 06:24:42PM +0000, Victoria Dye via GitGitGadget wrote:
>> From: Victoria Dye <vdye@github.com>
>>
>> If multiple tree entries with the same name are provided as input to
>> 'mktree', only write the last one to the tree. Entries are considered
>> duplicates if they have identical names (*not* considering mode); if a blob
>> and a tree with the same name are provided, only the last one will be
>> written to the tree. A tree with duplicate entries is invalid (per 'git
>> fsck'), so that condition should be avoided wherever possible.
>>
>> Signed-off-by: Victoria Dye <vdye@github.com>
>> ---
>>  Documentation/git-mktree.txt |  8 ++++---
>>  builtin/mktree.c             | 45 ++++++++++++++++++++++++++++++++----
>>  t/t1010-mktree.sh            | 36 +++++++++++++++++++++++++++--
>>  3 files changed, 80 insertions(+), 9 deletions(-)
>>
>> diff --git a/Documentation/git-mktree.txt b/Documentation/git-mktree.txt
>> index fb07e40cef0..afbc846d077 100644
>> --- a/Documentation/git-mktree.txt
>> +++ b/Documentation/git-mktree.txt
>> @@ -43,9 +43,11 @@ OPTIONS
>>  INPUT FORMAT
>>  ------------
>>  Tree entries may be specified in any of the formats compatible with the
>> -`--index-info` option to linkgit:git-update-index[1]. The order of the tree
>> -entries is normalized by `mktree` so pre-sorting the input by path is not
>> -required.
>> +`--index-info` option to linkgit:git-update-index[1].
>> +
>> +The order of the tree entries is normalized by `mktree` so pre-sorting the input
>> +by path is not required. Multiple entries provided with the same path are
>> +deduplicated, with only the last one specified added to the tree.
> 
> Hm. I'm not sure whether this is a good idea. With git-mktree(1) being
> part of our plumbing layer, you can expect that it's mostly going to be
> fed input from scripts. And any script that generates duplicate tree
> entries is broken, but we now start to paper over such brokenness
> without giving the user any indicator of this. As user of git-mktree(1)
> in Gitaly I can certainly say that I'd rather want to see it die instead
> of silently fixing my inputs so that I start to notice my own bugs.

'git mktree' already does some cleaning of the inputs by sorting the
entries, presumably so that a valid tree is created rather than one with
ordering errors. Deduplication is also a cleanup of user inputs to ensure a
valid tree is created, so to me it's a consistent extension to existing
behavior. Conversely, rejecting the inputs and failing would be introducing
an error scenario where none existed previously, which to me would be a
bigger deviation.

One potential way to get the kind of functionality you're looking for,
though, might be to combine something like '--literally' and a '--strict'
that validates the tree before writing. Like I mentioned in the cover letter
[1], I do plan to submit a follow-up series with '--strict' (it's just that
this series is already pretty long and it would add 4-ish more patches). 

[1] https://lore.kernel.org/git/pull.1746.git.1718130288.gitgitgadget@gmail.com/

> So without seeing a strong motivating usecase for this feature I'd think
> that git-mktree(1) should reject such inputs and return an error such
> that the user can fix their tooling.

Practically, there are a couple of reasons that led me to wanting this
behavior. One is that it allows using data structures with more rigid
integrity checks (like the index & cache tree). The other is that, once the
ability to add nested entries is introduced, the concept of a "duplicate"
gets fuzzier and blocking them entirely could lead to inconsistencies and/or
limited flexibility. If, for example a user wants to create a tree with a
directory 'folder1/' with OID '0123456789012345678901234567890123456789',
but update a blob 'folder1/file1' in it to OID
'0987654321098765432109876543210987654321', the latter is technically a
"duplicate" but rejecting it would avoid being able to create the tree
without first expanding 'folder1/'with something like 'ls-tree', replacing the
appropriate entry, then calling 'mktree'.

> 
> Patrick
diff mbox series

Patch

diff --git a/Documentation/git-mktree.txt b/Documentation/git-mktree.txt
index fb07e40cef0..afbc846d077 100644
--- a/Documentation/git-mktree.txt
+++ b/Documentation/git-mktree.txt
@@ -43,9 +43,11 @@  OPTIONS
 INPUT FORMAT
 ------------
 Tree entries may be specified in any of the formats compatible with the
-`--index-info` option to linkgit:git-update-index[1]. The order of the tree
-entries is normalized by `mktree` so pre-sorting the input by path is not
-required.
+`--index-info` option to linkgit:git-update-index[1].
+
+The order of the tree entries is normalized by `mktree` so pre-sorting the input
+by path is not required. Multiple entries provided with the same path are
+deduplicated, with only the last one specified added to the tree.
 
 GIT
 ---
diff --git a/builtin/mktree.c b/builtin/mktree.c
index 29e9dc6ce69..e9e2134136f 100644
--- a/builtin/mktree.c
+++ b/builtin/mktree.c
@@ -15,6 +15,9 @@ 
 #include "object-store-ll.h"
 
 struct tree_entry {
+	/* Internal */
+	size_t order;
+
 	unsigned mode;
 	struct object_id oid;
 	int len;
@@ -72,15 +75,49 @@  static void append_to_tree(unsigned mode, struct object_id *oid, const char *pat
 	oidcpy(&ent->oid, oid);
 
 	/* Append the update */
+	ent->order = arr->nr;
 	tree_entry_array_push(arr, ent);
 }
 
-static int ent_compare(const void *a_, const void *b_)
+static int ent_compare(const void *a_, const void *b_, void *ctx)
 {
+	int cmp;
 	struct tree_entry *a = *(struct tree_entry **)a_;
 	struct tree_entry *b = *(struct tree_entry **)b_;
-	return base_name_compare(a->name, a->len, a->mode,
-				 b->name, b->len, b->mode);
+	int ignore_mode = *((int *)ctx);
+
+	if (ignore_mode)
+		cmp = name_compare(a->name, a->len, b->name, b->len);
+	else
+		cmp = base_name_compare(a->name, a->len, a->mode,
+					b->name, b->len, b->mode);
+	return cmp ? cmp : b->order - a->order;
+}
+
+static void sort_and_dedup_tree_entry_array(struct tree_entry_array *arr)
+{
+	size_t count = arr->nr;
+	struct tree_entry *prev = NULL;
+
+	int ignore_mode = 1;
+	QSORT_S(arr->entries, arr->nr, ent_compare, &ignore_mode);
+
+	arr->nr = 0;
+	for (size_t i = 0; i < count; i++) {
+		struct tree_entry *curr = arr->entries[i];
+		if (prev &&
+		    !name_compare(prev->name, prev->len,
+				  curr->name, curr->len)) {
+			FREE_AND_NULL(curr);
+		} else {
+			arr->entries[arr->nr++] = curr;
+			prev = curr;
+		}
+	}
+
+	/* Sort again to order the entries for tree insertion */
+	ignore_mode = 0;
+	QSORT_S(arr->entries, arr->nr, ent_compare, &ignore_mode);
 }
 
 static void write_tree(struct tree_entry_array *arr, struct object_id *oid)
@@ -88,7 +125,7 @@  static void write_tree(struct tree_entry_array *arr, struct object_id *oid)
 	struct strbuf buf;
 	size_t size = 0;
 
-	QSORT(arr->entries, arr->nr, ent_compare);
+	sort_and_dedup_tree_entry_array(arr);
 	for (size_t i = 0; i < arr->nr; i++)
 		size += 32 + arr->entries[i]->len;
 
diff --git a/t/t1010-mktree.sh b/t/t1010-mktree.sh
index e0263cb2bf8..956692347f0 100755
--- a/t/t1010-mktree.sh
+++ b/t/t1010-mktree.sh
@@ -6,11 +6,16 @@  TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 test_expect_success setup '
-	for d in a a- a0
+	for d in folder folder- folder0
 	do
 		mkdir "$d" && echo "$d/one" >"$d/one" &&
 		git add "$d" || return 1
 	done &&
+	for f in before folder.txt later
+	do
+		echo "$f" >"$f" &&
+		git add "$f" || return 1
+	done &&
 	echo zero >one &&
 	git update-index --add --info-only one &&
 	git write-tree --missing-ok >tree.missing &&
@@ -175,7 +180,7 @@  test_expect_success '--literally can create invalid trees' '
 
 test_expect_success 'mktree validates path' '
 	tree_oid="$(cat tree)" &&
-	blob_oid="$(git rev-parse $tree_oid:a/one)" &&
+	blob_oid="$(git rev-parse $tree_oid:folder.txt)" &&
 	head_oid="$(git rev-parse HEAD)" &&
 
 	# Valid: tree with or without trailing slash, blob without trailing slash
@@ -206,4 +211,31 @@  test_expect_success 'mktree validates path' '
 	grep "invalid path ${SQ}.git/${SQ}" err
 '
 
+test_expect_success 'mktree with duplicate entries' '
+	tree_oid=$(cat tree) &&
+	folder_oid=$(git rev-parse ${tree_oid}:folder) &&
+	before_oid=$(git rev-parse ${tree_oid}:before) &&
+	head_oid=$(git rev-parse HEAD) &&
+
+	{
+		printf "100755 blob $before_oid\ttest\n" &&
+		printf "040000 tree $folder_oid\ttest-\n" &&
+		printf "160000 commit $head_oid\ttest.txt\n" &&
+		printf "040000 tree $folder_oid\ttest\n" &&
+		printf "100644 blob $before_oid\ttest0\n" &&
+		printf "160000 commit $head_oid\ttest-\n"
+	} >top.dup &&
+	git mktree <top.dup >tree.actual &&
+
+	{
+		printf "160000 commit $head_oid\ttest-\n" &&
+		printf "160000 commit $head_oid\ttest.txt\n" &&
+		printf "040000 tree $folder_oid\ttest\n" &&
+		printf "100644 blob $before_oid\ttest0\n"
+	} >expect &&
+	git ls-tree $(cat tree.actual) >actual &&
+
+	test_cmp expect actual
+'
+
 test_done