@@ -374,6 +374,15 @@ breaking most of the collisions from a similarly-named file appearing in
many different directories. At the moment, this version is not allowed
when writing reachability bitmap files with `--write-bitmap-index` and it
will be automatically changed to version `1`.
++
+The name hash version `3` abandons the locality features of versions `1`
+and `2` in favor of minimizing collisions. The goal here is to separate
+objects by their full path and abandon hope for cross-path delta
+compression. For this reason, this option is preferred for repacking large
+repositories with many versions and many name hash collisions when using
+the first two versions. At the moment, this version is not allowed when
+writing reachability bitmap files with `--write-bitmap-index` and it will
+be automatically changed to version `1`.
DELTA ISLANDS
@@ -270,7 +270,7 @@ static int name_hash_version = -1;
static void validate_name_hash_version(void)
{
- if (name_hash_version < 1 || name_hash_version > 2)
+ if (name_hash_version < 1 || name_hash_version > 3)
die(_("invalid --name-hash-version option: %d"), name_hash_version);
}
@@ -292,6 +292,9 @@ static inline uint32_t pack_name_hash_fn(const char *name)
case 2:
return pack_name_hash_v2((const unsigned char *)name);
+ case 3:
+ return pack_name_hash_v3(name);
+
default:
BUG("invalid name-hash version: %d", name_hash_version);
}
@@ -235,6 +235,32 @@ static inline uint32_t pack_name_hash_v2(const unsigned char *name)
return (base >> 6) ^ hash;
}
+static inline uint32_t pack_name_hash_v3(const char *name)
+{
+ /*
+ * This 'bigp' value is a large prime, at least 25% of the max
+ * value of an uint32_t. Multiplying by this value (modulo 2^32)
+ * causes the 32 bits to change pseudo-randomly.
+ */
+ const uint32_t bigp = 1234572167U;
+ uint32_t c, hash = bigp;
+
+ if (!name)
+ return 0;
+
+ /*
+ * Do the simplest thing that will resemble pseudo-randomness: add
+ * random multiples of a large prime number with a binary shift.
+ * The goal is not to be cryptographic, but to be generally
+ * uniformly distributed.
+ */
+ while ((c = *name++) != 0) {
+ hash += c * bigp;
+ hash = (hash >> 5) | (hash << 27);
+ }
+ return hash;
+}
+
static inline enum object_type oe_type(const struct object_entry *e)
{
return e->type_valid ? e->type_ : OBJ_BAD;
@@ -15,6 +15,7 @@ int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
while (!strbuf_getline(&line, stdin)) {
printf("%10u ", pack_name_hash(line.buf));
printf("%10u ", pack_name_hash_v2((unsigned const char *)line.buf));
+ printf("%10u ", pack_name_hash_v3(line.buf));
printf("%s\n", line.buf);
}
@@ -25,7 +25,7 @@ test_expect_success 'create rev input' '
EOF
'
-for version in 1 2
+for version in 1 2 3
do
export version
@@ -14,7 +14,7 @@ test_size 'paths at head' '
test-tool name-hash <path-list >name-hashes
'
-for version in 1 2
+for version in 1 2 3
do
test_size "distinct hash value: v$version" '
awk "{ print \$$version; }" <name-hashes | sort | \
@@ -680,13 +680,13 @@ test_expect_success 'valid and invalid --name-hash-versions' '
# Valid values are hard to verify other than "do not fail".
# Performance tests will be more valuable to validate these versions.
# Negative values are converted to version 1.
- for value in -1 1 2
+ for value in -1 1 2 3
do
git pack-objects base --all --name-hash-version=$value || return 1
done &&
# Invalid values have clear post-conditions.
- for value in 0 3
+ for value in 0 4
do
test_must_fail git pack-objects base --all --name-hash-version=$value 2>err &&
test_grep "invalid --name-hash-version option" err || return 1
@@ -45,13 +45,13 @@ test_expect_success 'name-hash value stability' '
test-tool name-hash <names >out &&
cat >expect <<-\EOF &&
- 2582249472 1763573760 first
- 2289942528 1188134912 second
- 2300837888 1130758144 third
- 2544516325 3963087891 a/one-long-enough-for-collisions
- 2544516325 4013419539 b/two-long-enough-for-collisions
- 1420111091 1709547268 many/parts/to/this/path/enough/to/collide/in/v2
- 1420111091 1709547268 enough/parts/to/this/path/enough/to/collide/in/v2
+ 2582249472 1763573760 3109209818 first
+ 2289942528 1188134912 3781118409 second
+ 2300837888 1130758144 3028707182 third
+ 2544516325 3963087891 3586976147 a/one-long-enough-for-collisions
+ 2544516325 4013419539 1701624798 b/two-long-enough-for-collisions
+ 1420111091 1709547268 2676129939 many/parts/to/this/path/enough/to/collide/in/v2
+ 1420111091 1709547268 2740459187 enough/parts/to/this/path/enough/to/collide/in/v2
EOF
test_cmp expect out
@@ -516,7 +516,18 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas' '
# Exercise to make sure it works. Git will not fetch anything from the
# promisor remote other than for the big tree (because it needs to
# resolve the delta).
- GIT_TRACE_PACKET="$(pwd)/trace" git -C client \
+ #
+ # TODO: the --name-hash-version option is disabled here, since this test
+ # is fundamentally broken! When GIT_TEST_NAME_HASH_VERSION=3, the server
+ # recognizes delta bases in a different way and then sends a _blob_ to
+ # the client with a delta base that the client does not have! This is
+ # because the client is cloned from "promisor-server" with tree:0 but
+ # is now fetching from "server" without any filter. This is violating the
+ # promise to the server that all reachable objects exist and could be
+ # used as delta bases!
+ GIT_TRACE_PACKET="$(pwd)/trace" \
+ GIT_TEST_NAME_HASH_VERSION=1 \
+ git -C client \
fetch "file://$(pwd)/server" main &&
# Verify the assumption that the client needed to fetch the delta base
@@ -535,7 +546,18 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas, protocol v2' '
# Exercise to make sure it works. Git will not fetch anything from the
# promisor remote other than for the big blob (because it needs to
# resolve the delta).
- GIT_TRACE_PACKET="$(pwd)/trace" git -C client \
+ #
+ # TODO: the --name-hash-verion option is disabled here, since this test
+ # is fundamentally broken! When GIT_TEST_NAME_HASH_VERSION=3, the server
+ # recognizes delta bases in a different way and then sends a _blob_ to
+ # the client with a delta base that the client does not have! This is
+ # because the client is cloned from "promisor-server" with tree:0 but
+ # is now fetching from "server" without any filter. This is violating the
+ # promise to the server that all reachable objects exist and could be
+ # used as delta bases!
+ GIT_TRACE_PACKET="$(pwd)/trace" \
+ GIT_TEST_NAME_HASH_VERSION=1 \
+ git -C client \
fetch "file://$(pwd)/server" main &&
# Verify that protocol version 2 was used.