@@ -15,7 +15,8 @@ SYNOPSIS
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
[--cruft] [--cruft-expiration=<time>]
[--stdout [--filter=<filter-spec>] | <base-name>]
- [--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
+ [--shallow] [--keep-true-parents] [--[no-]sparse]
+ [--name-hash-version=<n>] < <object-list>
DESCRIPTION
@@ -345,6 +346,35 @@ raise an error.
Restrict delta matches based on "islands". See DELTA ISLANDS
below.
+--name-hash-version=<n>::
+ While performing delta compression, Git groups objects that may be
+ similar based on heuristics using the path to that object. While
+ grouping objects by an exact path match is good for paths with
+ many versions, there are benefits for finding delta pairs across
+ different full paths. Git collects objects by type and then by a
+ "name hash" of the path and then by size, hoping to group objects
+ that will compress well together.
++
+The default name hash version is `1`, which prioritizes hash locality by
+considering the final bytes of the path as providing the maximum magnitude
+to the hash function. This version excels at distinguishing short paths
+and finding renames across directories. However, the hash function depends
+primarily on the final 16 bytes of the path. If there are many paths in
+the repo that have the same final 16 bytes and differ only by parent
+directory, then this name-hash may lead to too many collisions and cause
+poor results. At the moment, this version is required when writing
+reachability bitmap files with `--write-bitmap-index`.
++
+The name hash version `2` has similar locality features as version `1`,
+except it considers each path component separately and overlays the hashes
+with a shift. This still prioritizes the final bytes of the path, but also
+"salts" the lower bits of the hash using the parent directory names. This
+method allows for some of the locality benefits of version `1` while
+breaking most of the collisions from a similarly-named file appearing in
+many different directories. At the moment, this version is not allowed
+when writing reachability bitmap files with `--write-bitmap-index` and it
+will be automatically changed to version `1`.
+
DELTA ISLANDS
-------------
@@ -266,6 +266,28 @@ struct configured_exclusion {
static struct oidmap configured_exclusions;
static struct oidset excluded_by_config;
+static int name_hash_version = 1;
+
+static void validate_name_hash_version(void)
+{
+ if (name_hash_version < 1 || name_hash_version > 2)
+ die(_("invalid --name-hash-version option: %d"), name_hash_version);
+}
+
+static inline uint32_t pack_name_hash_fn(const char *name)
+{
+ switch (name_hash_version)
+ {
+ case 1:
+ return pack_name_hash(name);
+
+ case 2:
+ return pack_name_hash_v2((const unsigned char *)name);
+
+ default:
+ BUG("invalid name-hash version: %d", name_hash_version);
+ }
+}
/*
* stats
@@ -1698,7 +1720,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
return 0;
}
- create_object_entry(oid, type, pack_name_hash(name),
+ create_object_entry(oid, type, pack_name_hash_fn(name),
exclude, name && no_try_delta(name),
found_pack, found_offset);
return 1;
@@ -1912,7 +1934,7 @@ static void add_preferred_base_object(const char *name)
{
struct pbase_tree *it;
size_t cmplen;
- unsigned hash = pack_name_hash(name);
+ unsigned hash = pack_name_hash_fn(name);
if (!num_preferred_base || check_pbase_path(hash))
return;
@@ -3422,7 +3444,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
* here using a now in order to perhaps improve the delta selection
* process.
*/
- oe->hash = pack_name_hash(name);
+ oe->hash = pack_name_hash_fn(name);
oe->no_try_delta = name && no_try_delta(name);
stdin_packs_hints_nr++;
@@ -3572,7 +3594,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
entry = packlist_find(&to_pack, oid);
if (entry) {
if (name) {
- entry->hash = pack_name_hash(name);
+ entry->hash = pack_name_hash_fn(name);
entry->no_try_delta = no_try_delta(name);
}
} else {
@@ -3595,7 +3617,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
return;
}
- entry = create_object_entry(oid, type, pack_name_hash(name),
+ entry = create_object_entry(oid, type, pack_name_hash_fn(name),
0, name && no_try_delta(name),
pack, offset);
}
@@ -4069,6 +4091,15 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
if (!(bitmap_git = prepare_bitmap_walk(revs, 0)))
return -1;
+ /*
+ * For now, force the name-hash version to be 1 since that
+ * is the version implied by the bitmap format. Later, the
+ * format can include this version explicitly in its format,
+ * allowing readers to know the version that was used during
+ * the bitmap write.
+ */
+ name_hash_version = 1;
+
if (pack_options_allow_reuse())
reuse_partial_packfile_from_bitmap(bitmap_git,
&reuse_packfiles,
@@ -4429,6 +4460,8 @@ int cmd_pack_objects(int argc,
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
N_("protocol"),
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
+ OPT_INTEGER(0, "name-hash-version", &name_hash_version,
+ N_("use the specified name-hash function to group similar objects")),
OPT_END(),
};
@@ -4576,6 +4609,12 @@ int cmd_pack_objects(int argc,
if (pack_to_stdout || !rev_list_all)
write_bitmap_index = 0;
+ validate_name_hash_version();
+ if (write_bitmap_index && name_hash_version != 1) {
+ warning(_("currently, --write-bitmap-index requires --name-hash-version=1"));
+ name_hash_version = 1;
+ }
+
if (use_delta_islands)
strvec_push(&rp, "--topo-order");
@@ -674,4 +674,35 @@ do
'
done
+test_expect_success 'valid and invalid --name-hash-versions' '
+ # Valid values are hard to verify other than "do not fail".
+ # Performance tests will be more valuable to validate these versions.
+ for value in 1 2
+ do
+ git pack-objects base --all --name-hash-version=$value || return 1
+ done &&
+
+ # Invalid values have clear post-conditions.
+ for value in -1 0 3
+ do
+ test_must_fail git pack-objects base --all --name-hash-version=$value 2>err &&
+ test_grep "invalid --name-hash-version option" err || return 1
+ done
+'
+
+# The following test is not necessarily a permanent choice, but since we do not
+# have a "name hash version" bit in the .bitmap file format, we cannot write the
+# hash values into the .bitmap file without risking breakage later.
+#
+# TODO: Make these compatible in the future and replace this test with the
+# expected behavior when both are specified.
+test_expect_success '--name-hash-version=2 and --write-bitmap-index are incompatible' '
+ git pack-objects base --all --name-hash-version=2 --write-bitmap-index 2>err &&
+ test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err &&
+
+ # --stdout option silently removes --write-bitmap-index
+ git pack-objects --stdout --all --name-hash-version=2 --write-bitmap-index >out 2>err &&
+ ! test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err
+'
+
test_done