diff mbox series

[1/7] diffcore-rename: use a mem_pool for exact rename detection's hashmap

Message ID 9f8ab62b84256be6d7d984d576ff4fda09d88a1d.1627044897.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Final optimization batch (#15): use memory pools | expand

Commit Message

Elijah Newren July 23, 2021, 12:54 p.m. UTC
From: Elijah Newren <newren@gmail.com>

Exact rename detection, via insert_file_table(), uses a hashmap to store
files by oid.  Use a mem_pool for the hashmap entries so these can all be
allocated and deallocated together.

For the testcases mentioned in commit 557ac0350d ("merge-ort: begin
performance work; instrument with trace2_region_* calls", 2020-10-28),
this change improves the performance as follows:

                            Before                  After
    no-renames:      204.2  ms ±  3.0  ms   202.5  ms ±  3.2  ms
    mega-renames:      1.076 s ±  0.015 s     1.072 s ±  0.012 s
    just-one-mega:   364.1  ms ±  7.0  ms   357.3  ms ±  3.9  ms

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 diffcore-rename.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

Comments

Eric Sunshine July 23, 2021, 9:59 p.m. UTC | #1
On Fri, Jul 23, 2021 at 8:55 AM Elijah Newren via GitGitGadget
<gitgitgadget@gmail.com> wrote:
> Exact rename detection, via insert_file_table(), uses a hashmap to store
> files by oid.  Use a mem_pool for the hashmap entries so these can all be
> allocated and deallocated together.
> [...]
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
> diff --git a/diffcore-rename.c b/diffcore-rename.c
> @@ -355,7 +357,7 @@ static int find_exact_renames(struct diff_options *options)
>         /* Free the hash data structure and entries */
> -       hashmap_clear_and_free(&file_table, struct file_similarity, entry);
> +       hashmap_clear(&file_table);

Does the in-code comment become a bit out of date with this change?
(It might make sense to drop the comment altogether -- or, if not,
explain that the hashmap entries get thrown away later with the pool?)

Not necessarily worth a re-roll.
Elijah Newren July 23, 2021, 10:03 p.m. UTC | #2
On Fri, Jul 23, 2021 at 2:59 PM Eric Sunshine <sunshine@sunshineco.com> wrote:
>
> On Fri, Jul 23, 2021 at 8:55 AM Elijah Newren via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
> > Exact rename detection, via insert_file_table(), uses a hashmap to store
> > files by oid.  Use a mem_pool for the hashmap entries so these can all be
> > allocated and deallocated together.
> > [...]
> > Signed-off-by: Elijah Newren <newren@gmail.com>
> > ---
> > diff --git a/diffcore-rename.c b/diffcore-rename.c
> > @@ -355,7 +357,7 @@ static int find_exact_renames(struct diff_options *options)
> >         /* Free the hash data structure and entries */
> > -       hashmap_clear_and_free(&file_table, struct file_similarity, entry);
> > +       hashmap_clear(&file_table);
>
> Does the in-code comment become a bit out of date with this change?
> (It might make sense to drop the comment altogether -- or, if not,
> explain that the hashmap entries get thrown away later with the pool?)

Ah, good catch.  Yeah, I should update it or drop the comment.

> Not necessarily worth a re-roll.

I'm sure someone will comment on something else, so I'll just include
this among the fixes.
diff mbox series

Patch

diff --git a/diffcore-rename.c b/diffcore-rename.c
index 4ef0459cfb5..23b917eca42 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -317,10 +317,11 @@  static int find_identical_files(struct hashmap *srcs,
 }
 
 static void insert_file_table(struct repository *r,
+			      struct mem_pool *pool,
 			      struct hashmap *table, int index,
 			      struct diff_filespec *filespec)
 {
-	struct file_similarity *entry = xmalloc(sizeof(*entry));
+	struct file_similarity *entry = mem_pool_alloc(pool, sizeof(*entry));
 
 	entry->index = index;
 	entry->filespec = filespec;
@@ -336,7 +337,8 @@  static void insert_file_table(struct repository *r,
  * and then during the second round we try to match
  * cache-dirty entries as well.
  */
-static int find_exact_renames(struct diff_options *options)
+static int find_exact_renames(struct diff_options *options,
+			      struct mem_pool *pool)
 {
 	int i, renames = 0;
 	struct hashmap file_table;
@@ -346,7 +348,7 @@  static int find_exact_renames(struct diff_options *options)
 	 */
 	hashmap_init(&file_table, NULL, NULL, rename_src_nr);
 	for (i = rename_src_nr-1; i >= 0; i--)
-		insert_file_table(options->repo,
+		insert_file_table(options->repo, pool,
 				  &file_table, i,
 				  rename_src[i].p->one);
 
@@ -355,7 +357,7 @@  static int find_exact_renames(struct diff_options *options)
 		renames += find_identical_files(&file_table, i, options);
 
 	/* Free the hash data structure and entries */
-	hashmap_clear_and_free(&file_table, struct file_similarity, entry);
+	hashmap_clear(&file_table);
 
 	return renames;
 }
@@ -1341,6 +1343,7 @@  void diffcore_rename_extended(struct diff_options *options,
 	int num_destinations, dst_cnt;
 	int num_sources, want_copies;
 	struct progress *progress = NULL;
+	struct mem_pool local_pool;
 	struct dir_rename_info info;
 	struct diff_populate_filespec_options dpf_options = {
 		.check_binary = 0,
@@ -1409,11 +1412,18 @@  void diffcore_rename_extended(struct diff_options *options,
 		goto cleanup; /* nothing to do */
 
 	trace2_region_enter("diff", "exact renames", options->repo);
+	mem_pool_init(&local_pool, 32*1024);
 	/*
 	 * We really want to cull the candidates list early
 	 * with cheap tests in order to avoid doing deltas.
 	 */
-	rename_count = find_exact_renames(options);
+	rename_count = find_exact_renames(options, &local_pool);
+	/*
+	 * Discard local_pool immediately instead of at "cleanup:" in order
+	 * to reduce maximum memory usage; inexact rename detection uses up
+	 * a fair amount of memory, and mem_pools can too.
+	 */
+	mem_pool_discard(&local_pool, 0);
 	trace2_region_leave("diff", "exact renames", options->repo);
 
 	/* Did we only want exact renames? */