[08/15] name-rev: pull out deref handling from the recursion
diff mbox series

Message ID 20190919214712.7348-9-szeder.dev@gmail.com
State New
Headers show
Series
  • name-rev: eliminate recursion
Related show

Commit Message

SZEDER Gábor Sept. 19, 2019, 9:47 p.m. UTC
The 'if (deref) { ... }' condition near the beginning of the recursive
name_rev() function can only ever be true in the first invocation,
because the 'deref' parameter is always 0 in the subsequent recursive
invocations.

Extract this condition from the recursion into name_rev()'s caller and
drop the function's 'deref' parameter.  This makes eliminating the
recursion a bit easier to follow, and it will be moved back into
name_rev() after the recursion is elminated.

Furthermore, drop the condition that die()s when both 'deref' and
'generation' are non-null (which should have been a BUG() to begin
with).

Note that this change reintroduces the memory leak that was plugged in
in commit 5308224633 (name-rev: avoid leaking memory in the `deref`
case, 2017-05-04), but a later patch in this series will plug it in
again.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
---
 builtin/name-rev.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

Comments

Derrick Stolee Sept. 20, 2019, 3:21 p.m. UTC | #1
On 9/19/2019 5:47 PM, SZEDER Gábor wrote:
> The 'if (deref) { ... }' condition near the beginning of the recursive
> name_rev() function can only ever be true in the first invocation,
> because the 'deref' parameter is always 0 in the subsequent recursive
> invocations.
> 
> Extract this condition from the recursion into name_rev()'s caller and
> drop the function's 'deref' parameter.  This makes eliminating the
> recursion a bit easier to follow, and it will be moved back into
> name_rev() after the recursion is elminated.
> 
> Furthermore, drop the condition that die()s when both 'deref' and
> 'generation' are non-null (which should have been a BUG() to begin
> with).

These changes seem sensible. I look forward to seeing how deref is
reintroduced.

> Note that this change reintroduces the memory leak that was plugged in
> in commit 5308224633 (name-rev: avoid leaking memory in the `deref`
> case, 2017-05-04), but a later patch in this series will plug it in
> again.

The memory leak is now for "tip_name" correct? Just tracking to make
sure it gets plugged later.

> Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
> ---
>  builtin/name-rev.c | 27 ++++++++++-----------------
>  1 file changed, 10 insertions(+), 17 deletions(-)
> 
> diff --git a/builtin/name-rev.c b/builtin/name-rev.c
> index cb8ac2fa64..42cea5c881 100644
> --- a/builtin/name-rev.c
> +++ b/builtin/name-rev.c
> @@ -102,30 +102,19 @@ static struct rev_name *create_or_update_name(struct commit *commit,
>  
>  static void name_rev(struct commit *commit,
>  		const char *tip_name, timestamp_t taggerdate,
> -		int generation, int distance, int from_tag,
> -		int deref)
> +		int generation, int distance, int from_tag)
>  {
>  	struct commit_list *parents;
>  	int parent_number = 1;
> -	char *to_free = NULL;
>  
>  	parse_commit(commit);
>  
>  	if (commit->date < cutoff)
>  		return;
>  
> -	if (deref) {
> -		tip_name = to_free = xstrfmt("%s^0", tip_name);
> -
> -		if (generation)
> -			die("generation: %d, but deref?", generation);
> -	}
> -
>  	if (!create_or_update_name(commit, tip_name, taggerdate, generation,
> -				   distance, from_tag)) {
> -		free(to_free);
> +				   distance, from_tag))
>  		return;
> -	}
>  
>  	for (parents = commit->parents;
>  			parents;
> @@ -144,11 +133,11 @@ static void name_rev(struct commit *commit,
>  
>  			name_rev(parents->item, new_name, taggerdate, 0,
>  				 distance + MERGE_TRAVERSAL_WEIGHT,
> -				 from_tag, 0);
> +				 from_tag);
>  		} else {
>  			name_rev(parents->item, tip_name, taggerdate,
>  				 generation + 1, distance + 1,
> -				 from_tag, 0);
> +				 from_tag);
>  		}
>  	}
>  }
> @@ -280,12 +269,16 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
>  	if (o && o->type == OBJ_COMMIT) {
>  		struct commit *commit = (struct commit *)o;
>  		int from_tag = starts_with(path, "refs/tags/");
> +		const char *tip_name;
>  
>  		if (taggerdate == TIME_MAX)
>  			taggerdate = commit->date;
>  		path = name_ref_abbrev(path, can_abbreviate_output);
> -		name_rev(commit, xstrdup(path), taggerdate, 0, 0,
> -			 from_tag, deref);
> +		if (deref)
> +			tip_name = xstrfmt("%s^0", path);
> +		else
> +			tip_name = xstrdup(path);

(leak above, as noted in message).

> +		name_rev(commit, tip_name, taggerdate, 0, 0, from_tag);
>  	}
>  	return 0;
>  }
>
René Scharfe Sept. 20, 2019, 4:37 p.m. UTC | #2
Am 19.09.19 um 23:47 schrieb SZEDER Gábor:
> The 'if (deref) { ... }' condition near the beginning of the recursive
> name_rev() function can only ever be true in the first invocation,
> because the 'deref' parameter is always 0 in the subsequent recursive
> invocations.
>
> Extract this condition from the recursion into name_rev()'s caller and
> drop the function's 'deref' parameter.  This makes eliminating the
> recursion a bit easier to follow, and it will be moved back into
> name_rev() after the recursion is elminated.
>
> Furthermore, drop the condition that die()s when both 'deref' and
> 'generation' are non-null (which should have been a BUG() to begin
> with).
>
> Note that this change reintroduces the memory leak that was plugged in
> in commit 5308224633 (name-rev: avoid leaking memory in the `deref`
> case, 2017-05-04), but a later patch in this series will plug it in
> again.
>
> Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
> ---
>  builtin/name-rev.c | 27 ++++++++++-----------------
>  1 file changed, 10 insertions(+), 17 deletions(-)
>
> diff --git a/builtin/name-rev.c b/builtin/name-rev.c
> index cb8ac2fa64..42cea5c881 100644
> --- a/builtin/name-rev.c
> +++ b/builtin/name-rev.c
> @@ -102,30 +102,19 @@ static struct rev_name *create_or_update_name(struct commit *commit,
>
>  static void name_rev(struct commit *commit,
>  		const char *tip_name, timestamp_t taggerdate,
> -		int generation, int distance, int from_tag,
> -		int deref)
> +		int generation, int distance, int from_tag)
>  {
>  	struct commit_list *parents;
>  	int parent_number = 1;
> -	char *to_free = NULL;
>
>  	parse_commit(commit);
>
>  	if (commit->date < cutoff)
>  		return;
>
> -	if (deref) {
> -		tip_name = to_free = xstrfmt("%s^0", tip_name);
> -
> -		if (generation)
> -			die("generation: %d, but deref?", generation);
> -	}
> -
>  	if (!create_or_update_name(commit, tip_name, taggerdate, generation,
> -				   distance, from_tag)) {
> -		free(to_free);
> +				   distance, from_tag))
>  		return;
> -	}
>
>  	for (parents = commit->parents;
>  			parents;
> @@ -144,11 +133,11 @@ static void name_rev(struct commit *commit,
>
>  			name_rev(parents->item, new_name, taggerdate, 0,
>  				 distance + MERGE_TRAVERSAL_WEIGHT,
> -				 from_tag, 0);
> +				 from_tag);
>  		} else {
>  			name_rev(parents->item, tip_name, taggerdate,
>  				 generation + 1, distance + 1,
> -				 from_tag, 0);
> +				 from_tag);
>  		}
>  	}
>  }
> @@ -280,12 +269,16 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
>  	if (o && o->type == OBJ_COMMIT) {
>  		struct commit *commit = (struct commit *)o;
>  		int from_tag = starts_with(path, "refs/tags/");
> +		const char *tip_name;

This should not be const because you allocate the buffer it points to
right here in the function, in each execution path.

>
>  		if (taggerdate == TIME_MAX)
>  			taggerdate = commit->date;
>  		path = name_ref_abbrev(path, can_abbreviate_output);
> -		name_rev(commit, xstrdup(path), taggerdate, 0, 0,
> -			 from_tag, deref);
> +		if (deref)
> +			tip_name = xstrfmt("%s^0", path);
> +		else
> +			tip_name = xstrdup(path);
> +		name_rev(commit, tip_name, taggerdate, 0, 0, from_tag);

tip_name should be free(3)'d here.  Except we can't do that because
name_rev() sometimes stores that pointer in a commit slab.  Ugh.

If the (re)introduced leak doesn't impact performance and memory
usage too much then duplicating tip_name again in name_rev() or
rather your new create_or_update_name() would likely make the
lifetimes of those string buffers easier to manage.

>  	}
>  	return 0;
>  }
>
SZEDER Gábor Sept. 20, 2019, 5:42 p.m. UTC | #3
On Fri, Sep 20, 2019 at 11:21:53AM -0400, Derrick Stolee wrote:
> On 9/19/2019 5:47 PM, SZEDER Gábor wrote:
> > The 'if (deref) { ... }' condition near the beginning of the recursive
> > name_rev() function can only ever be true in the first invocation,
> > because the 'deref' parameter is always 0 in the subsequent recursive
> > invocations.
> > 
> > Extract this condition from the recursion into name_rev()'s caller and
> > drop the function's 'deref' parameter.  This makes eliminating the
> > recursion a bit easier to follow, and it will be moved back into
> > name_rev() after the recursion is elminated.

s/elminated/eliminated/

> > Furthermore, drop the condition that die()s when both 'deref' and
> > 'generation' are non-null (which should have been a BUG() to begin
> > with).
> 
> These changes seem sensible. I look forward to seeing how deref is
> reintroduced.
> 
> > Note that this change reintroduces the memory leak that was plugged in
> > in commit 5308224633 (name-rev: avoid leaking memory in the `deref`
> > case, 2017-05-04), but a later patch in this series will plug it in
> > again.
> 
> The memory leak is now for "tip_name" correct? Just tracking to make
> sure it gets plugged later.

Yes, it's 'tip_name' (the one returned by xstrfmt()).

> > -	if (deref) {
> > -		tip_name = to_free = xstrfmt("%s^0", tip_name);

> > +		if (deref)
> > +			tip_name = xstrfmt("%s^0", path);
> > +		else
> > +			tip_name = xstrdup(path);
SZEDER Gábor Sept. 20, 2019, 6:13 p.m. UTC | #4
> > @@ -280,12 +269,16 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
> >  	if (o && o->type == OBJ_COMMIT) {
> >  		struct commit *commit = (struct commit *)o;
> >  		int from_tag = starts_with(path, "refs/tags/");
> > +		const char *tip_name;
> 
> This should not be const because you allocate the buffer it points to
> right here in the function, in each execution path.

Marking it as const indicates that this function doesn't modify the
buffer where the pointer points at.

> >
> >  		if (taggerdate == TIME_MAX)
> >  			taggerdate = commit->date;
> >  		path = name_ref_abbrev(path, can_abbreviate_output);
> > -		name_rev(commit, xstrdup(path), taggerdate, 0, 0,
> > -			 from_tag, deref);
> > +		if (deref)
> > +			tip_name = xstrfmt("%s^0", path);
> > +		else
> > +			tip_name = xstrdup(path);
> > +		name_rev(commit, tip_name, taggerdate, 0, 0, from_tag);
> 
> tip_name should be free(3)'d here.  Except we can't do that because
> name_rev() sometimes stores that pointer in a commit slab.  Ugh.
> 
> If the (re)introduced leak doesn't impact performance and memory
> usage too much then duplicating tip_name again in name_rev() or
> rather your new create_or_update_name() would likely make the
> lifetimes of those string buffers easier to manage.

Yeah, the easiest would be when each 'struct rev_name' in the commit
slab would have its own 'tip_name' string, but that would result in
a lot of duplicated strings and increased memory usage.
SZEDER Gábor Sept. 20, 2019, 6:14 p.m. UTC | #5
On Fri, Sep 20, 2019 at 08:13:02PM +0200, SZEDER Gábor wrote:
> > If the (re)introduced leak doesn't impact performance and memory
> > usage too much then duplicating tip_name again in name_rev() or
> > rather your new create_or_update_name() would likely make the
> > lifetimes of those string buffers easier to manage.
> 
> Yeah, the easiest would be when each 'struct rev_name' in the commit
> slab would have its own 'tip_name' string, but that would result in
> a lot of duplicated strings and increased memory usage.

I didn't measure how much more memory would be used, though.
SZEDER Gábor Sept. 21, 2019, 9:57 a.m. UTC | #6
On Fri, Sep 20, 2019 at 08:14:07PM +0200, SZEDER Gábor wrote:
> On Fri, Sep 20, 2019 at 08:13:02PM +0200, SZEDER Gábor wrote:
> > > If the (re)introduced leak doesn't impact performance and memory
> > > usage too much then duplicating tip_name again in name_rev() or
> > > rather your new create_or_update_name() would likely make the
> > > lifetimes of those string buffers easier to manage.
> > 
> > Yeah, the easiest would be when each 'struct rev_name' in the commit
> > slab would have its own 'tip_name' string, but that would result in
> > a lot of duplicated strings and increased memory usage.
> 
> I didn't measure how much more memory would be used, though.

So, I tried the patch below to give each 'struct rev_name' instance
its own copy of 'tip_name', and the memory usage of 'git name-rev
--all' usually increased.

The increase depends on how many merges and how many refs there are
compared to the number of commits: the fewer merges and refs, the
higher the more the memory usage increased:

  linux:         +4.8%
  gcc:           +7.2% 
  gecko-dev:     +9.2%
  webkit:       +12.4%
  llvm-project: +19.0%

git.git is the exception with its unusually high number of merge
commits (about 25%), and the memory usage decresed by 4.4%.


 --- >8 ---

diff --git a/builtin/name-rev.c b/builtin/name-rev.c
index 6969af76c4..62ab78242b 100644
--- a/builtin/name-rev.c
+++ b/builtin/name-rev.c
@@ -88,6 +88,7 @@ static struct rev_name *create_or_update_name(struct commit *commit,
 		set_commit_rev_name(commit, name);
 		goto copy_data;
 	} else if (is_better_name(name, taggerdate, distance, from_tag)) {
+		free((char*) name->tip_name);
 copy_data:
 		name->tip_name = tip_name;
 		name->taggerdate = taggerdate;
@@ -106,21 +107,19 @@ static void name_rev(struct commit *start_commit,
 {
 	struct commit_list *list = NULL;
 	const char *tip_name;
-	char *to_free = NULL;
 
 	parse_commit(start_commit);
 	if (start_commit->date < cutoff)
 		return;
 
 	if (deref) {
-		tip_name = to_free = xstrfmt("%s^0", start_tip_name);
-		free((char*) start_tip_name);
+		tip_name = xstrfmt("%s^0", start_tip_name);
 	} else
-		tip_name = start_tip_name;
+		tip_name = strdup(start_tip_name);
 
 	if (!create_or_update_name(start_commit, tip_name, taggerdate, 0, 0,
 				   from_tag)) {
-		free(to_free);
+		free((char*) tip_name);
 		return;
 	}
 
@@ -139,7 +138,6 @@ static void name_rev(struct commit *start_commit,
 			struct commit *parent = parents->item;
 			const char *new_name;
 			int generation, distance;
-			const char *new_name_to_free = NULL;
 
 			parse_commit(parent);
 			if (parent->date < cutoff)
@@ -159,11 +157,10 @@ static void name_rev(struct commit *start_commit,
 					new_name = xstrfmt("%.*s^%d", (int)len,
 							   name->tip_name,
 							   parent_number);
-				new_name_to_free = new_name;
 				generation = 0;
 				distance = name->distance + MERGE_TRAVERSAL_WEIGHT;
 			} else {
-				new_name = name->tip_name;
+				new_name = strdup(name->tip_name);
 				generation = name->generation + 1;
 				distance = name->distance + 1;
 			}
@@ -174,7 +171,7 @@ static void name_rev(struct commit *start_commit,
 				last_new_parent = commit_list_append(parent,
 						  last_new_parent);
 			else
-				free((char*) new_name_to_free);
+				free((char*) new_name);
 		}
 
 		*last_new_parent = list;
@@ -313,7 +310,7 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
 		if (taggerdate == TIME_MAX)
 			taggerdate = commit->date;
 		path = name_ref_abbrev(path, can_abbreviate_output);
-		name_rev(commit, xstrdup(path), taggerdate, from_tag, deref);
+		name_rev(commit, path, taggerdate, from_tag, deref);
 	}
 	return 0;
 }
René Scharfe Sept. 21, 2019, 12:37 p.m. UTC | #7
Am 20.09.19 um 20:13 schrieb SZEDER Gábor:
>>> @@ -280,12 +269,16 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
>>>  	if (o && o->type == OBJ_COMMIT) {
>>>  		struct commit *commit = (struct commit *)o;
>>>  		int from_tag = starts_with(path, "refs/tags/");
>>> +		const char *tip_name;
>>
>> This should not be const because you allocate the buffer it points to
>> right here in the function, in each execution path.
>
> Marking it as const indicates that this function doesn't modify the
> buffer where the pointer points at.

Right, and that's at odds with this code:

>>> +		if (deref)
>>> +			tip_name = xstrfmt("%s^0", path);
>>> +		else
>>> +			tip_name = xstrdup(path);

... which allocates said memory and writes a string to it.

René
René Scharfe Sept. 21, 2019, 12:37 p.m. UTC | #8
Am 21.09.19 um 11:57 schrieb SZEDER Gábor:
> On Fri, Sep 20, 2019 at 08:14:07PM +0200, SZEDER Gábor wrote:
>> On Fri, Sep 20, 2019 at 08:13:02PM +0200, SZEDER Gábor wrote:
>>>> If the (re)introduced leak doesn't impact performance and memory
>>>> usage too much then duplicating tip_name again in name_rev() or
>>>> rather your new create_or_update_name() would likely make the
>>>> lifetimes of those string buffers easier to manage.
>>>
>>> Yeah, the easiest would be when each 'struct rev_name' in the commit
>>> slab would have its own 'tip_name' string, but that would result in
>>> a lot of duplicated strings and increased memory usage.
>>
>> I didn't measure how much more memory would be used, though.
>
> So, I tried the patch below to give each 'struct rev_name' instance
> its own copy of 'tip_name', and the memory usage of 'git name-rev
> --all' usually increased.
>
> The increase depends on how many merges and how many refs there are
> compared to the number of commits: the fewer merges and refs, the
> higher the more the memory usage increased:
>
>   linux:         +4.8%
>   gcc:           +7.2%
>   gecko-dev:     +9.2%
>   webkit:       +12.4%
>   llvm-project: +19.0%

Is that the overall memory usage or just for struct rev_name instances
and tip_name strings?  And how much is that in absolute terms?  (Perhaps
it's worth it to get the memory ownership question off the table at
least during the transformation to iterative processing.)

> git.git is the exception with its unusually high number of merge
> commits (about 25%), and the memory usage decresed by 4.4%.

Interesting.

I wonder why regular commits even need a struct name_rev.  Shouldn't
only tips and roots need ones?  And perhaps merges and occasional
regular "checkpoint" commits, to avoid too many duplicate traversals.

That's not exactly on-topic, though, and I didn't think all that
deeply about it, but perhaps switching to a different marking
strategy could get rid of recursion as a side-effect?  *waves hands
vaguely*

>
>
>  --- >8 ---
>
> diff --git a/builtin/name-rev.c b/builtin/name-rev.c
> index 6969af76c4..62ab78242b 100644
> --- a/builtin/name-rev.c
> +++ b/builtin/name-rev.c
> @@ -88,6 +88,7 @@ static struct rev_name *create_or_update_name(struct commit *commit,
>  		set_commit_rev_name(commit, name);
>  		goto copy_data;
>  	} else if (is_better_name(name, taggerdate, distance, from_tag)) {
> +		free((char*) name->tip_name);
>  copy_data:
>  		name->tip_name = tip_name;

I would have expected a xstrdup() call here.

>  		name->taggerdate = taggerdate;
> @@ -106,21 +107,19 @@ static void name_rev(struct commit *start_commit,
>  {
>  	struct commit_list *list = NULL;
>  	const char *tip_name;
> -	char *to_free = NULL;
>
>  	parse_commit(start_commit);
>  	if (start_commit->date < cutoff)
>  		return;
>
>  	if (deref) {
> -		tip_name = to_free = xstrfmt("%s^0", start_tip_name);
> -		free((char*) start_tip_name);
> +		tip_name = xstrfmt("%s^0", start_tip_name);
>  	} else
> -		tip_name = start_tip_name;
> +		tip_name = strdup(start_tip_name);

This would not be needed with the central xstrdup() call mentioned above.

>
>  	if (!create_or_update_name(start_commit, tip_name, taggerdate, 0, 0,
>  				   from_tag)) {
> -		free(to_free);
> +		free((char*) tip_name);
>  		return;
>  	}
>
> @@ -139,7 +138,6 @@ static void name_rev(struct commit *start_commit,
>  			struct commit *parent = parents->item;
>  			const char *new_name;
>  			int generation, distance;
> -			const char *new_name_to_free = NULL;
>
>  			parse_commit(parent);
>  			if (parent->date < cutoff)
> @@ -159,11 +157,10 @@ static void name_rev(struct commit *start_commit,
>  					new_name = xstrfmt("%.*s^%d", (int)len,
>  							   name->tip_name,
>  							   parent_number);
> -				new_name_to_free = new_name;
>  				generation = 0;
>  				distance = name->distance + MERGE_TRAVERSAL_WEIGHT;
>  			} else {
> -				new_name = name->tip_name;
> +				new_name = strdup(name->tip_name);

... and neither would this.

Sure the xstrfmt() result would be duplicated instead of being reused, but
that doesn't increase memory usage overall.

>  				generation = name->generation + 1;
>  				distance = name->distance + 1;
>  			}
> @@ -174,7 +171,7 @@ static void name_rev(struct commit *start_commit,
>  				last_new_parent = commit_list_append(parent,
>  						  last_new_parent);
>  			else
> -				free((char*) new_name_to_free);
> +				free((char*) new_name);
>  		}
>
>  		*last_new_parent = list;
> @@ -313,7 +310,7 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
>  		if (taggerdate == TIME_MAX)
>  			taggerdate = commit->date;
>  		path = name_ref_abbrev(path, can_abbreviate_output);
> -		name_rev(commit, xstrdup(path), taggerdate, from_tag, deref);
> +		name_rev(commit, path, taggerdate, from_tag, deref);
>  	}
>  	return 0;
>  }
>
SZEDER Gábor Sept. 21, 2019, 2:21 p.m. UTC | #9
On Sat, Sep 21, 2019 at 02:37:05PM +0200, René Scharfe wrote:
> Am 20.09.19 um 20:13 schrieb SZEDER Gábor:
> >>> @@ -280,12 +269,16 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
> >>>  	if (o && o->type == OBJ_COMMIT) {
> >>>  		struct commit *commit = (struct commit *)o;
> >>>  		int from_tag = starts_with(path, "refs/tags/");
> >>> +		const char *tip_name;
> >>
> >> This should not be const because you allocate the buffer it points to
> >> right here in the function, in each execution path.
> >
> > Marking it as const indicates that this function doesn't modify the
> > buffer where the pointer points at.
> 
> Right, and that's at odds with this code:
> 
> >>> +		if (deref)
> >>> +			tip_name = xstrfmt("%s^0", path);
> >>> +		else
> >>> +			tip_name = xstrdup(path);
> 
> ... which allocates said memory and writes a string to it.

... before assigning it to the const pointer.
René Scharfe Sept. 21, 2019, 3:52 p.m. UTC | #10
Am 21.09.19 um 16:21 schrieb SZEDER Gábor:
> On Sat, Sep 21, 2019 at 02:37:05PM +0200, René Scharfe wrote:
>> Am 20.09.19 um 20:13 schrieb SZEDER Gábor:
>>>>> @@ -280,12 +269,16 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
>>>>>  	if (o && o->type == OBJ_COMMIT) {
>>>>>  		struct commit *commit = (struct commit *)o;
>>>>>  		int from_tag = starts_with(path, "refs/tags/");
>>>>> +		const char *tip_name;
>>>>
>>>> This should not be const because you allocate the buffer it points to
>>>> right here in the function, in each execution path.
>>>
>>> Marking it as const indicates that this function doesn't modify the
>>> buffer where the pointer points at.
>>
>> Right, and that's at odds with this code:
>>
>>>>> +		if (deref)
>>>>> +			tip_name = xstrfmt("%s^0", path);
>>>>> +		else
>>>>> +			tip_name = xstrdup(path);
>>
>> ... which allocates said memory and writes a string to it.
>
> ... before assigning it to the const pointer.
>

Sure, you can cast anything to anything else, and slapping on a const
qualifier is even allowed to be done implicitly for pointers to objects
(but not for pointers to pointers).  Removing it later (e.g. for
free(3)) is a warning sign; such sites need to be checked manually, as
the compiler won't do it.

The declaration says we don't modify the buffer, but then we actually
create it, which is as big a modification as can be.  That's a bit
misleading.  Is protection against accidental updates worth the
misdirection, and where would they come from?  Usually code without
such tricks is easier to read and maintain.

René
SZEDER Gábor Sept. 22, 2019, 7:05 p.m. UTC | #11
On Sat, Sep 21, 2019 at 02:37:18PM +0200, René Scharfe wrote:
> Am 21.09.19 um 11:57 schrieb SZEDER Gábor:
> > On Fri, Sep 20, 2019 at 08:14:07PM +0200, SZEDER Gábor wrote:
> >> On Fri, Sep 20, 2019 at 08:13:02PM +0200, SZEDER Gábor wrote:
> >>>> If the (re)introduced leak doesn't impact performance and memory
> >>>> usage too much then duplicating tip_name again in name_rev() or
> >>>> rather your new create_or_update_name() would likely make the
> >>>> lifetimes of those string buffers easier to manage.
> >>>
> >>> Yeah, the easiest would be when each 'struct rev_name' in the commit
> >>> slab would have its own 'tip_name' string, but that would result in
> >>> a lot of duplicated strings and increased memory usage.
> >>
> >> I didn't measure how much more memory would be used, though.
> >
> > So, I tried the patch below to give each 'struct rev_name' instance
> > its own copy of 'tip_name', and the memory usage of 'git name-rev
> > --all' usually increased.
> >
> > The increase depends on how many merges and how many refs there are
> > compared to the number of commits: the fewer merges and refs, the
> > higher the more the memory usage increased:
> >
> >   linux:         +4.8%
> >   gcc:           +7.2%
> >   gecko-dev:     +9.2%
> >   webkit:       +12.4%
> >   llvm-project: +19.0%
> 
> Is that the overall memory usage or just for struct rev_name instances
> and tip_name strings?

It's overall memory usage, the avarage of five runs of:

  /usr/bin/time --format='%M' ~/src/git/git name-rev --all

> And how much is that in absolute terms?  

git:     29801 ->  28514
linux:  317018 -> 332218
gcc:    106462 -> 114140
gecko:  315448 -> 344486
webkit:  55847 ->  62780
llvm:   112867 -> 134384

> (Perhaps
> it's worth it to get the memory ownership question off the table at
> least during the transformation to iterative processing.)

I looked into it only because I got curious, but other than that I
will definitely play the "beyond the scope of this patch series" card
:)

> > git.git is the exception with its unusually high number of merge
> > commits (about 25%), and the memory usage decresed by 4.4%.
> 
> Interesting.
> 
> I wonder why regular commits even need a struct name_rev.  Shouldn't
> only tips and roots need ones?  And perhaps merges and occasional
> regular "checkpoint" commits, to avoid too many duplicate traversals.

The 'struct rev_name' holds all info that's necessary to determine the
commit's name.  It seems to be much simpler to just attach one to each
commit and then retrieve it from the commit slab when printing the
name of the commit than to come up with an algorithm where only a
sleect set of commits get a 'struct rev_name', including how to access
those when naming a commit that doesn't have one.

> That's not exactly on-topic, though, and I didn't think all that
> deeply about it, but perhaps switching to a different marking
> strategy could get rid of recursion as a side-effect?  *waves hands
> vaguely*

I suppose a topo-order-based history walk should be able to name all
commits in a single traversal, and, consequently, be faster.  However,
'git rev-list --all --topo-order' doesn't seem to be that much faster
than 'git name-rev --all', so it might not be worth the effort.

> >  --- >8 ---
> >
> > diff --git a/builtin/name-rev.c b/builtin/name-rev.c
> > index 6969af76c4..62ab78242b 100644
> > --- a/builtin/name-rev.c
> > +++ b/builtin/name-rev.c
> > @@ -88,6 +88,7 @@ static struct rev_name *create_or_update_name(struct commit *commit,
> >  		set_commit_rev_name(commit, name);
> >  		goto copy_data;
> >  	} else if (is_better_name(name, taggerdate, distance, from_tag)) {
> > +		free((char*) name->tip_name);
> >  copy_data:
> >  		name->tip_name = tip_name;
> 
> I would have expected a xstrdup() call here.

But then we'd needed to release the results of all those xstrfmt()
calls at the callsites of create_or_update_name(), so instead of those
strdup() calls that you deem unnecessary we would need additional
free() calls.

> >  		name->taggerdate = taggerdate;
> > @@ -106,21 +107,19 @@ static void name_rev(struct commit *start_commit,
> >  {
> >  	struct commit_list *list = NULL;
> >  	const char *tip_name;
> > -	char *to_free = NULL;
> >
> >  	parse_commit(start_commit);
> >  	if (start_commit->date < cutoff)
> >  		return;
> >
> >  	if (deref) {
> > -		tip_name = to_free = xstrfmt("%s^0", start_tip_name);
> > -		free((char*) start_tip_name);
> > +		tip_name = xstrfmt("%s^0", start_tip_name);
> >  	} else
> > -		tip_name = start_tip_name;
> > +		tip_name = strdup(start_tip_name);
> 
> This would not be needed with the central xstrdup() call mentioned above.
> 
> >
> >  	if (!create_or_update_name(start_commit, tip_name, taggerdate, 0, 0,
> >  				   from_tag)) {
> > -		free(to_free);
> > +		free((char*) tip_name);
> >  		return;
> >  	}
> >
> > @@ -139,7 +138,6 @@ static void name_rev(struct commit *start_commit,
> >  			struct commit *parent = parents->item;
> >  			const char *new_name;
> >  			int generation, distance;
> > -			const char *new_name_to_free = NULL;
> >
> >  			parse_commit(parent);
> >  			if (parent->date < cutoff)
> > @@ -159,11 +157,10 @@ static void name_rev(struct commit *start_commit,
> >  					new_name = xstrfmt("%.*s^%d", (int)len,
> >  							   name->tip_name,
> >  							   parent_number);
> > -				new_name_to_free = new_name;
> >  				generation = 0;
> >  				distance = name->distance + MERGE_TRAVERSAL_WEIGHT;
> >  			} else {
> > -				new_name = name->tip_name;
> > +				new_name = strdup(name->tip_name);
> 
> ... and neither would this.
> 
> Sure the xstrfmt() result would be duplicated instead of being reused, but
> that doesn't increase memory usage overall.
> 
> >  				generation = name->generation + 1;
> >  				distance = name->distance + 1;
> >  			}
> > @@ -174,7 +171,7 @@ static void name_rev(struct commit *start_commit,
> >  				last_new_parent = commit_list_append(parent,
> >  						  last_new_parent);
> >  			else
> > -				free((char*) new_name_to_free);
> > +				free((char*) new_name);
> >  		}
> >
> >  		*last_new_parent = list;
> > @@ -313,7 +310,7 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
> >  		if (taggerdate == TIME_MAX)
> >  			taggerdate = commit->date;
> >  		path = name_ref_abbrev(path, can_abbreviate_output);
> > -		name_rev(commit, xstrdup(path), taggerdate, from_tag, deref);
> > +		name_rev(commit, path, taggerdate, from_tag, deref);
> >  	}
> >  	return 0;
> >  }
> >
René Scharfe Sept. 23, 2019, 6:43 p.m. UTC | #12
Am 22.09.19 um 21:05 schrieb SZEDER Gábor:
> On Sat, Sep 21, 2019 at 02:37:18PM +0200, René Scharfe wrote:
>> Am 21.09.19 um 11:57 schrieb SZEDER Gábor:
>>> On Fri, Sep 20, 2019 at 08:14:07PM +0200, SZEDER Gábor wrote:
>>>> On Fri, Sep 20, 2019 at 08:13:02PM +0200, SZEDER Gábor wrote:
>>>>>> If the (re)introduced leak doesn't impact performance and memory
>>>>>> usage too much then duplicating tip_name again in name_rev() or
>>>>>> rather your new create_or_update_name() would likely make the
>>>>>> lifetimes of those string buffers easier to manage.
>>>>>
>>>>> Yeah, the easiest would be when each 'struct rev_name' in the commit
>>>>> slab would have its own 'tip_name' string, but that would result in
>>>>> a lot of duplicated strings and increased memory usage.
>>>>
>>>> I didn't measure how much more memory would be used, though.
>>>
>>> So, I tried the patch below to give each 'struct rev_name' instance
>>> its own copy of 'tip_name', and the memory usage of 'git name-rev
>>> --all' usually increased.
>>>
>>> The increase depends on how many merges and how many refs there are
>>> compared to the number of commits: the fewer merges and refs, the
>>> higher the more the memory usage increased:
>>>
>>>   linux:         +4.8%
>>>   gcc:           +7.2%
>>>   gecko-dev:     +9.2%
>>>   webkit:       +12.4%
>>>   llvm-project: +19.0%
>>
>> Is that the overall memory usage or just for struct rev_name instances
>> and tip_name strings?
>
> It's overall memory usage, the avarage of five runs of:
>
>   /usr/bin/time --format='%M' ~/src/git/git name-rev --all
>
>> And how much is that in absolute terms?
>
> git:     29801 ->  28514
> linux:  317018 -> 332218
> gcc:    106462 -> 114140
> gecko:  315448 -> 344486
> webkit:  55847 ->  62780
> llvm:   112867 -> 134384

I only have the first two handy, and I get numbers like this with
master:

git, lots of branches with long names: 3075476
git, local clone, single branch:       1349016
linux, single branch:                  1520468

O_o

>> (Perhaps
>> it's worth it to get the memory ownership question off the table at
>> least during the transformation to iterative processing.)
>
> I looked into it only because I got curious, but other than that I
> will definitely play the "beyond the scope of this patch series" card
> :)

Fair enough.

>> I wonder why regular commits even need a struct name_rev.  Shouldn't
>> only tips and roots need ones?  And perhaps merges and occasional
>> regular "checkpoint" commits, to avoid too many duplicate traversals.
>
> The 'struct rev_name' holds all info that's necessary to determine the
> commit's name.  It seems to be much simpler to just attach one to each
> commit and then retrieve it from the commit slab when printing the
> name of the commit than to come up with an algorithm where only a
> sleect set of commits get a 'struct rev_name', including how to access
> those when naming a commit that doesn't have one.

Sure, the lookup of individual commits is much easier once all commits
have name tags attached.  Preparing that sounds expensive, though.
It's a trade-off favoring looking up lots of names per program run.

>>>  --- >8 ---
>>>
>>> diff --git a/builtin/name-rev.c b/builtin/name-rev.c
>>> index 6969af76c4..62ab78242b 100644
>>> --- a/builtin/name-rev.c
>>> +++ b/builtin/name-rev.c
>>> @@ -88,6 +88,7 @@ static struct rev_name *create_or_update_name(struct commit *commit,
>>>  		set_commit_rev_name(commit, name);
>>>  		goto copy_data;
>>>  	} else if (is_better_name(name, taggerdate, distance, from_tag)) {
>>> +		free((char*) name->tip_name);
>>>  copy_data:
>>>  		name->tip_name = tip_name;
>>
>> I would have expected a xstrdup() call here.
>
> But then we'd needed to release the results of all those xstrfmt()
> calls at the callsites of create_or_update_name(), so instead of those
> strdup() calls that you deem unnecessary we would need additional
> free() calls.

Correct.  That would be simpler and shouldn't affect peak memory
usage.

René
SZEDER Gábor Sept. 23, 2019, 6:59 p.m. UTC | #13
On Mon, Sep 23, 2019 at 08:43:11PM +0200, René Scharfe wrote:
> > It's overall memory usage, the avarage of five runs of:
> >
> >   /usr/bin/time --format='%M' ~/src/git/git name-rev --all
> >
> >> And how much is that in absolute terms?
> >
> > git:     29801 ->  28514
> > linux:  317018 -> 332218
> > gcc:    106462 -> 114140
> > gecko:  315448 -> 344486
> > webkit:  55847 ->  62780
> > llvm:   112867 -> 134384
> 
> I only have the first two handy, and I get numbers like this with
> master:
> 
> git, lots of branches with long names: 3075476
> git, local clone, single branch:       1349016
> linux, single branch:                  1520468
> 
> O_o

I have commit graph present and enabled.  Without that I get approx.
the same memory usage in my linux repo as you did (along with much
longer runtime).

Will have to clarify this in the commit messages that talk about
runtime and memory usage.
René Scharfe Sept. 23, 2019, 7:55 p.m. UTC | #14
Am 23.09.19 um 20:59 schrieb SZEDER Gábor:
> On Mon, Sep 23, 2019 at 08:43:11PM +0200, René Scharfe wrote:
>>> It's overall memory usage, the avarage of five runs of:
>>>
>>>   /usr/bin/time --format='%M' ~/src/git/git name-rev --all
>>>
>>>> And how much is that in absolute terms?
>>>
>>> git:     29801 ->  28514
>>> linux:  317018 -> 332218
>>> gcc:    106462 -> 114140
>>> gecko:  315448 -> 344486
>>> webkit:  55847 ->  62780
>>> llvm:   112867 -> 134384
>>
>> I only have the first two handy, and I get numbers like this with
>> master:
>>
>> git, lots of branches with long names: 3075476
>> git, local clone, single branch:       1349016
>> linux, single branch:                  1520468
>>
>> O_o
>
> I have commit graph present and enabled.  Without that I get approx.
> the same memory usage in my linux repo as you did (along with much
> longer runtime).

OK.  Cloned git afresh and tried with master and without commit-graph
again, after "git commit-graph write" and both again with the patch
below:

git:                           109880
git w/ commit-graph:            47208
git w/ patch:                   94304
git w/ commit-graph and patch:  31220

Strange numbers, at least compared to my number for the clone above:
One order of magnitude less!  Not sure what to make of it.  (Tried
the clone again, same result.)

Anyway, here's the patch:

-- >8 --
Subject: [PATCH] name-rev: use FLEX_ARRAY for tip_name in struct rev_name

Give each rev_name its very own tip_name string.  This simplifies memory
ownership, as callers of name_rev() only have to make sure the tip_name
parameter exists for the duration of the call and don't have to preserve
it for the whole run of the program.

It also saves four or eight bytes per object because this change removes
the pointer indirection.  Memory usage is still higher for linear
histories that previously shared the same tip_name value between
multiple name_rev instances.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 builtin/name-rev.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/builtin/name-rev.c b/builtin/name-rev.c
index c785fe16ba..4162fb29ee 100644
--- a/builtin/name-rev.c
+++ b/builtin/name-rev.c
@@ -12,11 +12,11 @@
 #define CUTOFF_DATE_SLOP 86400 /* one day */

 typedef struct rev_name {
-	const char *tip_name;
 	timestamp_t taggerdate;
 	int generation;
 	int distance;
 	int from_tag;
+	char tip_name[FLEX_ARRAY];
 } rev_name;

 define_commit_slab(commit_rev_name, struct rev_name *);
@@ -97,17 +97,14 @@ static void name_rev(struct commit *commit,
 			die("generation: %d, but deref?", generation);
 	}

-	if (name == NULL) {
-		name = xmalloc(sizeof(rev_name));
-		set_commit_rev_name(commit, name);
-		goto copy_data;
-	} else if (is_better_name(name, taggerdate, distance, from_tag)) {
-copy_data:
-		name->tip_name = tip_name;
+	if (!name || is_better_name(name, taggerdate, distance, from_tag)) {
+		free(name);
+		FLEX_ALLOC_STR(name, tip_name, tip_name);
 		name->taggerdate = taggerdate;
 		name->generation = generation;
 		name->distance = distance;
 		name->from_tag = from_tag;
+		set_commit_rev_name(commit, name);
 	} else {
 		free(to_free);
 		return;
@@ -131,12 +128,14 @@ static void name_rev(struct commit *commit,
 			name_rev(parents->item, new_name, taggerdate, 0,
 				 distance + MERGE_TRAVERSAL_WEIGHT,
 				 from_tag, 0);
+			free(new_name);
 		} else {
 			name_rev(parents->item, tip_name, taggerdate,
 				 generation + 1, distance + 1,
 				 from_tag, 0);
 		}
 	}
+	free(to_free);
 }

 static int subpath_matches(const char *path, const char *filter)
@@ -270,8 +269,7 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
 		if (taggerdate == TIME_MAX)
 			taggerdate = ((struct commit *)o)->date;
 		path = name_ref_abbrev(path, can_abbreviate_output);
-		name_rev(commit, xstrdup(path), taggerdate, 0, 0,
-			 from_tag, deref);
+		name_rev(commit, path, taggerdate, 0, 0, from_tag, deref);
 	}
 	return 0;
 }
--
2.23.0
SZEDER Gábor Sept. 23, 2019, 8:47 p.m. UTC | #15
On Mon, Sep 23, 2019 at 09:55:11PM +0200, René Scharfe wrote:
> -- >8 --
> Subject: [PATCH] name-rev: use FLEX_ARRAY for tip_name in struct rev_name
> 
> Give each rev_name its very own tip_name string.  This simplifies memory
> ownership, as callers of name_rev() only have to make sure the tip_name
> parameter exists for the duration of the call and don't have to preserve
> it for the whole run of the program.
> 
> It also saves four or eight bytes per object because this change removes
> the pointer indirection.  Memory usage is still higher for linear
> histories that previously shared the same tip_name value between
> multiple name_rev instances.

Besides looking at memory usage, have you run any performance
benchmarks?  Here it seems to make 'git name-rev --all >out' slower by
17% in the git repo and by 19.5% in the linux repo.


> Signed-off-by: René Scharfe <l.s.r@web.de>
> ---
>  builtin/name-rev.c | 18 ++++++++----------
>  1 file changed, 8 insertions(+), 10 deletions(-)
> 
> diff --git a/builtin/name-rev.c b/builtin/name-rev.c
> index c785fe16ba..4162fb29ee 100644
> --- a/builtin/name-rev.c
> +++ b/builtin/name-rev.c
> @@ -12,11 +12,11 @@
>  #define CUTOFF_DATE_SLOP 86400 /* one day */
> 
>  typedef struct rev_name {
> -	const char *tip_name;
>  	timestamp_t taggerdate;
>  	int generation;
>  	int distance;
>  	int from_tag;
> +	char tip_name[FLEX_ARRAY];
>  } rev_name;
> 
>  define_commit_slab(commit_rev_name, struct rev_name *);
> @@ -97,17 +97,14 @@ static void name_rev(struct commit *commit,
>  			die("generation: %d, but deref?", generation);
>  	}
> 
> -	if (name == NULL) {
> -		name = xmalloc(sizeof(rev_name));
> -		set_commit_rev_name(commit, name);
> -		goto copy_data;
> -	} else if (is_better_name(name, taggerdate, distance, from_tag)) {
> -copy_data:
> -		name->tip_name = tip_name;
> +	if (!name || is_better_name(name, taggerdate, distance, from_tag)) {
> +		free(name);
> +		FLEX_ALLOC_STR(name, tip_name, tip_name);
>  		name->taggerdate = taggerdate;
>  		name->generation = generation;
>  		name->distance = distance;
>  		name->from_tag = from_tag;
> +		set_commit_rev_name(commit, name);
>  	} else {
>  		free(to_free);
>  		return;
> @@ -131,12 +128,14 @@ static void name_rev(struct commit *commit,
>  			name_rev(parents->item, new_name, taggerdate, 0,
>  				 distance + MERGE_TRAVERSAL_WEIGHT,
>  				 from_tag, 0);
> +			free(new_name);
>  		} else {
>  			name_rev(parents->item, tip_name, taggerdate,
>  				 generation + 1, distance + 1,
>  				 from_tag, 0);
>  		}
>  	}
> +	free(to_free);
>  }
> 
>  static int subpath_matches(const char *path, const char *filter)
> @@ -270,8 +269,7 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
>  		if (taggerdate == TIME_MAX)
>  			taggerdate = ((struct commit *)o)->date;
>  		path = name_ref_abbrev(path, can_abbreviate_output);
> -		name_rev(commit, xstrdup(path), taggerdate, 0, 0,
> -			 from_tag, deref);
> +		name_rev(commit, path, taggerdate, 0, 0, from_tag, deref);
>  	}
>  	return 0;
>  }
> --
> 2.23.0
René Scharfe Sept. 24, 2019, 5:03 p.m. UTC | #16
Am 23.09.19 um 22:47 schrieb SZEDER Gábor:
> On Mon, Sep 23, 2019 at 09:55:11PM +0200, René Scharfe wrote:
>> -- >8 --
>> Subject: [PATCH] name-rev: use FLEX_ARRAY for tip_name in struct rev_name
>>
>> Give each rev_name its very own tip_name string.  This simplifies memory
>> ownership, as callers of name_rev() only have to make sure the tip_name
>> parameter exists for the duration of the call and don't have to preserve
>> it for the whole run of the program.
>>
>> It also saves four or eight bytes per object because this change removes
>> the pointer indirection.  Memory usage is still higher for linear
>> histories that previously shared the same tip_name value between
>> multiple name_rev instances.
>
> Besides looking at memory usage, have you run any performance
> benchmarks?  Here it seems to make 'git name-rev --all >out' slower by
> 17% in the git repo and by 19.5% in the linux repo.

Did measure now; I also see a slowdown with my patch applied:

git:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     462.8 ms ±   2.8 ms    [User: 440.6 ms, System: 20.5 ms]
  Range (min … max):   459.6 ms … 466.5 ms    10 runs

git w/ commit-graph:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     104.0 ms ±   1.5 ms    [User: 93.7 ms, System: 10.0 ms]
  Range (min … max):   101.5 ms … 107.1 ms    28 runs

git w/ patch:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     475.1 ms ±   3.7 ms    [User: 458.3 ms, System: 16.0 ms]
  Range (min … max):   470.4 ms … 481.4 ms    10 runs

git w/ commit-graph and patch:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     110.9 ms ±   1.5 ms    [User: 106.6 ms, System: 4.1 ms]
  Range (min … max):   109.0 ms … 114.7 ms    26 runs


linux:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):      6.670 s ±  0.027 s    [User: 6.450 s, System: 0.208 s]
  Range (min … max):    6.640 s …  6.721 s    10 runs

linux w/ patch:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):      6.784 s ±  0.160 s    [User: 6.567 s, System: 0.214 s]
  Range (min … max):    6.638 s …  7.211 s    10 runs

linux w/ commit-graph:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     929.6 ms ±   5.3 ms    [User: 881.4 ms, System: 46.8 ms]
  Range (min … max):   924.1 ms … 939.5 ms    10 runs

linux w/ commit-graph and patch:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):      1.004 s ±  0.007 s    [User: 957.4 ms, System: 45.6 ms]
  Range (min … max):    0.997 s …  1.021 s    10 runs

We can reuse a strbuf instead of allocating new strings when adding
suffixes to get some of the performance loss back.  I guess it's easier
after the recursion is removed.  Numbers:

git w/ both patches:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     448.0 ms ±   2.4 ms    [User: 428.2 ms, System: 19.6 ms]
  Range (min … max):   445.3 ms … 453.4 ms    10 runs

git w/ commit-graph and both patches:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):      98.7 ms ±   1.6 ms    [User: 93.5 ms, System: 5.0 ms]
  Range (min … max):    96.7 ms … 102.8 ms    30 runs

linux w/ both patches:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):      6.727 s ±  0.063 s    [User: 6.486 s, System: 0.226 s]
  Range (min … max):    6.675 s …  6.872 s    10 runs

linux w/ commit-graph and both patches:
Benchmark #1: ~/src/git/git name-rev --all
  Time (mean ± σ):     988.8 ms ±   4.5 ms    [User: 937.5 ms, System: 49.2 ms]
  Range (min … max):   981.4 ms … 994.8 ms    10 runs


---
 builtin/name-rev.c | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/builtin/name-rev.c b/builtin/name-rev.c
index 4162fb29ee..7fee664574 100644
--- a/builtin/name-rev.c
+++ b/builtin/name-rev.c
@@ -75,15 +75,14 @@ static int is_better_name(struct rev_name *name,
 	return 0;
 }

-static void name_rev(struct commit *commit,
-		const char *tip_name, timestamp_t taggerdate,
+static void name_rev(struct commit *commit, struct strbuf *sb,
+		timestamp_t taggerdate,
 		int generation, int distance, int from_tag,
 		int deref)
 {
 	struct rev_name *name = get_commit_rev_name(commit);
 	struct commit_list *parents;
 	int parent_number = 1;
-	char *to_free = NULL;

 	parse_commit(commit);

@@ -91,7 +90,7 @@ static void name_rev(struct commit *commit,
 		return;

 	if (deref) {
-		tip_name = to_free = xstrfmt("%s^0", tip_name);
+		strbuf_addstr(sb, "^0");

 		if (generation)
 			die("generation: %d, but deref?", generation);
@@ -99,14 +98,13 @@ static void name_rev(struct commit *commit,

 	if (!name || is_better_name(name, taggerdate, distance, from_tag)) {
 		free(name);
-		FLEX_ALLOC_STR(name, tip_name, tip_name);
+		FLEX_ALLOC_MEM(name, tip_name, sb->buf, sb->len);
 		name->taggerdate = taggerdate;
 		name->generation = generation;
 		name->distance = distance;
 		name->from_tag = from_tag;
 		set_commit_rev_name(commit, name);
 	} else {
-		free(to_free);
 		return;
 	}

@@ -114,28 +112,26 @@ static void name_rev(struct commit *commit,
 			parents;
 			parents = parents->next, parent_number++) {
 		if (parent_number > 1) {
-			size_t len;
-			char *new_name;
-
-			strip_suffix(tip_name, "^0", &len);
+			int stripped = strbuf_strip_suffix(sb, "^0");
+			size_t base_len = sb->len;
 			if (generation > 0)
-				new_name = xstrfmt("%.*s~%d^%d", (int)len, tip_name,
-						   generation, parent_number);
-			else
-				new_name = xstrfmt("%.*s^%d", (int)len, tip_name,
-						   parent_number);
+				strbuf_addf(sb, "~%d", generation);
+			strbuf_addf(sb, "^%d", parent_number);

-			name_rev(parents->item, new_name, taggerdate, 0,
+			name_rev(parents->item, sb, taggerdate, 0,
 				 distance + MERGE_TRAVERSAL_WEIGHT,
 				 from_tag, 0);
-			free(new_name);
+			strbuf_setlen(sb, base_len);
+			if (stripped)
+				strbuf_addstr(sb, "^0");
 		} else {
-			name_rev(parents->item, tip_name, taggerdate,
+			size_t base_len = sb->len;
+			name_rev(parents->item, sb, taggerdate,
 				 generation + 1, distance + 1,
 				 from_tag, 0);
+			strbuf_setlen(sb, base_len);
 		}
 	}
-	free(to_free);
 }

 static int subpath_matches(const char *path, const char *filter)
@@ -200,6 +196,7 @@ static int tipcmp(const void *a_, const void *b_)

 static int name_ref(const char *path, const struct object_id *oid, int flags, void *cb_data)
 {
+	static struct strbuf sb = STRBUF_INIT;
 	struct object *o = parse_object(the_repository, oid);
 	struct name_ref_data *data = cb_data;
 	int can_abbreviate_output = data->tags_only && data->name_only;
@@ -269,7 +266,9 @@ static int name_ref(const char *path, const struct object_id *oid, int flags, vo
 		if (taggerdate == TIME_MAX)
 			taggerdate = ((struct commit *)o)->date;
 		path = name_ref_abbrev(path, can_abbreviate_output);
-		name_rev(commit, path, taggerdate, 0, 0, from_tag, deref);
+		strbuf_reset(&sb);
+		strbuf_addstr(&sb, path);
+		name_rev(commit, &sb, taggerdate, 0, 0, from_tag, deref);
 	}
 	return 0;
 }
--
2.23.0
SZEDER Gábor Sept. 26, 2019, 5:33 p.m. UTC | #17
On Tue, Sep 24, 2019 at 07:03:50PM +0200, René Scharfe wrote:
> Am 23.09.19 um 22:47 schrieb SZEDER Gábor:
> > On Mon, Sep 23, 2019 at 09:55:11PM +0200, René Scharfe wrote:
> >> -- >8 --
> >> Subject: [PATCH] name-rev: use FLEX_ARRAY for tip_name in struct rev_name
> >>
> >> Give each rev_name its very own tip_name string.  This simplifies memory
> >> ownership, as callers of name_rev() only have to make sure the tip_name
> >> parameter exists for the duration of the call and don't have to preserve
> >> it for the whole run of the program.
> >>
> >> It also saves four or eight bytes per object because this change removes
> >> the pointer indirection.  Memory usage is still higher for linear
> >> histories that previously shared the same tip_name value between
> >> multiple name_rev instances.
> >
> > Besides looking at memory usage, have you run any performance
> > benchmarks?  Here it seems to make 'git name-rev --all >out' slower by
> > 17% in the git repo and by 19.5% in the linux repo.
> 
> Did measure now; I also see a slowdown with my patch applied:

Thanks for confirming.

> We can reuse a strbuf instead of allocating new strings when adding
> suffixes to get some of the performance loss back.  I guess it's easier
> after the recursion is removed.  Numbers:

Agreed, the conflicts on first sight are too ugly to have these
changes in parallel cooking topics.  Furthermore, after the recursion
is gone we can measure the memory usage and performance impact of your
changes even in big linear repositories.

I think I will drop the last two patches plugging memory leaks from v2
of my series, because it seems your proposed changes do it cleaner and
make them moot anyway.

Patch
diff mbox series

diff --git a/builtin/name-rev.c b/builtin/name-rev.c
index cb8ac2fa64..42cea5c881 100644
--- a/builtin/name-rev.c
+++ b/builtin/name-rev.c
@@ -102,30 +102,19 @@  static struct rev_name *create_or_update_name(struct commit *commit,
 
 static void name_rev(struct commit *commit,
 		const char *tip_name, timestamp_t taggerdate,
-		int generation, int distance, int from_tag,
-		int deref)
+		int generation, int distance, int from_tag)
 {
 	struct commit_list *parents;
 	int parent_number = 1;
-	char *to_free = NULL;
 
 	parse_commit(commit);
 
 	if (commit->date < cutoff)
 		return;
 
-	if (deref) {
-		tip_name = to_free = xstrfmt("%s^0", tip_name);
-
-		if (generation)
-			die("generation: %d, but deref?", generation);
-	}
-
 	if (!create_or_update_name(commit, tip_name, taggerdate, generation,
-				   distance, from_tag)) {
-		free(to_free);
+				   distance, from_tag))
 		return;
-	}
 
 	for (parents = commit->parents;
 			parents;
@@ -144,11 +133,11 @@  static void name_rev(struct commit *commit,
 
 			name_rev(parents->item, new_name, taggerdate, 0,
 				 distance + MERGE_TRAVERSAL_WEIGHT,
-				 from_tag, 0);
+				 from_tag);
 		} else {
 			name_rev(parents->item, tip_name, taggerdate,
 				 generation + 1, distance + 1,
-				 from_tag, 0);
+				 from_tag);
 		}
 	}
 }
@@ -280,12 +269,16 @@  static int name_ref(const char *path, const struct object_id *oid, int flags, vo
 	if (o && o->type == OBJ_COMMIT) {
 		struct commit *commit = (struct commit *)o;
 		int from_tag = starts_with(path, "refs/tags/");
+		const char *tip_name;
 
 		if (taggerdate == TIME_MAX)
 			taggerdate = commit->date;
 		path = name_ref_abbrev(path, can_abbreviate_output);
-		name_rev(commit, xstrdup(path), taggerdate, 0, 0,
-			 from_tag, deref);
+		if (deref)
+			tip_name = xstrfmt("%s^0", path);
+		else
+			tip_name = xstrdup(path);
+		name_rev(commit, tip_name, taggerdate, 0, 0, from_tag);
 	}
 	return 0;
 }