diff mbox series

[3/6] index-pack: remove redundant child field

Message ID 39740c6e58bd6cb6ea33e5abb4ab8542ff6eb7b7.1570663470.git.jonathantanmy@google.com (mailing list archive)
State New, archived
Headers show
Series Better threaded delta resolution in index-pack | expand

Commit Message

Jonathan Tan Oct. 9, 2019, 11:44 p.m. UTC
Instead, recompute ancestry if we ever need to reclaim memory.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 builtin/index-pack.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

Comments

Derrick Stolee Oct. 10, 2019, 2:45 p.m. UTC | #1
On 10/9/2019 7:44 PM, Jonathan Tan wrote:
> Instead, recompute ancestry if we ever need to reclaim memory.

I find this message lacking in important details:

1. Where do we recompute ancestry?
2. What are the performance implications of this change?
3. Why is it important that you construct a stack of deltas in prune_base_data()?

> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---
>  builtin/index-pack.c | 41 ++++++++++++++++++++++-------------------
>  1 file changed, 22 insertions(+), 19 deletions(-)
> 
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 99f6e2957f..35f7f9e52b 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -34,7 +34,6 @@ struct object_stat {
>  
>  struct base_data {
>  	struct base_data *base;
> -	struct base_data *child;
>  	struct object_entry *obj;
>  	void *data;
>  	unsigned long size;
> @@ -44,7 +43,6 @@ struct base_data {
>  
>  struct thread_local {
>  	pthread_t thread;
> -	struct base_data *base_cache;
>  	size_t base_cache_used;
>  	int pack_fd;
>  };
> @@ -380,27 +378,37 @@ static void free_base_data(struct base_data *c)
>  	}
>  }
>  
> -static void prune_base_data(struct base_data *retain)
> +static void prune_base_data(struct base_data *youngest_child)
>  {
>  	struct base_data *b;
>  	struct thread_local *data = get_thread_data();
> -	for (b = data->base_cache;
> -	     data->base_cache_used > delta_base_cache_limit && b;
> -	     b = b->child) {
> -		if (b->data && b != retain)
> -			free_base_data(b);
> +	struct base_data **ancestry = NULL;
> +	int nr = 0, alloc = 0;
> +	int i;
> +
> +	if (data->base_cache_used <= delta_base_cache_limit)
> +		return;
> +
> +	/*
> +	 * Free all ancestors of youngest_child until we have enough space,
> +	 * starting with the oldest. (We cannot free youngest_child itself.)
> +	 */
> +	for (b = youngest_child->base; b != NULL; b = b->base) {
> +		ALLOC_GROW(ancestry, nr + 1, alloc);
> +		ancestry[nr++] = b;
> +	}
> +	for (i = nr - 1;
> +	     i >= 0 && data->base_cache_used > delta_base_cache_limit;
> +	     i--) {
> +		if (ancestry[i]->data)
> +			free_base_data(ancestry[i]);
>  	}
> +	free(ancestry);
>  }
>  
>  static void link_base_data(struct base_data *base, struct base_data *c)
>  {
> -	if (base)
> -		base->child = c;
> -	else
> -		get_thread_data()->base_cache = c;
> -
>  	c->base = base;
> -	c->child = NULL;
>  	if (c->data)
>  		get_thread_data()->base_cache_used += c->size;
>  	prune_base_data(c);
> @@ -408,11 +416,6 @@ static void link_base_data(struct base_data *base, struct base_data *c)
>  
>  static void unlink_base_data(struct base_data *c)
>  {
> -	struct base_data *base = c->base;
> -	if (base)
> -		base->child = NULL;
> -	else
> -		get_thread_data()->base_cache = NULL;
>  	free_base_data(c);
>  }

Seems like this method should be removed and all callers should
call free_base_data() instead.

-Stolee
Jonathan Tan Oct. 10, 2019, 7:02 p.m. UTC | #2
> On 10/9/2019 7:44 PM, Jonathan Tan wrote:
> > Instead, recompute ancestry if we ever need to reclaim memory.
> 
> I find this message lacking in important details:
> 
> 1. Where do we recompute ancestry?
> 2. What are the performance implications of this change?
> 3. Why is it important that you construct a stack of deltas in prune_base_data()?

Thanks for taking a look at this. My original plan (as I perhaps badly
explained in the cover letter [1]) was to show the individual small
steps that I took to reach the end goal, each step still passing all
tests, in the hope that small steps are easier to understand than one
big one. Hence why I didn't explain much in this commit message (and
others), because I thought that I might have to squash them later. But
perhaps that is too confusing and I should have just squashed them in
the first place (and explain all the changes in the commit message -
it's +177 -198, which is not too big anyway).

To answer the question anyway, the short answer is that it doesn't
matter because I'm going to replace this mechanism in later patches. But
a longer answer:

 1. In prune_base_delta() (the stack of deltas you mention in question
    3).
 2. Slightly fewer pointer management during the normal course of
    operation, but an allocation if we ever need to reclaim memory.
 3. To recompute the ancestry. We have ancestry using the "base"
    pointer, but I need to iterate from the oldest to newest, so I
    create an array of all the "base" pointers and iterate in reverse.

[1] https://public-inbox.org/git/cover.1570663470.git.jonathantanmy@google.com/

> >  static void link_base_data(struct base_data *base, struct base_data *c)
> >  {
> > -	if (base)
> > -		base->child = c;
> > -	else
> > -		get_thread_data()->base_cache = c;
> > -
> >  	c->base = base;
> > -	c->child = NULL;
> >  	if (c->data)
> >  		get_thread_data()->base_cache_used += c->size;
> >  	prune_base_data(c);
> > @@ -408,11 +416,6 @@ static void link_base_data(struct base_data *base, struct base_data *c)
> >  
> >  static void unlink_base_data(struct base_data *c)
> >  {
> > -	struct base_data *base = c->base;
> > -	if (base)
> > -		base->child = NULL;
> > -	else
> > -		get_thread_data()->base_cache = NULL;
> >  	free_base_data(c);
> >  }
> 
> Seems like this method should be removed and all callers should
> call free_base_data() instead.

I agree, and did it in the next patch. Here I left it to preserve the
{link,unlink}_base_data symmetry.
Jeff King Oct. 17, 2019, 6:24 a.m. UTC | #3
On Thu, Oct 10, 2019 at 12:02:29PM -0700, Jonathan Tan wrote:

> > On 10/9/2019 7:44 PM, Jonathan Tan wrote:
> > > Instead, recompute ancestry if we ever need to reclaim memory.
> > 
> > I find this message lacking in important details:
> > 
> > 1. Where do we recompute ancestry?
> > 2. What are the performance implications of this change?
> > 3. Why is it important that you construct a stack of deltas in prune_base_data()?
> 
> Thanks for taking a look at this. My original plan (as I perhaps badly
> explained in the cover letter [1]) was to show the individual small
> steps that I took to reach the end goal, each step still passing all
> tests, in the hope that small steps are easier to understand than one
> big one. Hence why I didn't explain much in this commit message (and
> others), because I thought that I might have to squash them later. But
> perhaps that is too confusing and I should have just squashed them in
> the first place (and explain all the changes in the commit message -
> it's +177 -198, which is not too big anyway).

FWIW, I like the breakdown. These are tricky cleanups, and seeing them
individually makes it easy to verify that they don't themselves break
anything. I think they just need more explanation.

-Peff
Jeff King Oct. 17, 2019, 6:26 a.m. UTC | #4
On Wed, Oct 09, 2019 at 04:44:19PM -0700, Jonathan Tan wrote:

> -static void prune_base_data(struct base_data *retain)
> +static void prune_base_data(struct base_data *youngest_child)
>  {
>  	struct base_data *b;
>  	struct thread_local *data = get_thread_data();
> -	for (b = data->base_cache;
> -	     data->base_cache_used > delta_base_cache_limit && b;
> -	     b = b->child) {
> -		if (b->data && b != retain)
> -			free_base_data(b);
> +	struct base_data **ancestry = NULL;
> +	int nr = 0, alloc = 0;

Minor, but please use size_t for allocation variables.

> +	int i;

Technically this probably ought to be a size_t as well, but I'm much
more concerned about the allocation ones, where we might accidentally
overflow and underallocate a buffer. Overflowing "i" would probably just
lead to an error or bad result.

I _think_ what the patch is actually doing makes sense (taking as an
assumption that it's heading in a useful direction for the end of the
series).

-Peff
diff mbox series

Patch

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 99f6e2957f..35f7f9e52b 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -34,7 +34,6 @@  struct object_stat {
 
 struct base_data {
 	struct base_data *base;
-	struct base_data *child;
 	struct object_entry *obj;
 	void *data;
 	unsigned long size;
@@ -44,7 +43,6 @@  struct base_data {
 
 struct thread_local {
 	pthread_t thread;
-	struct base_data *base_cache;
 	size_t base_cache_used;
 	int pack_fd;
 };
@@ -380,27 +378,37 @@  static void free_base_data(struct base_data *c)
 	}
 }
 
-static void prune_base_data(struct base_data *retain)
+static void prune_base_data(struct base_data *youngest_child)
 {
 	struct base_data *b;
 	struct thread_local *data = get_thread_data();
-	for (b = data->base_cache;
-	     data->base_cache_used > delta_base_cache_limit && b;
-	     b = b->child) {
-		if (b->data && b != retain)
-			free_base_data(b);
+	struct base_data **ancestry = NULL;
+	int nr = 0, alloc = 0;
+	int i;
+
+	if (data->base_cache_used <= delta_base_cache_limit)
+		return;
+
+	/*
+	 * Free all ancestors of youngest_child until we have enough space,
+	 * starting with the oldest. (We cannot free youngest_child itself.)
+	 */
+	for (b = youngest_child->base; b != NULL; b = b->base) {
+		ALLOC_GROW(ancestry, nr + 1, alloc);
+		ancestry[nr++] = b;
+	}
+	for (i = nr - 1;
+	     i >= 0 && data->base_cache_used > delta_base_cache_limit;
+	     i--) {
+		if (ancestry[i]->data)
+			free_base_data(ancestry[i]);
 	}
+	free(ancestry);
 }
 
 static void link_base_data(struct base_data *base, struct base_data *c)
 {
-	if (base)
-		base->child = c;
-	else
-		get_thread_data()->base_cache = c;
-
 	c->base = base;
-	c->child = NULL;
 	if (c->data)
 		get_thread_data()->base_cache_used += c->size;
 	prune_base_data(c);
@@ -408,11 +416,6 @@  static void link_base_data(struct base_data *base, struct base_data *c)
 
 static void unlink_base_data(struct base_data *c)
 {
-	struct base_data *base = c->base;
-	if (base)
-		base->child = NULL;
-	else
-		get_thread_data()->base_cache = NULL;
 	free_base_data(c);
 }