diff mbox series

[v2,12/24] pack-bitmap-write: fill bitmap with commit history

Message ID 8e5607929d66a3c808dbe3a06c312d0cda1ef568.1605649533.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series pack-bitmap: bitmap generation improvements | expand

Commit Message

Taylor Blau Nov. 17, 2020, 9:47 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

The fill_bitmap_commit() method assumes that every parent of the given
commit is already part of the current bitmap. Instead of making that
assumption, let's walk parents until we reach commits already part of
the bitmap. Set the value for that parent immediately after querying to
save time doing double calls to find_object_pos() and to avoid inserting
the parent into the queue multiple times.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 pack-bitmap-write.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

Comments

Junio C Hamano Nov. 22, 2020, 9:50 p.m. UTC | #1
Taylor Blau <me@ttaylorr.com> writes:

> From: Derrick Stolee <dstolee@microsoft.com>
>
> The fill_bitmap_commit() method assumes that every parent of the given
> commit is already part of the current bitmap. Instead of making that
> assumption, let's walk parents until we reach commits already part of
> the bitmap. Set the value for that parent immediately after querying to
> save time doing double calls to find_object_pos() and to avoid inserting
> the parent into the queue multiple times.

Is it because somebody found a case where the assumption does not
hold and the code with the assumption produces a wrong result?  Is
it because we can get a better result without making the assumption
the current code does?

In other words, can we explain why we are making the change in the
proposed log message?

> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  pack-bitmap-write.c | 30 +++++++++++++++++++++++-------
>  1 file changed, 23 insertions(+), 7 deletions(-)
>
> diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
> index d2d46ff5f4..361f3305a2 100644
> --- a/pack-bitmap-write.c
> +++ b/pack-bitmap-write.c
> @@ -12,6 +12,7 @@
>  #include "sha1-lookup.h"
>  #include "pack-objects.h"
>  #include "commit-reach.h"
> +#include "prio-queue.h"
>  
>  struct bitmapped_commit {
>  	struct commit *commit;
> @@ -279,17 +280,30 @@ static void fill_bitmap_tree(struct bitmap *bitmap,
>  }
>  
>  static void fill_bitmap_commit(struct bb_commit *ent,
> -			       struct commit *commit)
> +			       struct commit *commit,
> +			       struct prio_queue *queue)
>  {
>  	if (!ent->bitmap)
>  		ent->bitmap = bitmap_new();
>  
> -	/*
> -	 * mark ourselves, but do not bother with parents; their values
> -	 * will already have been propagated to us
> -	 */
>  	bitmap_set(ent->bitmap, find_object_pos(&commit->object.oid));
> -	fill_bitmap_tree(ent->bitmap, get_commit_tree(commit));
> +	prio_queue_put(queue, commit);
> +
> +	while (queue->nr) {
> +		struct commit_list *p;
> +		struct commit *c = prio_queue_get(queue);
> +
> +		bitmap_set(ent->bitmap, find_object_pos(&c->object.oid));
> +		fill_bitmap_tree(ent->bitmap, get_commit_tree(c));
> +
> +		for (p = c->parents; p; p = p->next) {
> +			int pos = find_object_pos(&p->item->object.oid);
> +			if (!bitmap_get(ent->bitmap, pos)) {
> +				bitmap_set(ent->bitmap, pos);
> +				prio_queue_put(queue, p->item);
> +			}
> +		}
> +	}
>  }
>  
>  static void store_selected(struct bb_commit *ent, struct commit *commit)
> @@ -319,6 +333,7 @@ void bitmap_writer_build(struct packing_data *to_pack)
>  	struct bitmap_builder bb;
>  	size_t i;
>  	int nr_stored = 0; /* for progress */
> +	struct prio_queue queue = { compare_commits_by_gen_then_commit_date };
>  
>  	writer.bitmaps = kh_init_oid_map();
>  	writer.to_pack = to_pack;
> @@ -335,7 +350,7 @@ void bitmap_writer_build(struct packing_data *to_pack)
>  		struct commit *child;
>  		int reused = 0;
>  
> -		fill_bitmap_commit(ent, commit);
> +		fill_bitmap_commit(ent, commit, &queue);
>  
>  		if (ent->selected) {
>  			store_selected(ent, commit);
> @@ -360,6 +375,7 @@ void bitmap_writer_build(struct packing_data *to_pack)
>  			bitmap_free(ent->bitmap);
>  		ent->bitmap = NULL;
>  	}
> +	clear_prio_queue(&queue);
>  	bitmap_builder_clear(&bb);
>  
>  	stop_progress(&writer.progress);
Derrick Stolee Nov. 23, 2020, 2:54 p.m. UTC | #2
On 11/22/2020 4:50 PM, Junio C Hamano wrote:
> Taylor Blau <me@ttaylorr.com> writes:
> 
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The fill_bitmap_commit() method assumes that every parent of the given
>> commit is already part of the current bitmap. Instead of making that
>> assumption, let's walk parents until we reach commits already part of
>> the bitmap. Set the value for that parent immediately after querying to
>> save time doing double calls to find_object_pos() and to avoid inserting
>> the parent into the queue multiple times.
> 
> Is it because somebody found a case where the assumption does not
> hold and the code with the assumption produces a wrong result?  Is
> it because we can get a better result without making the assumption
> the current code does?

The algorithm from "pack-bitmap-write: reimplement bitmap writing"
that calls fill_bitmap_commit() satisfies this assumption, since it
computes a reachability bitmap for every commit during the reverse
walk. We will soon change that algorithm to "skip" commits, so we
need this step in fill_bitmap_commit() to walk forward to fill the
gaps.

> In other words, can we explain why we are making the change in the
> proposed log message?
I'm sure Taylor and I can work out a better wording to make this
more clear.

Thanks,
-Stolee
Jonathan Tan Nov. 25, 2020, 1:14 a.m. UTC | #3
> From: Derrick Stolee <dstolee@microsoft.com>
> 
> The fill_bitmap_commit() method assumes that every parent of the given
> commit is already part of the current bitmap. Instead of making that
> assumption, let's walk parents until we reach commits already part of
> the bitmap. Set the value for that parent immediately after querying to
> save time doing double calls to find_object_pos() and to avoid inserting
> the parent into the queue multiple times.

I see from the later patches that this has no effect until the part
where we can skip commits, but as Junio says [1], it's worth mentioning
it here. Maybe something like:

  The fill_bitmap_commit() method assumes that every parent of the given
  commit is already part of the current bitmap. This is currently
  correct, but a subsequent patch will change the nature of the edges of
  the graph from parent-child to ancestor-descendant. In preparation for
  that, let's walk parents...

>  static void fill_bitmap_commit(struct bb_commit *ent,
> -			       struct commit *commit)
> +			       struct commit *commit,
> +			       struct prio_queue *queue)

As far as I can see, this function expects an empty queue and always
ends with the queue empty, and the only reason why we don't instantiate
a new queue every time is so that we can save on the internal array
allocation/deallocation. Maybe add a comment to that effect.

>  {
>  	if (!ent->bitmap)
>  		ent->bitmap = bitmap_new();
>  
> -	/*
> -	 * mark ourselves, but do not bother with parents; their values
> -	 * will already have been propagated to us
> -	 */
>  	bitmap_set(ent->bitmap, find_object_pos(&commit->object.oid));
> -	fill_bitmap_tree(ent->bitmap, get_commit_tree(commit));
> +	prio_queue_put(queue, commit);
> +
> +	while (queue->nr) {
> +		struct commit_list *p;
> +		struct commit *c = prio_queue_get(queue);
> +
> +		bitmap_set(ent->bitmap, find_object_pos(&c->object.oid));
> +		fill_bitmap_tree(ent->bitmap, get_commit_tree(c));
> +
> +		for (p = c->parents; p; p = p->next) {
> +			int pos = find_object_pos(&p->item->object.oid);
> +			if (!bitmap_get(ent->bitmap, pos)) {
> +				bitmap_set(ent->bitmap, pos);
> +				prio_queue_put(queue, p->item);
> +			}
> +		}
> +	}
>  }

[snip rest of code]

Everything else makes sense.
Taylor Blau Nov. 28, 2020, 5:21 p.m. UTC | #4
On Tue, Nov 24, 2020 at 05:14:09PM -0800, Jonathan Tan wrote:
> > From: Derrick Stolee <dstolee@microsoft.com>
> >
> > The fill_bitmap_commit() method assumes that every parent of the given
> > commit is already part of the current bitmap. Instead of making that
> > assumption, let's walk parents until we reach commits already part of
> > the bitmap. Set the value for that parent immediately after querying to
> > save time doing double calls to find_object_pos() and to avoid inserting
> > the parent into the queue multiple times.
>
> I see from the later patches that this has no effect until the part
> where we can skip commits, but as Junio says [1], it's worth mentioning
> it here. Maybe something like:
>
>   The fill_bitmap_commit() method assumes that every parent of the given
>   commit is already part of the current bitmap. This is currently
>   correct, but a subsequent patch will change the nature of the edges of
>   the graph from parent-child to ancestor-descendant. In preparation for
>   that, let's walk parents...

Thanks. Stolee and I worked a little on revising this last week, and I
think that the current log message is more along these lines. Here's
what we wrote:

    pack-bitmap-write: fill bitmap with commit history

    The current implementation of bitmap_writer_build() creates a
    reachability bitmap for every walked commit. After computing a bitmap
    for a commit, those bits are pushed to an in-progress bitmap for its
    children.

    fill_bitmap_commit() assumes the bits corresponding to objects
    reachable from the parents of a commit are already set. This means that
    when visiting a new commit, we only have to walk the objects reachable
    between it and any of its parents.

    A future change to bitmap_writer_build() will relax this condition so
    not all parents have their reachable objects set in the in-progress
    bitmap. Prepare for that by having 'fill_bitmap_commit()' walk
    parents until reaching commits whose bits are already set. Then, walk
    the trees for these commits as well.

    This has no functional change with the current implementation of
    bitmap_writer_build().

> >  static void fill_bitmap_commit(struct bb_commit *ent,
> > -			       struct commit *commit)
> > +			       struct commit *commit,
> > +			       struct prio_queue *queue)
>
> As far as I can see, this function expects an empty queue and always
> ends with the queue empty, and the only reason why we don't instantiate
> a new queue every time is so that we can save on the internal array
> allocation/deallocation. Maybe add a comment to that effect.

Sure. Would you find a comment like that more helpful above
'fill_bitmap_commit()', or above the declaration of 'queue' (in
'bitmap_writer_build()') itself?

Thanks,
Taylor
Jonathan Tan Nov. 30, 2020, 6:33 p.m. UTC | #5
> Thanks. Stolee and I worked a little on revising this last week, and I
> think that the current log message is more along these lines. Here's
> what we wrote:
> 
>     pack-bitmap-write: fill bitmap with commit history
> 
>     The current implementation of bitmap_writer_build() creates a
>     reachability bitmap for every walked commit. After computing a bitmap
>     for a commit, those bits are pushed to an in-progress bitmap for its
>     children.
> 
>     fill_bitmap_commit() assumes the bits corresponding to objects
>     reachable from the parents of a commit are already set. This means that
>     when visiting a new commit, we only have to walk the objects reachable
>     between it and any of its parents.
> 
>     A future change to bitmap_writer_build() will relax this condition so
>     not all parents have their reachable objects set in the in-progress

I would write "not all parents have their bits set" instead, but this is
fine too.

>     bitmap. Prepare for that by having 'fill_bitmap_commit()' walk
>     parents until reaching commits whose bits are already set. Then, walk
>     the trees for these commits as well.
> 
>     This has no functional change with the current implementation of
>     bitmap_writer_build().
> 
> > >  static void fill_bitmap_commit(struct bb_commit *ent,
> > > -			       struct commit *commit)
> > > +			       struct commit *commit,
> > > +			       struct prio_queue *queue)
> >
> > As far as I can see, this function expects an empty queue and always
> > ends with the queue empty, and the only reason why we don't instantiate
> > a new queue every time is so that we can save on the internal array
> > allocation/deallocation. Maybe add a comment to that effect.
> 
> Sure. Would you find a comment like that more helpful above
> 'fill_bitmap_commit()', or above the declaration of 'queue' (in
> 'bitmap_writer_build()') itself?

I think it's better with fill_bitmap_commit(), as it's the one in
control of how "queue" will be used.
diff mbox series

Patch

diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index d2d46ff5f4..361f3305a2 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -12,6 +12,7 @@ 
 #include "sha1-lookup.h"
 #include "pack-objects.h"
 #include "commit-reach.h"
+#include "prio-queue.h"
 
 struct bitmapped_commit {
 	struct commit *commit;
@@ -279,17 +280,30 @@  static void fill_bitmap_tree(struct bitmap *bitmap,
 }
 
 static void fill_bitmap_commit(struct bb_commit *ent,
-			       struct commit *commit)
+			       struct commit *commit,
+			       struct prio_queue *queue)
 {
 	if (!ent->bitmap)
 		ent->bitmap = bitmap_new();
 
-	/*
-	 * mark ourselves, but do not bother with parents; their values
-	 * will already have been propagated to us
-	 */
 	bitmap_set(ent->bitmap, find_object_pos(&commit->object.oid));
-	fill_bitmap_tree(ent->bitmap, get_commit_tree(commit));
+	prio_queue_put(queue, commit);
+
+	while (queue->nr) {
+		struct commit_list *p;
+		struct commit *c = prio_queue_get(queue);
+
+		bitmap_set(ent->bitmap, find_object_pos(&c->object.oid));
+		fill_bitmap_tree(ent->bitmap, get_commit_tree(c));
+
+		for (p = c->parents; p; p = p->next) {
+			int pos = find_object_pos(&p->item->object.oid);
+			if (!bitmap_get(ent->bitmap, pos)) {
+				bitmap_set(ent->bitmap, pos);
+				prio_queue_put(queue, p->item);
+			}
+		}
+	}
 }
 
 static void store_selected(struct bb_commit *ent, struct commit *commit)
@@ -319,6 +333,7 @@  void bitmap_writer_build(struct packing_data *to_pack)
 	struct bitmap_builder bb;
 	size_t i;
 	int nr_stored = 0; /* for progress */
+	struct prio_queue queue = { compare_commits_by_gen_then_commit_date };
 
 	writer.bitmaps = kh_init_oid_map();
 	writer.to_pack = to_pack;
@@ -335,7 +350,7 @@  void bitmap_writer_build(struct packing_data *to_pack)
 		struct commit *child;
 		int reused = 0;
 
-		fill_bitmap_commit(ent, commit);
+		fill_bitmap_commit(ent, commit, &queue);
 
 		if (ent->selected) {
 			store_selected(ent, commit);
@@ -360,6 +375,7 @@  void bitmap_writer_build(struct packing_data *to_pack)
 			bitmap_free(ent->bitmap);
 		ent->bitmap = NULL;
 	}
+	clear_prio_queue(&queue);
 	bitmap_builder_clear(&bb);
 
 	stop_progress(&writer.progress);