diff mbox series

[14/24] pack-objects: keep track of `pack_start` for each reuse pack

Message ID 6f4fba861b59f909148775ee64c3ba89afc872b5.1701198172.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series pack-objects: multi-pack verbatim reuse | expand

Commit Message

Taylor Blau Nov. 28, 2023, 7:08 p.m. UTC
When reusing objects from a pack, we keep track of a set of one or more
`reused_chunk`s, corresponding to sections of one or more object(s) from
a source pack that we are reusing. Each chunk contains two pieces of
information:

  - the offset of the first object in the source pack (relative to the
    beginning of the source pack)
  - the difference between that offset, and the corresponding offset in
    the pack we're generating

The purpose of keeping track of these is so that we can patch an
OFS_DELTAs that cross over a section of the reuse pack that we didn't
take.

For instance, consider a hypothetical pack as shown below:

                                                (chunk #2)
                                                __________...
                                               /
                                              /
      +--------+---------+-------------------+---------+
  ... | <base> | <other> |      (unused)     | <delta> | ...
      +--------+---------+-------------------+---------+
       \                /
        \______________/
           (chunk #1)

Suppose that we are sending objects "base", "other", and "delta", and
that the "delta" object is stored as an OFS_DELTA, and that its base is
"base". If we don't send any objects in the "(unused)" range, we can't
copy the delta'd object directly, since its delta offset includes a
range of the pack that we didn't copy, so we have to account for that
difference when patching and reassembling the delta.

In order to compute this value correctly, we need to know not only where
we are in the packfile we're assembling (with `hashfile_total(f)`) but
also the position of the first byte of the packfile that we are
currently reusing.

Together, these two allow us to compute the reused chunk's offset
difference relative to the start of the reused pack, as desired.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

Comments

Patrick Steinhardt Dec. 7, 2023, 1:13 p.m. UTC | #1
On Tue, Nov 28, 2023 at 02:08:32PM -0500, Taylor Blau wrote:
> When reusing objects from a pack, we keep track of a set of one or more
> `reused_chunk`s, corresponding to sections of one or more object(s) from
> a source pack that we are reusing. Each chunk contains two pieces of
> information:
> 
>   - the offset of the first object in the source pack (relative to the
>     beginning of the source pack)
>   - the difference between that offset, and the corresponding offset in
>     the pack we're generating
> 
> The purpose of keeping track of these is so that we can patch an
> OFS_DELTAs that cross over a section of the reuse pack that we didn't
> take.
> 
> For instance, consider a hypothetical pack as shown below:
> 
>                                                 (chunk #2)
>                                                 __________...
>                                                /
>                                               /
>       +--------+---------+-------------------+---------+
>   ... | <base> | <other> |      (unused)     | <delta> | ...
>       +--------+---------+-------------------+---------+
>        \                /
>         \______________/
>            (chunk #1)
> 
> Suppose that we are sending objects "base", "other", and "delta", and
> that the "delta" object is stored as an OFS_DELTA, and that its base is
> "base". If we don't send any objects in the "(unused)" range, we can't
> copy the delta'd object directly, since its delta offset includes a
> range of the pack that we didn't copy, so we have to account for that
> difference when patching and reassembling the delta.
> 
> In order to compute this value correctly, we need to know not only where
> we are in the packfile we're assembling (with `hashfile_total(f)`) but
> also the position of the first byte of the packfile that we are
> currently reusing.
> 
> Together, these two allow us to compute the reused chunk's offset
> difference relative to the start of the reused pack, as desired.

Hm. I'm not quite sure I fully understand the motivation here. Is this
something that was broken all along? Why does it become a problem now?
Sorry if I'm missing the obvious here.

> Helped-by: Jeff King <peff@peff.net>
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  builtin/pack-objects.c | 11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 7682bd65bb..eb8be514d1 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -1016,6 +1016,7 @@ static off_t find_reused_offset(off_t where)
>  
>  static void write_reused_pack_one(struct packed_git *reuse_packfile,
>  				  size_t pos, struct hashfile *out,
> +				  off_t pack_start,
>  				  struct pack_window **w_curs)
>  {
>  	off_t offset, next, cur;
> @@ -1025,7 +1026,8 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
>  	offset = pack_pos_to_offset(reuse_packfile, pos);
>  	next = pack_pos_to_offset(reuse_packfile, pos + 1);
>  
> -	record_reused_object(offset, offset - hashfile_total(out));
> +	record_reused_object(offset,
> +			     offset - (hashfile_total(out) - pack_start));
>  
>  	cur = offset;
>  	type = unpack_object_header(reuse_packfile, w_curs, &cur, &size);
> @@ -1095,6 +1097,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
>  
>  static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
>  					 struct hashfile *out,
> +					 off_t pack_start UNUSED,
>  					 struct pack_window **w_curs)
>  {
>  	size_t pos = 0;
> @@ -1126,10 +1129,12 @@ static void write_reused_pack(struct packed_git *reuse_packfile,
>  {
>  	size_t i = 0;
>  	uint32_t offset;
> +	off_t pack_start = hashfile_total(f) - sizeof(struct pack_header);

Given that this patch in its current state doesn't seem to do anything
yet, am I right in assuming that `hashfile_total(f) - sizeof(struct
pack_header)` is always expected to be zero for now?

Patrick
Taylor Blau Dec. 7, 2023, 8:43 p.m. UTC | #2
On Thu, Dec 07, 2023 at 02:13:24PM +0100, Patrick Steinhardt wrote:
> > In order to compute this value correctly, we need to know not only where
> > we are in the packfile we're assembling (with `hashfile_total(f)`) but
> > also the position of the first byte of the packfile that we are
> > currently reusing.
> >
> > Together, these two allow us to compute the reused chunk's offset
> > difference relative to the start of the reused pack, as desired.
>
> Hm. I'm not quite sure I fully understand the motivation here. Is this
> something that was broken all along? Why does it become a problem now?
> Sorry if I'm missing the obvious here.

No worries, I should have explained this better. Indeed we do have to
worry about patching deltas today when reusing objects from a pack. But
we have to extend the implementation in order to perform reuse over
multiple packs when any of them (excluding the first, which would work
with the existing logic) have delta/base pairs on either side of a gap.

I'll try to make it a little clearer, thanks for pointing that out.

> > @@ -1126,10 +1129,12 @@ static void write_reused_pack(struct packed_git *reuse_packfile,
> >  {
> >  	size_t i = 0;
> >  	uint32_t offset;
> > +	off_t pack_start = hashfile_total(f) - sizeof(struct pack_header);
>
> Given that this patch in its current state doesn't seem to do anything
> yet, am I right in assuming that `hashfile_total(f) - sizeof(struct
> pack_header)` is always expected to be zero for now?

Yep, that's right.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7682bd65bb..eb8be514d1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1016,6 +1016,7 @@  static off_t find_reused_offset(off_t where)
 
 static void write_reused_pack_one(struct packed_git *reuse_packfile,
 				  size_t pos, struct hashfile *out,
+				  off_t pack_start,
 				  struct pack_window **w_curs)
 {
 	off_t offset, next, cur;
@@ -1025,7 +1026,8 @@  static void write_reused_pack_one(struct packed_git *reuse_packfile,
 	offset = pack_pos_to_offset(reuse_packfile, pos);
 	next = pack_pos_to_offset(reuse_packfile, pos + 1);
 
-	record_reused_object(offset, offset - hashfile_total(out));
+	record_reused_object(offset,
+			     offset - (hashfile_total(out) - pack_start));
 
 	cur = offset;
 	type = unpack_object_header(reuse_packfile, w_curs, &cur, &size);
@@ -1095,6 +1097,7 @@  static void write_reused_pack_one(struct packed_git *reuse_packfile,
 
 static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
 					 struct hashfile *out,
+					 off_t pack_start UNUSED,
 					 struct pack_window **w_curs)
 {
 	size_t pos = 0;
@@ -1126,10 +1129,12 @@  static void write_reused_pack(struct packed_git *reuse_packfile,
 {
 	size_t i = 0;
 	uint32_t offset;
+	off_t pack_start = hashfile_total(f) - sizeof(struct pack_header);
 	struct pack_window *w_curs = NULL;
 
 	if (allow_ofs_delta)
-		i = write_reused_pack_verbatim(reuse_packfile, f, &w_curs);
+		i = write_reused_pack_verbatim(reuse_packfile, f, pack_start,
+					       &w_curs);
 
 	for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
 		eword_t word = reuse_packfile_bitmap->words[i];
@@ -1146,7 +1151,7 @@  static void write_reused_pack(struct packed_git *reuse_packfile,
 			 * for why.
 			 */
 			write_reused_pack_one(reuse_packfile, pos + offset, f,
-					      &w_curs);
+					      pack_start, &w_curs);
 			display_progress(progress_state, ++written);
 		}
 	}