diff mbox series

[v4,02/17] pack-mtimes: support reading .mtimes files

Message ID 8f9fd21be9fcdda5c73d800fc66d1087d61a6888.1652915424.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series cruft packs | expand

Commit Message

Taylor Blau May 18, 2022, 11:10 p.m. UTC
To store the individual mtimes of objects in a cruft pack, introduce a
new `.mtimes` format that can optionally accompany a single pack in the
repository.

The format is defined in Documentation/technical/pack-format.txt, and
stores a 4-byte network order timestamp for each object in name (index)
order.

This patch prepares for cruft packs by defining the `.mtimes` format,
and introducing a basic API that callers can use to read out individual
mtimes.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/technical/pack-format.txt |  19 ++++
 Makefile                                |   1 +
 builtin/repack.c                        |   1 +
 object-store.h                          |   5 +-
 pack-mtimes.c                           | 126 ++++++++++++++++++++++++
 pack-mtimes.h                           |  15 +++
 packfile.c                              |  19 +++-
 7 files changed, 183 insertions(+), 3 deletions(-)
 create mode 100644 pack-mtimes.c
 create mode 100644 pack-mtimes.h

Comments

Ævar Arnfjörð Bjarmason May 19, 2022, 10:40 a.m. UTC | #1
On Wed, May 18 2022, Taylor Blau wrote:

Nit:

> +  - A 4-byte magic number '0x4d544d45' ('MTME').
> +
> +  - A 4-byte version identifier (= 1).
> +
> +  - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256).

Here we let it suffice that later we'll say "All 4-byte numbers are in
network order".

> +  - A table of 4-byte unsigned integers in network order. The ith

But here we call out "network order" explicitly, shouldn't this just be
s/ in network order//?

> +    value is the modification time (mtime) of the ith object in the
> +    corresponding pack by lexicographic (index) order. The mtimes
> +    count standard epoch seconds.
> +
> +  - A trailer, containing a checksum of the corresponding packfile,
> +    and a checksum of all of the above (each having length according
> +    to the specified hash function).
> +
> +All 4-byte numbers are in network order.

I.e. this is sufficient.
Junio C Hamano May 19, 2022, 3:21 p.m. UTC | #2
Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> On Wed, May 18 2022, Taylor Blau wrote:
>
> Nit:
>
>> +  - A 4-byte magic number '0x4d544d45' ('MTME').
>> +
>> +  - A 4-byte version identifier (= 1).
>> +
>> +  - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256).
>
> Here we let it suffice that later we'll say "All 4-byte numbers are in
> network order".
>
>> +  - A table of 4-byte unsigned integers in network order. The ith
>
> But here we call out "network order" explicitly, shouldn't this just be
> s/ in network order//?
>
>> +    value is the modification time (mtime) of the ith object in the
>> +    corresponding pack by lexicographic (index) order. The mtimes
>> +    count standard epoch seconds.
>> +
>> +  - A trailer, containing a checksum of the corresponding packfile,
>> +    and a checksum of all of the above (each having length according
>> +    to the specified hash function).
>> +
>> +All 4-byte numbers are in network order.
>
> I.e. this is sufficient.

Very good eyes.  One explicit mention among several others can
indeed be misleading the readers.

When asked for "network order", all your search engines show are
entries about "network byte order", so let's use that longer form of
spelling.

Thanks.
Ævar Arnfjörð Bjarmason May 20, 2022, 7:32 a.m. UTC | #3
On Thu, May 19 2022, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> On Wed, May 18 2022, Taylor Blau wrote:
>>
>> Nit:
>>
>>> +  - A 4-byte magic number '0x4d544d45' ('MTME').
>>> +
>>> +  - A 4-byte version identifier (= 1).
>>> +
>>> +  - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256).
>>
>> Here we let it suffice that later we'll say "All 4-byte numbers are in
>> network order".
>>
>>> +  - A table of 4-byte unsigned integers in network order. The ith
>>
>> But here we call out "network order" explicitly, shouldn't this just be
>> s/ in network order//?
>>
>>> +    value is the modification time (mtime) of the ith object in the
>>> +    corresponding pack by lexicographic (index) order. The mtimes
>>> +    count standard epoch seconds.
>>> +
>>> +  - A trailer, containing a checksum of the corresponding packfile,
>>> +    and a checksum of all of the above (each having length according
>>> +    to the specified hash function).
>>> +
>>> +All 4-byte numbers are in network order.
>>
>> I.e. this is sufficient.
>
> Very good eyes.  One explicit mention among several others can
> indeed be misleading the readers.
>
> When asked for "network order", all your search engines show are
> entries about "network byte order", so let's use that longer form of
> spelling.

*Nod*, note that "network order" is on "master" already though,
i.e. this section re-used a template introduced in 2f4ba2a867f
(packfile: prepare for the existence of '*.rev' files, 2021-01-25) just
above this hunk.

Before that change the rest of the file used "network byte order"
consistently.
Taylor Blau May 20, 2022, 10:37 p.m. UTC | #4
On Fri, May 20, 2022 at 09:32:50AM +0200, Ævar Arnfjörð Bjarmason wrote:
>
> On Thu, May 19 2022, Junio C Hamano wrote:
>
> > Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
> >
> >> On Wed, May 18 2022, Taylor Blau wrote:
> >>
> >> Nit:
> >>
> >>> +  - A 4-byte magic number '0x4d544d45' ('MTME').
> >>> +
> >>> +  - A 4-byte version identifier (= 1).
> >>> +
> >>> +  - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256).
> >>
> >> Here we let it suffice that later we'll say "All 4-byte numbers are in
> >> network order".
> >>
> >>> +  - A table of 4-byte unsigned integers in network order. The ith
> >>
> >> But here we call out "network order" explicitly, shouldn't this just be
> >> s/ in network order//?
> >>
> >>> +    value is the modification time (mtime) of the ith object in the
> >>> +    corresponding pack by lexicographic (index) order. The mtimes
> >>> +    count standard epoch seconds.
> >>> +
> >>> +  - A trailer, containing a checksum of the corresponding packfile,
> >>> +    and a checksum of all of the above (each having length according
> >>> +    to the specified hash function).
> >>> +
> >>> +All 4-byte numbers are in network order.
> >>
> >> I.e. this is sufficient.
> >
> > Very good eyes.  One explicit mention among several others can
> > indeed be misleading the readers.
> >
> > When asked for "network order", all your search engines show are
> > entries about "network byte order", so let's use that longer form of
> > spelling.
>
> *Nod*, note that "network order" is on "master" already though,
> i.e. this section re-used a template introduced in 2f4ba2a867f
> (packfile: prepare for the existence of '*.rev' files, 2021-01-25) just
> above this hunk.
>
> Before that change the rest of the file used "network byte order"
> consistently.

Hmm. e0d1bcf825 (multi-pack-index: add format details, 2018-07-12)
(which predates 2f4ba2a867f by a few years) introduced the first use of
"network order" as opposed to "network byte order".

I think it's worth cleaning this up, but let's do it in two parts.
I'll send a rerolled version of tb/cruft-packs that moves the "All
4-byte numbers are in network order" to the top of that section,
switching "network order" for "network byte order", and dropping other
mentions of "network [byte] order" from that section.

Then, we can come back later and perhaps do something like the following
(but I don't want to do it now and tie up this series with semi-related
cleanups):

--- 8< ---

diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt
index b520aa9c45..2591a410fd 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -276,6 +276,8 @@ Pack file entry: <+

 == pack-*.rev files have the format:

+All 4-byte numbers are in network byte order.
+
   - A 4-byte magic number '0x52494458' ('RIDX').

   - A 4-byte version identifier (= 1).
@@ -283,8 +285,8 @@ Pack file entry: <+
   - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256).

   - A table of index positions (one per packed object, num_objects in
-    total, each a 4-byte unsigned integer in network order), sorted by
-    their corresponding offsets in the packfile.
+    total, each a 4-byte unsigned integer), sorted by their
+    corresponding offsets in the packfile.

   - A trailer, containing a:

@@ -292,8 +294,6 @@ Pack file entry: <+

     a checksum of all of the above.

-All 4-byte numbers are in network order.
-
 == pack-*.mtimes files have the format:

 All 4-byte numbers are in network byte order.
@@ -322,7 +322,7 @@ the body into "chunks" and provide a lookup table at the beginning of the
 body. The header includes certain length values, such as the number of packs,
 the number of base MIDX files, hash lengths and types.

-All 4-byte numbers are in network order.
+All 4-byte numbers are in network byte order.

 HEADER:

@@ -397,8 +397,8 @@ CHUNK DATA:

 	[Optional] Bitmap pack order (ID: {'R', 'I', 'D', 'X'})
 	    A list of MIDX positions (one per object in the MIDX, num_objects in
-	    total, each a 4-byte unsigned integer in network byte order), sorted
-	    according to their relative bitmap/pseudo-pack positions.
+	    total, each a 4-byte unsigned integer), sorted according to their
+	    relative bitmap/pseudo-pack positions.

 TRAILER:


--- >8 ---

Thanks,
Taylor
diff mbox series

Patch

diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt
index 6d3efb7d16..c443dbb526 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -294,6 +294,25 @@  Pack file entry: <+
 
 All 4-byte numbers are in network order.
 
+== pack-*.mtimes files have the format:
+
+  - A 4-byte magic number '0x4d544d45' ('MTME').
+
+  - A 4-byte version identifier (= 1).
+
+  - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256).
+
+  - A table of 4-byte unsigned integers in network order. The ith
+    value is the modification time (mtime) of the ith object in the
+    corresponding pack by lexicographic (index) order. The mtimes
+    count standard epoch seconds.
+
+  - A trailer, containing a checksum of the corresponding packfile,
+    and a checksum of all of the above (each having length according
+    to the specified hash function).
+
+All 4-byte numbers are in network order.
+
 == multi-pack-index (MIDX) files have the following format:
 
 The multi-pack-index files refer to multiple pack-files and loose objects.
diff --git a/Makefile b/Makefile
index 61aadf3ce8..a299580b7c 100644
--- a/Makefile
+++ b/Makefile
@@ -993,6 +993,7 @@  LIB_OBJS += oidtree.o
 LIB_OBJS += pack-bitmap-write.o
 LIB_OBJS += pack-bitmap.o
 LIB_OBJS += pack-check.o
+LIB_OBJS += pack-mtimes.o
 LIB_OBJS += pack-objects.o
 LIB_OBJS += pack-revindex.o
 LIB_OBJS += pack-write.o
diff --git a/builtin/repack.c b/builtin/repack.c
index d1a563d5b6..e7a3920c6d 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -217,6 +217,7 @@  static struct {
 } exts[] = {
 	{".pack"},
 	{".rev", 1},
+	{".mtimes", 1},
 	{".bitmap", 1},
 	{".promisor", 1},
 	{".idx"},
diff --git a/object-store.h b/object-store.h
index 53996018c1..2c4671ed7a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -115,12 +115,15 @@  struct packed_git {
 		 freshened:1,
 		 do_not_close:1,
 		 pack_promisor:1,
-		 multi_pack_index:1;
+		 multi_pack_index:1,
+		 is_cruft:1;
 	unsigned char hash[GIT_MAX_RAWSZ];
 	struct revindex_entry *revindex;
 	const uint32_t *revindex_data;
 	const uint32_t *revindex_map;
 	size_t revindex_size;
+	const uint32_t *mtimes_map;
+	size_t mtimes_size;
 	/* something like ".git/objects/pack/xxxxx.pack" */
 	char pack_name[FLEX_ARRAY]; /* more */
 };
diff --git a/pack-mtimes.c b/pack-mtimes.c
new file mode 100644
index 0000000000..46ad584af1
--- /dev/null
+++ b/pack-mtimes.c
@@ -0,0 +1,126 @@ 
+#include "pack-mtimes.h"
+#include "object-store.h"
+#include "packfile.h"
+
+static char *pack_mtimes_filename(struct packed_git *p)
+{
+	size_t len;
+	if (!strip_suffix(p->pack_name, ".pack", &len))
+		BUG("pack_name does not end in .pack");
+	/* NEEDSWORK: this could reuse code from pack-revindex.c. */
+	return xstrfmt("%.*s.mtimes", (int)len, p->pack_name);
+}
+
+#define MTIMES_HEADER_SIZE (12)
+#define MTIMES_MIN_SIZE (MTIMES_HEADER_SIZE + (2 * the_hash_algo->rawsz))
+
+struct mtimes_header {
+	uint32_t signature;
+	uint32_t version;
+	uint32_t hash_id;
+};
+
+static int load_pack_mtimes_file(char *mtimes_file,
+				 uint32_t num_objects,
+				 const uint32_t **data_p, size_t *len_p)
+{
+	int fd, ret = 0;
+	struct stat st;
+	void *data = NULL;
+	size_t mtimes_size;
+	struct mtimes_header header;
+	uint32_t *hdr;
+
+	fd = git_open(mtimes_file);
+
+	if (fd < 0) {
+		ret = -1;
+		goto cleanup;
+	}
+	if (fstat(fd, &st)) {
+		ret = error_errno(_("failed to read %s"), mtimes_file);
+		goto cleanup;
+	}
+
+	mtimes_size = xsize_t(st.st_size);
+
+	if (mtimes_size < MTIMES_MIN_SIZE) {
+		ret = error(_("mtimes file %s is too small"), mtimes_file);
+		goto cleanup;
+	}
+
+	if (mtimes_size - MTIMES_MIN_SIZE != st_mult(sizeof(uint32_t), num_objects)) {
+		ret = error(_("mtimes file %s is corrupt"), mtimes_file);
+		goto cleanup;
+	}
+
+	data = hdr = xmmap(NULL, mtimes_size, PROT_READ, MAP_PRIVATE, fd, 0);
+
+	header.signature = ntohl(hdr[0]);
+	header.version = ntohl(hdr[1]);
+	header.hash_id = ntohl(hdr[2]);
+
+	if (header.signature != MTIMES_SIGNATURE) {
+		ret = error(_("mtimes file %s has unknown signature"), mtimes_file);
+		goto cleanup;
+	}
+
+	if (header.version != 1) {
+		ret = error(_("mtimes file %s has unsupported version %"PRIu32),
+			    mtimes_file, header.version);
+		goto cleanup;
+	}
+
+	if (!(header.hash_id == 1 || header.hash_id == 2)) {
+		ret = error(_("mtimes file %s has unsupported hash id %"PRIu32),
+			    mtimes_file, header.hash_id);
+		goto cleanup;
+	}
+
+cleanup:
+	if (ret) {
+		if (data)
+			munmap(data, mtimes_size);
+	} else {
+		*len_p = mtimes_size;
+		*data_p = (const uint32_t *)data;
+	}
+
+	close(fd);
+	return ret;
+}
+
+int load_pack_mtimes(struct packed_git *p)
+{
+	char *mtimes_name = NULL;
+	int ret = 0;
+
+	if (!p->is_cruft)
+		return ret; /* not a cruft pack */
+	if (p->mtimes_map)
+		return ret; /* already loaded */
+
+	ret = open_pack_index(p);
+	if (ret < 0)
+		goto cleanup;
+
+	mtimes_name = pack_mtimes_filename(p);
+	ret = load_pack_mtimes_file(mtimes_name,
+				    p->num_objects,
+				    &p->mtimes_map,
+				    &p->mtimes_size);
+cleanup:
+	free(mtimes_name);
+	return ret;
+}
+
+uint32_t nth_packed_mtime(struct packed_git *p, uint32_t pos)
+{
+	if (!p->mtimes_map)
+		BUG("pack .mtimes file not loaded for %s", p->pack_name);
+	if (p->num_objects <= pos)
+		BUG("pack .mtimes out-of-bounds (%"PRIu32" vs %"PRIu32")",
+		    pos, p->num_objects);
+
+	return get_be32(p->mtimes_map + pos + 3);
+}
diff --git a/pack-mtimes.h b/pack-mtimes.h
new file mode 100644
index 0000000000..38ddb9f893
--- /dev/null
+++ b/pack-mtimes.h
@@ -0,0 +1,15 @@ 
+#ifndef PACK_MTIMES_H
+#define PACK_MTIMES_H
+
+#include "git-compat-util.h"
+
+#define MTIMES_SIGNATURE 0x4d544d45 /* "MTME" */
+#define MTIMES_VERSION 1
+
+struct packed_git;
+
+int load_pack_mtimes(struct packed_git *p);
+
+uint32_t nth_packed_mtime(struct packed_git *p, uint32_t pos);
+
+#endif
diff --git a/packfile.c b/packfile.c
index 835b2d2716..fc0245fbab 100644
--- a/packfile.c
+++ b/packfile.c
@@ -334,12 +334,22 @@  static void close_pack_revindex(struct packed_git *p)
 	p->revindex_data = NULL;
 }
 
+static void close_pack_mtimes(struct packed_git *p)
+{
+	if (!p->mtimes_map)
+		return;
+
+	munmap((void *)p->mtimes_map, p->mtimes_size);
+	p->mtimes_map = NULL;
+}
+
 void close_pack(struct packed_git *p)
 {
 	close_pack_windows(p);
 	close_pack_fd(p);
 	close_pack_index(p);
 	close_pack_revindex(p);
+	close_pack_mtimes(p);
 	oidset_clear(&p->bad_objects);
 }
 
@@ -363,7 +373,7 @@  void close_object_store(struct raw_object_store *o)
 
 void unlink_pack_path(const char *pack_name, int force_delete)
 {
-	static const char *exts[] = {".pack", ".idx", ".rev", ".keep", ".bitmap", ".promisor"};
+	static const char *exts[] = {".pack", ".idx", ".rev", ".keep", ".bitmap", ".promisor", ".mtimes"};
 	int i;
 	struct strbuf buf = STRBUF_INIT;
 	size_t plen;
@@ -718,6 +728,10 @@  struct packed_git *add_packed_git(const char *path, size_t path_len, int local)
 	if (!access(p->pack_name, F_OK))
 		p->pack_promisor = 1;
 
+	xsnprintf(p->pack_name + path_len, alloc - path_len, ".mtimes");
+	if (!access(p->pack_name, F_OK))
+		p->is_cruft = 1;
+
 	xsnprintf(p->pack_name + path_len, alloc - path_len, ".pack");
 	if (stat(p->pack_name, &st) || !S_ISREG(st.st_mode)) {
 		free(p);
@@ -869,7 +883,8 @@  static void prepare_pack(const char *full_name, size_t full_name_len,
 	    ends_with(file_name, ".pack") ||
 	    ends_with(file_name, ".bitmap") ||
 	    ends_with(file_name, ".keep") ||
-	    ends_with(file_name, ".promisor"))
+	    ends_with(file_name, ".promisor") ||
+	    ends_with(file_name, ".mtimes"))
 		string_list_append(data->garbage, full_name);
 	else
 		report_garbage(PACKDIR_FILE_GARBAGE, full_name);