diff mbox series

[v2,3/6] fsck: don't hard die on invalid object types

Message ID patch-3.6-d0d9cb33315-20210413T093734Z-avarab@gmail.com (mailing list archive)
State New
Headers show
Series fsck: better "invalid object" error reporting | expand

Commit Message

Ævar Arnfjörð Bjarmason April 13, 2021, 9:43 a.m. UTC
Change builtin/fsck.c to pass down a
OBJECT_INFO_ALLOW_UNKNOWN_TYPE. This changes this very ungraceful
error:

    $ git hash-object --stdin -w -t garbage --literally </dev/null
    <OID>
    $ git fsck
    fatal: invalid object type
    $

Into:

    $ git fsck
    error: hash mismatch for <OID_PATH> (expected <OID>)
    error: <OID>: object corrupt or missing: <OID_PATH>
    [ the rest of the fsck output here, i.e. it didn't hard die ]

We'll still exit with non-zero, but now we'll finish the rest of the
traversal. The tests that's being added here asserts that we'll still
complain about other fsck issues (e.g. an unrelated dangling blob).

But why are we complaining about a "hash mismatch" for an object of a
type we don't know about? We shouldn't. This is the bare minimal
change needed to not make fsck hard die on a repository that's been
corrupted in this manner. In subsequent commits we'll teach fsck to
recognize this particular type of corruption and emit a better error
message.

The parse_loose_header() function being changed here is only used in
builtin/fsck.c, see f6371f92104 (sha1_file: add read_loose_object()
function, 2017-01-13) for its introduction.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/fsck.c  |  3 ++-
 object-file.c   | 24 ++++++++++--------------
 object-store.h  |  6 ++++--
 streaming.c     |  5 ++++-
 t/t1450-fsck.sh |  9 +++++++++
 5 files changed, 29 insertions(+), 18 deletions(-)

Comments

Jeff King April 23, 2021, 2:26 p.m. UTC | #1
On Tue, Apr 13, 2021 at 11:43:06AM +0200, Ævar Arnfjörð Bjarmason wrote:

> Change builtin/fsck.c to pass down a
> OBJECT_INFO_ALLOW_UNKNOWN_TYPE. This changes this very ungraceful
> error:
> 
>     $ git hash-object --stdin -w -t garbage --literally </dev/null
>     <OID>
>     $ git fsck
>     fatal: invalid object type
>     $
> 
> Into:
> 
>     $ git fsck
>     error: hash mismatch for <OID_PATH> (expected <OID>)
>     error: <OID>: object corrupt or missing: <OID_PATH>
>     [ the rest of the fsck output here, i.e. it didn't hard die ]
> 
> We'll still exit with non-zero, but now we'll finish the rest of the
> traversal. The tests that's being added here asserts that we'll still
> complain about other fsck issues (e.g. an unrelated dangling blob).
> 
> But why are we complaining about a "hash mismatch" for an object of a
> type we don't know about? We shouldn't. This is the bare minimal
> change needed to not make fsck hard die on a repository that's been
> corrupted in this manner. In subsequent commits we'll teach fsck to
> recognize this particular type of corruption and emit a better error
> message.

OK. The overall goal makes sense.

> The parse_loose_header() function being changed here is only used in
> builtin/fsck.c, see f6371f92104 (sha1_file: add read_loose_object()
> function, 2017-01-13) for its introduction.

This left me scratching my head for a long time. Did you mean
read_loose_object() in the beginning of the sentence?

> -static int parse_loose_header_extended(const char *hdr, struct object_info *oi,
> -				       unsigned int flags)
> +int parse_loose_header(const char *hdr,
> +		       struct object_info *oi,
> +		       unsigned int flags)

So we are getting rid of the "extended" form and just making the
non-extended way take an OI. That seems kind of orthogonal...

> --- a/streaming.c
> +++ b/streaming.c
> @@ -341,6 +341,9 @@ static struct stream_vtbl loose_vtbl = {
>  
>  static open_method_decl(loose)
>  {
> +	struct object_info oi2 = OBJECT_INFO_INIT;
> +	oi2.sizep = &st->size;
> +

...and is what leads us to having to touch this otherwise unrelated
function.

I don't mind _too_ much getting rid of a helper function that would have
only one caller remaining (though "oi2" is a bit mysterious here). But
it seems like the patch would have been a lot easier to understand if
that were separately done (and explained). AFAICT the functional change
here is just passing the flag to read_loose_object(), which could be
calling parse_loose_header_extended(). I guess that would have to become
public, but that seems reasonable.

-Peff
diff mbox series

Patch

diff --git a/builtin/fsck.c b/builtin/fsck.c
index 70ff95837ae..686f7cdfea0 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -600,7 +600,8 @@  static int fsck_loose(const struct object_id *oid, const char *path, void *data)
 	void *contents;
 	int eaten;
 
-	if (read_loose_object(path, oid, &type, &size, &contents) < 0) {
+	if (read_loose_object(path, oid, &type, &size, &contents,
+			      OBJECT_INFO_ALLOW_UNKNOWN_TYPE) < 0) {
 		errors_found |= ERROR_OBJECT;
 		error(_("%s: object corrupt or missing: %s"),
 		      oid_to_hex(oid), path);
diff --git a/object-file.c b/object-file.c
index 624af408cdc..26560a6281c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1294,8 +1294,9 @@  static void *unpack_loose_rest(git_zstream *stream,
  * too permissive for what we want to check. So do an anal
  * object header parse by hand.
  */
-static int parse_loose_header_extended(const char *hdr, struct object_info *oi,
-				       unsigned int flags)
+int parse_loose_header(const char *hdr,
+		       struct object_info *oi,
+		       unsigned int flags)
 {
 	const char *type_buf = hdr;
 	unsigned long size;
@@ -1355,14 +1356,6 @@  static int parse_loose_header_extended(const char *hdr, struct object_info *oi,
 	return *hdr ? -1 : type;
 }
 
-int parse_loose_header(const char *hdr, unsigned long *sizep)
-{
-	struct object_info oi = OBJECT_INFO_INIT;
-
-	oi.sizep = sizep;
-	return parse_loose_header_extended(hdr, &oi, 0);
-}
-
 static int loose_object_info(struct repository *r,
 			     const struct object_id *oid,
 			     struct object_info *oi, int flags)
@@ -1417,10 +1410,10 @@  static int loose_object_info(struct repository *r,
 	if (status < 0)
 		; /* Do nothing */
 	else if (hdrbuf.len) {
-		if ((status = parse_loose_header_extended(hdrbuf.buf, oi, flags)) < 0)
+		if ((status = parse_loose_header(hdrbuf.buf, oi, flags)) < 0)
 			status = error(_("unable to parse %s header with --allow-unknown-type"),
 				       oid_to_hex(oid));
-	} else if ((status = parse_loose_header_extended(hdr, oi, flags)) < 0)
+	} else if ((status = parse_loose_header(hdr, oi, flags)) < 0)
 		status = error(_("unable to parse %s header"), oid_to_hex(oid));
 
 	if (status >= 0 && oi->contentp) {
@@ -2497,13 +2490,16 @@  int read_loose_object(const char *path,
 		      const struct object_id *expected_oid,
 		      enum object_type *type,
 		      unsigned long *size,
-		      void **contents)
+		      void **contents,
+		      unsigned int oi_flags)
 {
 	int ret = -1;
 	void *map = NULL;
 	unsigned long mapsize;
 	git_zstream stream;
 	char hdr[MAX_HEADER_LEN];
+	struct object_info oi = OBJECT_INFO_INIT;
+	oi.sizep = size;
 
 	*contents = NULL;
 
@@ -2518,7 +2514,7 @@  int read_loose_object(const char *path,
 		goto out;
 	}
 
-	*type = parse_loose_header(hdr, size);
+	*type = parse_loose_header(hdr, &oi, oi_flags);
 	if (*type < 0) {
 		error(_("unable to parse header of %s"), path);
 		git_inflate_end(&stream);
diff --git a/object-store.h b/object-store.h
index 9117115a50c..ab86c8bf32c 100644
--- a/object-store.h
+++ b/object-store.h
@@ -245,7 +245,8 @@  int read_loose_object(const char *path,
 		      const struct object_id *expected_oid,
 		      enum object_type *type,
 		      unsigned long *size,
-		      void **contents);
+		      void **contents,
+		      unsigned int oi_flags);
 
 /* Retry packed storage after checking packed and loose storage */
 #define HAS_OBJECT_RECHECK_PACKED 1
@@ -480,7 +481,8 @@  int for_each_packed_object(each_packed_object_fn, void *,
 int unpack_loose_header(git_zstream *stream, unsigned char *map,
 			unsigned long mapsize, void *buffer,
 			unsigned long bufsiz);
-int parse_loose_header(const char *hdr, unsigned long *sizep);
+int parse_loose_header(const char *hdr, struct object_info *oi,
+		       unsigned int flags);
 int check_object_signature(struct repository *r, const struct object_id *oid,
 			   void *buf, unsigned long size, const char *type);
 int finalize_object_file(const char *tmpfile, const char *filename);
diff --git a/streaming.c b/streaming.c
index 800f07a52cc..e5d4dd2f654 100644
--- a/streaming.c
+++ b/streaming.c
@@ -341,6 +341,9 @@  static struct stream_vtbl loose_vtbl = {
 
 static open_method_decl(loose)
 {
+	struct object_info oi2 = OBJECT_INFO_INIT;
+	oi2.sizep = &st->size;
+
 	st->u.loose.mapped = map_loose_object(r, oid, &st->u.loose.mapsize);
 	if (!st->u.loose.mapped)
 		return -1;
@@ -349,7 +352,7 @@  static open_method_decl(loose)
 				 st->u.loose.mapsize,
 				 st->u.loose.hdr,
 				 sizeof(st->u.loose.hdr)) < 0) ||
-	    (parse_loose_header(st->u.loose.hdr, &st->size) < 0)) {
+	    (parse_loose_header(st->u.loose.hdr, &oi2, 0) < 0)) {
 		git_inflate_end(&st->z);
 		munmap(st->u.loose.mapped, st->u.loose.mapsize);
 		return -1;
diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh
index 1563b35f88c..025dd1b491a 100755
--- a/t/t1450-fsck.sh
+++ b/t/t1450-fsck.sh
@@ -863,4 +863,13 @@  test_expect_success 'detect corrupt index file in fsck' '
 	test_i18ngrep "bad index file" errors
 '
 
+test_expect_success 'fsck error and recovery on invalid object type' '
+	test_create_repo garbage-type &&
+	empty_blob=$(git -C garbage-type hash-object --stdin -w -t blob </dev/null) &&
+	garbage_blob=$(git -C garbage-type hash-object --stdin -w -t garbage --literally </dev/null) &&
+	test_must_fail git -C garbage-type fsck >out 2>err &&
+	grep "$garbage_blob: object corrupt or missing:" err &&
+	grep "dangling blob $empty_blob" out
+'
+
 test_done