From patchwork Wed Jan 18 20:35:30 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13107074
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id A3FBBC32793
	for <git@archiver.kernel.org>; Wed, 18 Jan 2023 20:36:24 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230093AbjARUgX (ORCPT <rfc822;git@archiver.kernel.org>);
        Wed, 18 Jan 2023 15:36:23 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:43506 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230089AbjARUf5 (ORCPT <rfc822;git@vger.kernel.org>);
        Wed, 18 Jan 2023 15:35:57 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CFE0027488
        for <git@vger.kernel.org>; Wed, 18 Jan 2023 12:35:31 -0800 (PST)
Received: (qmail 3319 invoked by uid 109); 18 Jan 2023 20:35:31 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Wed, 18 Jan 2023 20:35:31 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 24924 invoked by uid 111); 18 Jan 2023 20:35:32 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Wed, 18 Jan 2023 15:35:32 -0500
Authentication-Results: peff.net; auth=none
Date: Wed, 18 Jan 2023 15:35:30 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: =?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmo=?=
	=?utf-8?b?w7Zyw7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 1/6] t1007: modernize malformed object tests
Message-ID: <Y8hYEgMze3bY44/0@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

The tests in t1007 for detecting malformed objects have two
anachronisms:

 - they use "sha1" instead of "oid" in variable names, even though the
   script as a whole has been adapted to handle sha256

 - they use test_i18ngrep, which is no longer necessary

Since we'll be adding a new similar test, let's clean these up so they
are all consistently using the modern style.

Signed-off-by: Jeff King <peff@peff.net>
---
 t/t1007-hash-object.sh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/t/t1007-hash-object.sh b/t/t1007-hash-object.sh
index ac5ad8c740..2d2148d8fa 100755
--- a/t/t1007-hash-object.sh
+++ b/t/t1007-hash-object.sh
@@ -203,23 +203,23 @@ done
 test_expect_success 'too-short tree' '
 	echo abc >malformed-tree &&
 	test_must_fail git hash-object -t tree malformed-tree 2>err &&
-	test_i18ngrep "too-short tree object" err
+	grep "too-short tree object" err
 '
 
 test_expect_success 'malformed mode in tree' '
-	hex_sha1=$(echo foo | git hash-object --stdin -w) &&
-	bin_sha1=$(echo $hex_sha1 | hex2oct) &&
-	printf "9100644 \0$bin_sha1" >tree-with-malformed-mode &&
+	hex_oid=$(echo foo | git hash-object --stdin -w) &&
+	bin_oid=$(echo $hex_oid | hex2oct) &&
+	printf "9100644 \0$bin_oid" >tree-with-malformed-mode &&
 	test_must_fail git hash-object -t tree tree-with-malformed-mode 2>err &&
-	test_i18ngrep "malformed mode in tree entry" err
+	grep "malformed mode in tree entry" err
 '
 
 test_expect_success 'empty filename in tree' '
-	hex_sha1=$(echo foo | git hash-object --stdin -w) &&
-	bin_sha1=$(echo $hex_sha1 | hex2oct) &&
-	printf "100644 \0$bin_sha1" >tree-with-empty-filename &&
+	hex_oid=$(echo foo | git hash-object --stdin -w) &&
+	bin_oid=$(echo $hex_oid | hex2oct) &&
+	printf "100644 \0$bin_oid" >tree-with-empty-filename &&
 	test_must_fail git hash-object -t tree tree-with-empty-filename 2>err &&
-	test_i18ngrep "empty filename in tree entry" err
+	grep "empty filename in tree entry" err
 '
 
 test_expect_success 'corrupt commit' '

From patchwork Wed Jan 18 20:35:52 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13107075
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 9A2ACC38159
	for <git@archiver.kernel.org>; Wed, 18 Jan 2023 20:36:26 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230137AbjARUg0 (ORCPT <rfc822;git@archiver.kernel.org>);
        Wed, 18 Jan 2023 15:36:26 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:43516 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230120AbjARUf7 (ORCPT <rfc822;git@vger.kernel.org>);
        Wed, 18 Jan 2023 15:35:59 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 346415FD43
        for <git@vger.kernel.org>; Wed, 18 Jan 2023 12:35:54 -0800 (PST)
Received: (qmail 3329 invoked by uid 109); 18 Jan 2023 20:35:53 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Wed, 18 Jan 2023 20:35:53 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 24932 invoked by uid 111); 18 Jan 2023 20:35:55 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Wed, 18 Jan 2023 15:35:55 -0500
Authentication-Results: peff.net; auth=none
Date: Wed, 18 Jan 2023 15:35:52 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: =?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmo=?=
	=?utf-8?b?w7Zyw7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 2/6] t1006: stop using 0-padded timestamps
Message-ID: <Y8hYKD07HulByUcB@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

The fake objects in t1006 use dummy timestamps like "0000000000 +0000".
While this does make them look more like normal timestamps (which,
unless it is 1970, have many digits), it actually violates our fsck
checks, which complain about zero-padded timestamps.

This doesn't currently break anything, but let's future-proof our tests
against a version of hash-object which is a little more careful about
its input. We don't actually care about the exact values here (and in
fact, the helper functions in this script end up removing the timestamps
anyway, so we don't even have to adjust other parts of the tests).

Signed-off-by: Jeff King <peff@peff.net>
---
 t/t1006-cat-file.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index 23b8942edb..2d875b17d8 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -292,8 +292,8 @@ commit_message="Initial commit"
 commit_sha1=$(echo_without_newline "$commit_message" | git commit-tree $tree_sha1)
 commit_size=$(($(test_oid hexsz) + 137))
 commit_content="tree $tree_sha1
-author $GIT_AUTHOR_NAME <$GIT_AUTHOR_EMAIL> 0000000000 +0000
-committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 0000000000 +0000
+author $GIT_AUTHOR_NAME <$GIT_AUTHOR_EMAIL> 0 +0000
+committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 0 +0000
 
 $commit_message"
 
@@ -304,7 +304,7 @@ type blob
 tag hellotag
 tagger $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL>"
 tag_description="This is a tag"
-tag_content="$tag_header_without_timestamp 0000000000 +0000
+tag_content="$tag_header_without_timestamp 0 +0000
 
 $tag_description"
 

From patchwork Wed Jan 18 20:36:22 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13107076
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id C81F0C32793
	for <git@archiver.kernel.org>; Wed, 18 Jan 2023 20:36:58 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229821AbjARUg5 (ORCPT <rfc822;git@archiver.kernel.org>);
        Wed, 18 Jan 2023 15:36:57 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:40132 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230015AbjARUgi (ORCPT <rfc822;git@vger.kernel.org>);
        Wed, 18 Jan 2023 15:36:38 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 86FD193FC
        for <git@vger.kernel.org>; Wed, 18 Jan 2023 12:36:23 -0800 (PST)
Received: (qmail 3338 invoked by uid 109); 18 Jan 2023 20:36:22 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Wed, 18 Jan 2023 20:36:22 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 24937 invoked by uid 111); 18 Jan 2023 20:36:24 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Wed, 18 Jan 2023 15:36:24 -0500
Authentication-Results: peff.net; auth=none
Date: Wed, 18 Jan 2023 15:36:22 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: =?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmo=?=
	=?utf-8?b?w7Zyw7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 3/6] t7030: stop using invalid tag name
Message-ID: <Y8hYRn5+/zswl9Zx@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

We intentionally invalidate the signature of a tag by switching its tag
name from "seventh" to "7th forged". However, the latter is not a valid
tag name because it contains a space. This doesn't currently affect the
test, but we're better off using something syntactically valid. That
reduces the number of possible failure modes in the test, and
future-proofs us if git hash-object gets more picky about its input.

The t7031 script, which was mostly copied from t7030, has the same
problem, so we'll fix it, too.

Signed-off-by: Jeff King <peff@peff.net>
---
 t/t7030-verify-tag.sh            | 2 +-
 t/t7031-verify-tag-signed-ssh.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t7030-verify-tag.sh b/t/t7030-verify-tag.sh
index 10faa64515..6f526c37c2 100755
--- a/t/t7030-verify-tag.sh
+++ b/t/t7030-verify-tag.sh
@@ -115,7 +115,7 @@ test_expect_success GPGSM 'verify and show signatures x509 with high minTrustLev
 
 test_expect_success GPG 'detect fudged signature' '
 	git cat-file tag seventh-signed >raw &&
-	sed -e "/^tag / s/seventh/7th forged/" raw >forged1 &&
+	sed -e "/^tag / s/seventh/7th-forged/" raw >forged1 &&
 	git hash-object -w -t tag forged1 >forged1.tag &&
 	test_must_fail git verify-tag $(cat forged1.tag) 2>actual1 &&
 	grep "BAD signature from" actual1 &&
diff --git a/t/t7031-verify-tag-signed-ssh.sh b/t/t7031-verify-tag-signed-ssh.sh
index 1cb36b9ab8..36eb86a4b1 100755
--- a/t/t7031-verify-tag-signed-ssh.sh
+++ b/t/t7031-verify-tag-signed-ssh.sh
@@ -125,7 +125,7 @@ test_expect_success GPGSSH,GPGSSH_VERIFYTIME 'verify-tag failes with tag date ou
 test_expect_success GPGSSH 'detect fudged ssh signature' '
 	test_config gpg.ssh.allowedSignersFile "${GPGSSH_ALLOWED_SIGNERS}" &&
 	git cat-file tag seventh-signed >raw &&
-	sed -e "/^tag / s/seventh/7th forged/" raw >forged1 &&
+	sed -e "/^tag / s/seventh/7th-forged/" raw >forged1 &&
 	git hash-object -w -t tag forged1 >forged1.tag &&
 	test_must_fail git verify-tag $(cat forged1.tag) 2>actual1 &&
 	grep "${GPGSSH_BAD_SIGNATURE}" actual1 &&

From patchwork Wed Jan 18 20:41:56 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13107078
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 3E9D3C38147
	for <git@archiver.kernel.org>; Wed, 18 Jan 2023 20:42:03 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229705AbjARUmB (ORCPT <rfc822;git@archiver.kernel.org>);
        Wed, 18 Jan 2023 15:42:01 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48866 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229537AbjARUl7 (ORCPT <rfc822;git@vger.kernel.org>);
        Wed, 18 Jan 2023 15:41:59 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 80F205F399
        for <git@vger.kernel.org>; Wed, 18 Jan 2023 12:41:57 -0800 (PST)
Received: (qmail 3357 invoked by uid 109); 18 Jan 2023 20:41:57 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Wed, 18 Jan 2023 20:41:57 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 24980 invoked by uid 111); 18 Jan 2023 20:41:58 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Wed, 18 Jan 2023 15:41:58 -0500
Authentication-Results: peff.net; auth=none
Date: Wed, 18 Jan 2023 15:41:56 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: =?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmo=?=
	=?utf-8?b?w7Zyw7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 4/6] t: use hash-object --literally when created malformed
 objects
Message-ID: <Y8hZlN9rRg1msc0L@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

Many test scripts use hash-object to create malformed objects to see how
we handle the results in various commands. In some cases we already have
to use "hash-object --literally", because it does some rudimentary
quality checks. But let's use "--literally" more consistently to
future-proof these tests against hash-object learning to be more
careful.

Signed-off-by: Jeff King <peff@peff.net>
---
This patch is worth looking at because it shows the kinds of things the
new hash-object from patch 6 will reject.

Most of these are obviously terrible things that we'd want to complain
about, like broken emails, embedded NULs, and so on. The most
contentious one is probably a tag without a tagger line, which were
generated by early versions of Git (e.g., see Git's v0.99 tag). This is
an "info" in fsck (which is semantically like a warning, except
transfer.fsckObjects treats warnings as errors due to hysterical
raisins). But the hash-object change in patch 6 will reject it, because
it operates in strict mode.

That seems reasonable to me, since we're helping users avoid doing bad
things, and not dealing with existing objects.

 t/t1450-fsck.sh                 | 28 ++++++++++++++--------------
 t/t4054-diff-bogus-tree.sh      |  2 +-
 t/t4058-diff-duplicates.sh      |  2 +-
 t/t4212-log-corrupt.sh          |  4 ++--
 t/t5302-pack-index.sh           |  2 +-
 t/t5504-fetch-receive-strict.sh |  2 +-
 t/t5702-protocol-v2.sh          |  2 +-
 t/t6300-for-each-ref.sh         |  2 +-
 t/t7509-commit-authorship.sh    |  2 +-
 t/t7510-signed-commit.sh        |  2 +-
 t/t7528-signed-commit-ssh.sh    |  2 +-
 t/t8003-blame-corner-cases.sh   |  2 +-
 t/t9350-fast-export.sh          |  2 +-
 13 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh
index de0f6d5e7f..fdb886dfe4 100755
--- a/t/t1450-fsck.sh
+++ b/t/t1450-fsck.sh
@@ -212,7 +212,7 @@ test_expect_success 'email without @ is okay' '
 test_expect_success 'email with embedded > is not okay' '
 	git cat-file commit HEAD >basis &&
 	sed "s/@[a-z]/&>/" basis >bad-email &&
-	new=$(git hash-object -t commit -w --stdin <bad-email) &&
+	new=$(git hash-object --literally -t commit -w --stdin <bad-email) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -223,7 +223,7 @@ test_expect_success 'email with embedded > is not okay' '
 test_expect_success 'missing < email delimiter is reported nicely' '
 	git cat-file commit HEAD >basis &&
 	sed "s/<//" basis >bad-email-2 &&
-	new=$(git hash-object -t commit -w --stdin <bad-email-2) &&
+	new=$(git hash-object --literally -t commit -w --stdin <bad-email-2) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -234,7 +234,7 @@ test_expect_success 'missing < email delimiter is reported nicely' '
 test_expect_success 'missing email is reported nicely' '
 	git cat-file commit HEAD >basis &&
 	sed "s/[a-z]* <[^>]*>//" basis >bad-email-3 &&
-	new=$(git hash-object -t commit -w --stdin <bad-email-3) &&
+	new=$(git hash-object --literally -t commit -w --stdin <bad-email-3) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -245,7 +245,7 @@ test_expect_success 'missing email is reported nicely' '
 test_expect_success '> in name is reported' '
 	git cat-file commit HEAD >basis &&
 	sed "s/ </> </" basis >bad-email-4 &&
-	new=$(git hash-object -t commit -w --stdin <bad-email-4) &&
+	new=$(git hash-object --literally -t commit -w --stdin <bad-email-4) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -258,7 +258,7 @@ test_expect_success 'integer overflow in timestamps is reported' '
 	git cat-file commit HEAD >basis &&
 	sed "s/^\\(author .*>\\) [0-9]*/\\1 18446744073709551617/" \
 		<basis >bad-timestamp &&
-	new=$(git hash-object -t commit -w --stdin <bad-timestamp) &&
+	new=$(git hash-object --literally -t commit -w --stdin <bad-timestamp) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -269,7 +269,7 @@ test_expect_success 'integer overflow in timestamps is reported' '
 test_expect_success 'commit with NUL in header' '
 	git cat-file commit HEAD >basis &&
 	sed "s/author ./author Q/" <basis | q_to_nul >commit-NUL-header &&
-	new=$(git hash-object -t commit -w --stdin <commit-NUL-header) &&
+	new=$(git hash-object --literally -t commit -w --stdin <commit-NUL-header) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -292,7 +292,7 @@ test_expect_success 'tree object with duplicate entries' '
 			git cat-file tree $T &&
 			git cat-file tree $T
 		) |
-		git hash-object -w -t tree --stdin
+		git hash-object --literally -w -t tree --stdin
 	) &&
 	test_must_fail git fsck 2>out &&
 	test_i18ngrep "error in tree .*contains duplicate file entries" out
@@ -426,7 +426,7 @@ test_expect_success 'tag with incorrect tag name & missing tagger' '
 	This is an invalid tag.
 	EOF
 
-	tag=$(git hash-object -t tag -w --stdin <wrong-tag) &&
+	tag=$(git hash-object --literally -t tag -w --stdin <wrong-tag) &&
 	test_when_finished "remove_object $tag" &&
 	echo $tag >.git/refs/tags/wrong &&
 	test_when_finished "git update-ref -d refs/tags/wrong" &&
@@ -558,7 +558,7 @@ test_expect_success 'rev-list --verify-objects with commit graph (parent)' '
 test_expect_success 'force fsck to ignore double author' '
 	git cat-file commit HEAD >basis &&
 	sed "s/^author .*/&,&/" <basis | tr , \\n >multiple-authors &&
-	new=$(git hash-object -t commit -w --stdin <multiple-authors) &&
+	new=$(git hash-object --literally -t commit -w --stdin <multiple-authors) &&
 	test_when_finished "remove_object $new" &&
 	git update-ref refs/heads/bogus "$new" &&
 	test_when_finished "git update-ref -d refs/heads/bogus" &&
@@ -573,7 +573,7 @@ test_expect_success 'fsck notices blob entry pointing to null sha1' '
 	(git init null-blob &&
 	 cd null-blob &&
 	 sha=$(printf "100644 file$_bz$_bzoid" |
-	       git hash-object -w --stdin -t tree) &&
+	       git hash-object --literally -w --stdin -t tree) &&
 	  git fsck 2>out &&
 	  test_i18ngrep "warning.*null sha1" out
 	)
@@ -583,7 +583,7 @@ test_expect_success 'fsck notices submodule entry pointing to null sha1' '
 	(git init null-commit &&
 	 cd null-commit &&
 	 sha=$(printf "160000 submodule$_bz$_bzoid" |
-	       git hash-object -w --stdin -t tree) &&
+	       git hash-object --literally -w --stdin -t tree) &&
 	  git fsck 2>out &&
 	  test_i18ngrep "warning.*null sha1" out
 	)
@@ -648,7 +648,7 @@ test_expect_success 'NUL in commit' '
 		git commit --allow-empty -m "initial commitQNUL after message" &&
 		git cat-file commit HEAD >original &&
 		q_to_nul <original >munged &&
-		git hash-object -w -t commit --stdin <munged >name &&
+		git hash-object --literally -w -t commit --stdin <munged >name &&
 		git branch bad $(cat name) &&
 
 		test_must_fail git -c fsck.nulInCommit=error fsck 2>warn.1 &&
@@ -794,8 +794,8 @@ test_expect_success 'fsck errors in packed objects' '
 	git cat-file commit HEAD >basis &&
 	sed "s/</one/" basis >one &&
 	sed "s/</foo/" basis >two &&
-	one=$(git hash-object -t commit -w one) &&
-	two=$(git hash-object -t commit -w two) &&
+	one=$(git hash-object --literally -t commit -w one) &&
+	two=$(git hash-object --literally -t commit -w two) &&
 	pack=$(
 		{
 			echo $one &&
diff --git a/t/t4054-diff-bogus-tree.sh b/t/t4054-diff-bogus-tree.sh
index 294fb55313..05c88f8cdf 100755
--- a/t/t4054-diff-bogus-tree.sh
+++ b/t/t4054-diff-bogus-tree.sh
@@ -10,7 +10,7 @@ test_expect_success 'create bogus tree' '
 	bogus_tree=$(
 		printf "100644 fooQ$name" |
 		q_to_nul |
-		git hash-object -w --stdin -t tree
+		git hash-object --literally -w --stdin -t tree
 	)
 '
 
diff --git a/t/t4058-diff-duplicates.sh b/t/t4058-diff-duplicates.sh
index 54614b814d..2501c89c1c 100755
--- a/t/t4058-diff-duplicates.sh
+++ b/t/t4058-diff-duplicates.sh
@@ -29,7 +29,7 @@ make_tree () {
 		make_tree_entry "$1" "$2" "$3"
 		shift; shift; shift
 	done |
-	git hash-object -w -t tree --stdin
+	git hash-object --literally -w -t tree --stdin
 }
 
 # this is kind of a convoluted setup, but matches
diff --git a/t/t4212-log-corrupt.sh b/t/t4212-log-corrupt.sh
index 30a219894b..e89e1f54b6 100755
--- a/t/t4212-log-corrupt.sh
+++ b/t/t4212-log-corrupt.sh
@@ -10,7 +10,7 @@ test_expect_success 'setup' '
 
 	git cat-file commit HEAD |
 	sed "/^author /s/>/>-<>/" >broken_email.commit &&
-	git hash-object -w -t commit broken_email.commit >broken_email.hash &&
+	git hash-object --literally -w -t commit broken_email.commit >broken_email.hash &&
 	git update-ref refs/heads/broken_email $(cat broken_email.hash)
 '
 
@@ -46,7 +46,7 @@ test_expect_success 'git log --format with broken author email' '
 munge_author_date () {
 	git cat-file commit "$1" >commit.orig &&
 	sed "s/^\(author .*>\) [0-9]*/\1 $2/" <commit.orig >commit.munge &&
-	git hash-object -w -t commit commit.munge
+	git hash-object --literally -w -t commit commit.munge
 }
 
 test_expect_success 'unparsable dates produce sentinel value' '
diff --git a/t/t5302-pack-index.sh b/t/t5302-pack-index.sh
index b0095ab41d..59e9e77223 100755
--- a/t/t5302-pack-index.sh
+++ b/t/t5302-pack-index.sh
@@ -263,7 +263,7 @@ tag guten tag
 This is an invalid tag.
 EOF
 
-	tag=$(git hash-object -t tag -w --stdin <wrong-tag) &&
+	tag=$(git hash-object -t tag -w --stdin --literally <wrong-tag) &&
 	pack1=$(echo $tag $sha | git pack-objects tag-test) &&
 	echo remove tag object &&
 	thirtyeight=${tag#??} &&
diff --git a/t/t5504-fetch-receive-strict.sh b/t/t5504-fetch-receive-strict.sh
index ac4099ca89..88d3c56750 100755
--- a/t/t5504-fetch-receive-strict.sh
+++ b/t/t5504-fetch-receive-strict.sh
@@ -138,7 +138,7 @@ This commit object intentionally broken
 EOF
 
 test_expect_success 'setup bogus commit' '
-	commit="$(git hash-object -t commit -w --stdin <bogus-commit)"
+	commit="$(git hash-object --literally -t commit -w --stdin <bogus-commit)"
 '
 
 test_expect_success 'fsck with no skipList input' '
diff --git a/t/t5702-protocol-v2.sh b/t/t5702-protocol-v2.sh
index b33cd4afca..e4db7513f4 100755
--- a/t/t5702-protocol-v2.sh
+++ b/t/t5702-protocol-v2.sh
@@ -1114,7 +1114,7 @@ test_expect_success 'packfile-uri with transfer.fsckobjects fails on bad object'
 
 	This commit object intentionally broken
 	EOF
-	BOGUS=$(git -C "$P" hash-object -t commit -w --stdin <bogus-commit) &&
+	BOGUS=$(git -C "$P" hash-object -t commit -w --stdin --literally <bogus-commit) &&
 	git -C "$P" branch bogus-branch "$BOGUS" &&
 
 	echo my-blob >"$P/my-blob" &&
diff --git a/t/t6300-for-each-ref.sh b/t/t6300-for-each-ref.sh
index 2ae1fc721b..c466fd989f 100755
--- a/t/t6300-for-each-ref.sh
+++ b/t/t6300-for-each-ref.sh
@@ -606,7 +606,7 @@ test_expect_success 'create tag without tagger' '
 	git tag -a -m "Broken tag" taggerless &&
 	git tag -f taggerless $(git cat-file tag taggerless |
 		sed -e "/^tagger /d" |
-		git hash-object --stdin -w -t tag)
+		git hash-object --literally --stdin -w -t tag)
 '
 
 test_atom refs/tags/taggerless type 'commit'
diff --git a/t/t7509-commit-authorship.sh b/t/t7509-commit-authorship.sh
index 21c668f75e..5d890949f7 100755
--- a/t/t7509-commit-authorship.sh
+++ b/t/t7509-commit-authorship.sh
@@ -105,7 +105,7 @@ test_expect_success '--amend option with empty author' '
 test_expect_success '--amend option with missing author' '
 	git cat-file commit Initial >tmp &&
 	sed "s/author [^<]* </author </" tmp >malformed &&
-	sha=$(git hash-object -t commit -w malformed) &&
+	sha=$(git hash-object --literally -t commit -w malformed) &&
 	test_when_finished "remove_object $sha" &&
 	git checkout $sha &&
 	test_when_finished "git checkout Initial" &&
diff --git a/t/t7510-signed-commit.sh b/t/t7510-signed-commit.sh
index 8593b7e3cb..bc7a31ba3e 100755
--- a/t/t7510-signed-commit.sh
+++ b/t/t7510-signed-commit.sh
@@ -202,7 +202,7 @@ test_expect_success GPG 'detect fudged signature with NUL' '
 	git cat-file commit seventh-signed >raw &&
 	cat raw >forged2 &&
 	echo Qwik | tr "Q" "\000" >>forged2 &&
-	git hash-object -w -t commit forged2 >forged2.commit &&
+	git hash-object --literally -w -t commit forged2 >forged2.commit &&
 	test_must_fail git verify-commit $(cat forged2.commit) &&
 	git show --pretty=short --show-signature $(cat forged2.commit) >actual2 &&
 	grep "BAD signature from" actual2 &&
diff --git a/t/t7528-signed-commit-ssh.sh b/t/t7528-signed-commit-ssh.sh
index f47e995179..065f780636 100755
--- a/t/t7528-signed-commit-ssh.sh
+++ b/t/t7528-signed-commit-ssh.sh
@@ -270,7 +270,7 @@ test_expect_success GPGSSH 'detect fudged signature with NUL' '
 	git cat-file commit seventh-signed >raw &&
 	cat raw >forged2 &&
 	echo Qwik | tr "Q" "\000" >>forged2 &&
-	git hash-object -w -t commit forged2 >forged2.commit &&
+	git hash-object --literally -w -t commit forged2 >forged2.commit &&
 	test_must_fail git verify-commit $(cat forged2.commit) &&
 	git show --pretty=short --show-signature $(cat forged2.commit) >actual2 &&
 	grep "${GPGSSH_BAD_SIGNATURE}" actual2 &&
diff --git a/t/t8003-blame-corner-cases.sh b/t/t8003-blame-corner-cases.sh
index d751d48b7d..8bcd39e81b 100755
--- a/t/t8003-blame-corner-cases.sh
+++ b/t/t8003-blame-corner-cases.sh
@@ -201,7 +201,7 @@ committer David Reiss <dreiss@facebook.com> 1234567890 +0000
 
 some message
 EOF
-  COMMIT=$(git hash-object -t commit -w badcommit) &&
+  COMMIT=$(git hash-object --literally -t commit -w badcommit) &&
   git --no-pager blame $COMMIT -- uno >/dev/null
 '
 
diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh
index ff21a12ee6..26c25c0eb2 100755
--- a/t/t9350-fast-export.sh
+++ b/t/t9350-fast-export.sh
@@ -373,7 +373,7 @@ EOF
 
 test_expect_success 'cope with tagger-less tags' '
 
-	TAG=$(git hash-object -t tag -w tag-content) &&
+	TAG=$(git hash-object --literally -t tag -w tag-content) &&
 	git update-ref refs/tags/sonnenschein $TAG &&
 	git fast-export -C -C --signed-tags=strip --all > output &&
 	test $(grep -c "^tag " output) = 4 &&

From patchwork Wed Jan 18 20:43:53 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13107079
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 9674AC38147
	for <git@archiver.kernel.org>; Wed, 18 Jan 2023 20:44:08 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230120AbjARUoF (ORCPT <rfc822;git@archiver.kernel.org>);
        Wed, 18 Jan 2023 15:44:05 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:49664 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229907AbjARUn6 (ORCPT <rfc822;git@vger.kernel.org>);
        Wed, 18 Jan 2023 15:43:58 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 537E45F3B1
        for <git@vger.kernel.org>; Wed, 18 Jan 2023 12:43:55 -0800 (PST)
Received: (qmail 3369 invoked by uid 109); 18 Jan 2023 20:43:54 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Wed, 18 Jan 2023 20:43:54 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 24995 invoked by uid 111); 18 Jan 2023 20:43:56 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Wed, 18 Jan 2023 15:43:56 -0500
Authentication-Results: peff.net; auth=none
Date: Wed, 18 Jan 2023 15:43:53 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: =?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmo=?=
	=?utf-8?b?w7Zyw7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 5/6] fsck: provide a function to fsck buffer without object
 struct
Message-ID: <Y8haCbAQIV9s/95l@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

The fsck code has been slowly moving away from requiring an object
struct in commits like 103fb6d43b (fsck: accept an oid instead of a
"struct tag" for fsck_tag(), 2019-10-18), c5b4269b57 (fsck: accept an
oid instead of a "struct commit" for fsck_commit(), 2019-10-18), etc.

However, the only external interface that fsck.c provides is
fsck_object(), which requires an object struct, then promptly discards
everything except its oid and type. Let's factor out the post-discard
part of that function as fsck_buffer(), leaving fsck_object() as a thin
wrapper around it. That will provide more flexibility for callers which
may not have a struct.

Signed-off-by: Jeff King <peff@peff.net>
---
This is obviously preparation for the next patch. But I suspect it could
be used elsewhere, too. Regular fsck wants object structs anyway to hold
flags, I think, but index-pack could probably save some memory and
effort by avoiding them. I didn't look too closely, as it's all out of
scope for this series.

 fsck.c | 29 ++++++++++++++++++-----------
 fsck.h |  8 ++++++++
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/fsck.c b/fsck.c
index 47eaeedd70..c2c8facd2d 100644
--- a/fsck.c
+++ b/fsck.c
@@ -1237,19 +1237,26 @@ int fsck_object(struct object *obj, void *data, unsigned long size,
 	if (!obj)
 		return report(options, NULL, OBJ_NONE, FSCK_MSG_BAD_OBJECT_SHA1, "no valid object to fsck");
 
-	if (obj->type == OBJ_BLOB)
-		return fsck_blob(&obj->oid, data, size, options);
-	if (obj->type == OBJ_TREE)
-		return fsck_tree(&obj->oid, data, size, options);
-	if (obj->type == OBJ_COMMIT)
-		return fsck_commit(&obj->oid, data, size, options);
-	if (obj->type == OBJ_TAG)
-		return fsck_tag(&obj->oid, data, size, options);
-
-	return report(options, &obj->oid, obj->type,
+	return fsck_buffer(&obj->oid, obj->type, data, size, options);
+}
+
+int fsck_buffer(const struct object_id *oid, enum object_type type,
+		void *data, unsigned long size,
+		struct fsck_options *options)
+{
+	if (type == OBJ_BLOB)
+		return fsck_blob(oid, data, size, options);
+	if (type == OBJ_TREE)
+		return fsck_tree(oid, data, size, options);
+	if (type == OBJ_COMMIT)
+		return fsck_commit(oid, data, size, options);
+	if (type == OBJ_TAG)
+		return fsck_tag(oid, data, size, options);
+
+	return report(options, oid, type,
 		      FSCK_MSG_UNKNOWN_TYPE,
 		      "unknown type '%d' (internal fsck error)",
-		      obj->type);
+		      type);
 }
 
 int fsck_error_function(struct fsck_options *o,
diff --git a/fsck.h b/fsck.h
index fcecf4101c..668330880e 100644
--- a/fsck.h
+++ b/fsck.h
@@ -183,6 +183,14 @@ int fsck_walk(struct object *obj, void *data, struct fsck_options *options);
 int fsck_object(struct object *obj, void *data, unsigned long size,
 	struct fsck_options *options);
 
+/*
+ * Same as fsck_object(), but for when the caller doesn't have an object
+ * struct.
+ */
+int fsck_buffer(const struct object_id *oid, enum object_type,
+		void *data, unsigned long size,
+		struct fsck_options *options);
+
 /*
  * fsck a tag, and pass info about it back to the caller. This is
  * exposed fsck_object() internals for git-mktag(1).

From patchwork Wed Jan 18 20:44:12 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13107080
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id D04B8C38159
	for <git@archiver.kernel.org>; Wed, 18 Jan 2023 20:44:28 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230049AbjARUo2 (ORCPT <rfc822;git@archiver.kernel.org>);
        Wed, 18 Jan 2023 15:44:28 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:50154 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229942AbjARUoW (ORCPT <rfc822;git@vger.kernel.org>);
        Wed, 18 Jan 2023 15:44:22 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B1DF65FD6B
        for <git@vger.kernel.org>; Wed, 18 Jan 2023 12:44:13 -0800 (PST)
Received: (qmail 3380 invoked by uid 109); 18 Jan 2023 20:44:13 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Wed, 18 Jan 2023 20:44:13 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 25002 invoked by uid 111); 18 Jan 2023 20:44:14 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Wed, 18 Jan 2023 15:44:14 -0500
Authentication-Results: peff.net; auth=none
Date: Wed, 18 Jan 2023 15:44:12 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: =?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmo=?=
	=?utf-8?b?w7Zyw7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 6/6] hash-object: use fsck for object checks
Message-ID: <Y8haHL9xIWntSm0/@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

Since c879daa237 (Make hash-object more robust against malformed
objects, 2011-02-05), we've done some rudimentary checks against objects
we're about to write by running them through our usual parsers for
trees, commits, and tags.

These parsers catch some problems, but they are not nearly as careful as
the fsck functions (which make sense; the parsers are designed to be
fast and forgiving, bailing only when the input is unintelligible). We
are better off doing the more thorough fsck checks when writing objects.
Doing so at write time is much better than writing garbage only to find
out later (after building more history atop it!) that fsck complains
about it, or hosts with transfer.fsckObjects reject it.

This is obviously going to be a user-visible behavior change, and the
test changes earlier in this series show the scope of the impact. But
I'd argue that this is OK:

  - the documentation for hash-object is already vague about which
    checks we might do, saying that --literally will allow "any
    garbage[...] which might not otherwise pass standard object parsing
    or git-fsck checks". So we are already covered under the documented
    behavior.

  - users don't generally run hash-object anyway. There are a lot of
    spots in the tests that needed to be updated because creating
    garbage objects is something that Git's tests disproportionately do.

  - it's hard to imagine anyone thinking the new behavior is worse. Any
    object we reject would be a potential problem down the road for the
    user. And if they really want to create garbage, --literally is
    already the escape hatch they need.

Note that the change here is actually in index_mem(), which handles the
HASH_FORMAT_CHECK flag passed by hash-object. That flag is also used by
"git-replace --edit" to sanity-check the result. Covering that with more
thorough checks likewise seems like a good thing.

Besides being more thorough, there are a few other bonuses:

  - we get rid of some questionable stack allocations of object structs.
    These don't seem to currently cause any problems in practice, but
    they subtly violate some of the assumptions made by the rest of the
    code (e.g., the "struct commit" we put on the stack and
    zero-initialize will not have a proper index from
    alloc_comit_index().

  - likewise, those parsed object structs are the source of some small
    memory leaks

  - the resulting messages are much better. For example:

      [before]
      $ echo 'tree 123' | git hash-object -t commit --stdin
      error: bogus commit object 0000000000000000000000000000000000000000
      fatal: corrupt commit

      [after]
      $ echo 'tree 123' | git.compile hash-object -t commit --stdin
      error: object fails fsck: badTreeSha1: invalid 'tree' line format - bad sha1
      fatal: refusing to create malformed object

Signed-off-by: Jeff King <peff@peff.net>
---
 object-file.c          | 55 ++++++++++++++++++------------------------
 t/t1007-hash-object.sh | 11 +++++++++
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/object-file.c b/object-file.c
index 80a0cd3b35..5c96384803 100644
--- a/object-file.c
+++ b/object-file.c
@@ -33,6 +33,7 @@
 #include "object-store.h"
 #include "promisor-remote.h"
 #include "submodule.h"
+#include "fsck.h"
 
 /* The maximum size for an object header. */
 #define MAX_HEADER_LEN 32
@@ -2298,32 +2299,21 @@ int repo_has_object_file(struct repository *r,
 	return repo_has_object_file_with_flags(r, oid, 0);
 }
 
-static void check_tree(const void *buf, size_t size)
-{
-	struct tree_desc desc;
-	struct name_entry entry;
-
-	init_tree_desc(&desc, buf, size);
-	while (tree_entry(&desc, &entry))
-		/* do nothing
-		 * tree_entry() will die() on malformed entries */
-		;
-}
-
-static void check_commit(const void *buf, size_t size)
-{
-	struct commit c;
-	memset(&c, 0, sizeof(c));
-	if (parse_commit_buffer(the_repository, &c, buf, size, 0))
-		die(_("corrupt commit"));
-}
-
-static void check_tag(const void *buf, size_t size)
-{
-	struct tag t;
-	memset(&t, 0, sizeof(t));
-	if (parse_tag_buffer(the_repository, &t, buf, size))
-		die(_("corrupt tag"));
+/*
+ * We can't use the normal fsck_error_function() for index_mem(),
+ * because we don't yet have a valid oid for it to report. Instead,
+ * report the minimal fsck error here, and rely on the caller to
+ * give more context.
+ */
+static int hash_format_check_report(struct fsck_options *opts,
+				     const struct object_id *oid,
+				     enum object_type object_type,
+				     enum fsck_msg_type msg_type,
+				     enum fsck_msg_id msg_id,
+				     const char *message)
+{
+	error(_("object fails fsck: %s"), message);
+	return 1;
 }
 
 static int index_mem(struct index_state *istate,
@@ -2350,12 +2340,13 @@ static int index_mem(struct index_state *istate,
 		}
 	}
 	if (flags & HASH_FORMAT_CHECK) {
-		if (type == OBJ_TREE)
-			check_tree(buf, size);
-		if (type == OBJ_COMMIT)
-			check_commit(buf, size);
-		if (type == OBJ_TAG)
-			check_tag(buf, size);
+		struct fsck_options opts = FSCK_OPTIONS_DEFAULT;
+
+		opts.strict = 1;
+		opts.error_func = hash_format_check_report;
+		if (fsck_buffer(null_oid(), type, buf, size, &opts))
+			die(_("refusing to create malformed object"));
+		fsck_finish(&opts);
 	}
 
 	if (write_object)
diff --git a/t/t1007-hash-object.sh b/t/t1007-hash-object.sh
index 2d2148d8fa..ac3d173767 100755
--- a/t/t1007-hash-object.sh
+++ b/t/t1007-hash-object.sh
@@ -222,6 +222,17 @@ test_expect_success 'empty filename in tree' '
 	grep "empty filename in tree entry" err
 '
 
+test_expect_success 'duplicate filename in tree' '
+	hex_oid=$(echo foo | git hash-object --stdin -w) &&
+	bin_oid=$(echo $hex_oid | hex2oct) &&
+	{
+		printf "100644 file\0$bin_oid" &&
+		printf "100644 file\0$bin_oid"
+	} >tree-with-duplicate-filename &&
+	test_must_fail git hash-object -t tree tree-with-duplicate-filename 2>err &&
+	grep "duplicateEntries" err
+'
+
 test_expect_success 'corrupt commit' '
 	test_must_fail git hash-object -t commit --stdin </dev/null
 '

From patchwork Thu Jan 19 23:13:29 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Patchwork-Submitter: Jeff King <peff@peff.net>
X-Patchwork-Id: 13108950
Return-Path: <git-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 8CFBAC46467
	for <git@archiver.kernel.org>; Thu, 19 Jan 2023 23:15:38 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230004AbjASXPf (ORCPT <rfc822;git@archiver.kernel.org>);
        Thu, 19 Jan 2023 18:15:35 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:60392 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230029AbjASXPM (ORCPT <rfc822;git@vger.kernel.org>);
        Thu, 19 Jan 2023 18:15:12 -0500
Received: from cloud.peff.net (cloud.peff.net [104.130.231.41])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id E3E7A12F10
        for <git@vger.kernel.org>; Thu, 19 Jan 2023 15:13:30 -0800 (PST)
Received: (qmail 17164 invoked by uid 109); 19 Jan 2023 23:13:30 -0000
Received: from Unknown (HELO peff.net) (10.0.1.2)
 by cloud.peff.net (qpsmtpd/0.94) with ESMTP; Thu, 19 Jan 2023 23:13:30 +0000
Authentication-Results: cloud.peff.net; auth=none
Received: (qmail 4774 invoked by uid 111); 19 Jan 2023 23:13:32 -0000
Received: from coredump.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.2)
 by peff.net (qpsmtpd/0.94) with (TLS_AES_256_GCM_SHA384 encrypted) ESMTPS;
 Thu, 19 Jan 2023 18:13:32 -0500
Authentication-Results: peff.net; auth=none
Date: Thu, 19 Jan 2023 18:13:29 -0500
From: Jeff King <peff@peff.net>
To: git@vger.kernel.org
Cc: Taylor Blau <me@ttaylorr.com>, Junio C Hamano <gitster@pobox.com>,
	=?utf-8?b?UmVuw6k=?= Scharfe <l.s.r@web.de>, =?utf-8?b?w4Z2YXIgQXJuZmrDtnI=?=
	=?utf-8?b?w7A=?= Bjarmason <avarab@gmail.com>
Subject: [PATCH 7/6] fsck: do not assume NUL-termination of buffers
Message-ID: <Y8nOmZHv7T843uBn@coredump.intra.peff.net>
References: <Y8hX+pIZUKXsyYj5@coredump.intra.peff.net>
 <Y8ifa7hyqxSbL92U@coredump.intra.peff.net>
MIME-Version: 1.0
Content-Disposition: inline
In-Reply-To: <Y8ifa7hyqxSbL92U@coredump.intra.peff.net>
Precedence: bulk
List-ID: <git.vger.kernel.org>
X-Mailing-List: git@vger.kernel.org

On Wed, Jan 18, 2023 at 08:39:55PM -0500, Jeff King wrote:

> On Wed, Jan 18, 2023 at 03:35:06PM -0500, Jeff King wrote:
> 
> > The other option is having the fsck code avoid looking past the size it
> > was given. I think the intent is that this should work, from commits
> > like 4d0d89755e (Make sure fsck_commit_buffer() does not run out of the
> > buffer, 2014-09-11). We do use skip_prefix() and parse_oid_hex(), which
> > won't respect the size, but I think[1] that's OK because we'll have
> > parsed up to the end-of-header beforehand (and those functions would
> > never match past there).
> > 
> > Which would mean that 9a1a3a4d4c (mktag: allow omitting the header/body
> > \n separator, 2021-01-05) and acf9de4c94 (mktag: use fsck instead of
> > custom verify_tag(), 2021-01-05) were buggy, and we can just fix them.
> > 
> > [1] But I said "I think" above because it can get pretty subtle. There's
> >     some more discussion in this thread:
> > 
> >       https://lore.kernel.org/git/20150625155128.C3E9738005C@gemini.denx.de/
> > 
> >     but I haven't yet convinced myself it's safe. This is exactly the
> >     kind of analysis I wish I had the power to nerd-snipe René into.
> 
> I poked at this a bit more, and it definitely isn't safe.

So here's the result of my digging on this. The good news is that this
one commit on top of the rest of the series should make everything safe.
I'm sorry the explanation is a bit long, but I wanted to capture a bit
of the history, the subtle assumptions, and how I approached analyzing
and fixing it.

There are a few paths forward here:

  - apply this on top of the earlier 6 patches. This is the simplest
    thing, and my preference. It does mean that t3800 temporarily has a
    read-one-char-past-buffer bug that is detected by ASan after patch 6
    but before this patch is applied.

  - put this fix first. Unfortunately the tests rely on having patch 6
    in order to be able to feed a non-NUL-terminated buffer to fsck.
    Options there are:

      - split this patch into two: code fix goes at the beginning of the
        series, and then the tests come at the end. The downside here is
        that it's very hard to run the tests on the pre-fixed code to
        verify that they are finding problems (you'd have to revert the
        fix, or re-order patches to get the broken state)

      - introduce a test-helper that lets you feed a buffer to
        fsck_buffer(). That can demonstrate the problem and fix
        independently of any hash-object changes. But it ends up being a
        fair bit of boilerplate, and ultimately we want to test
        hash-object anyway.

  - decide the whole "make fsck work with arbitrary buffers" thing is
    too subtle and error-prone. I don't think this, or else I wouldn't
    have made this patch. But I think it's an argument that can be made
    (and is roughly the approach we decided to take way back in the 2015
    thread linked above). The solution there is to make sure we
    NUL-terminate everything. As I said before, this is tricky because
    of mmap. But we could probably just skip using mmap in index_core()
    for non-blobs (which don't tend to be very big), and then assume
    fsck on individual blobs is safe (it is, because they won't have
    been marked as gitmodules, etc for more detailed scanning).

    I think it could work. I kind of prefer just making the fsck
    functions safe. Even though the way they do left-to-right scanning
    is error-prone, at least the ugliness is contained inside them,
    rather than this "sure, I take a ptr/len combo, but make sure you
    allocate an extra NUL byte!" assumption that currently exists.

Anyway, here's the patch. I'm happy to repost the whole 7-patch series,
too, but since the earlier ones didn't change in my preferred path
forward, this seemed easier for now. ;)

-- >8 --
Subject: [PATCH] fsck: do not assume NUL-termination of buffers

The fsck code operates on an object buffer represented as a pointer/len
combination. However, the parsing of commits and tags is a little bit
loose; we mostly scan left-to-right through the buffer, without checking
whether we've gone past the length we were given.

This has traditionally been OK because the buffers we feed to fsck
always have an extra NUL after the end of the object content, which ends
any left-to-right scan. That has always been true for objects we read
from the odb, and we made it true for incoming index-pack/unpack-objects
checks in a1e920a0a7 (index-pack: terminate object buffers with NUL,
2014-12-08).

However, we recently added an exception: hash-object asks index_fd() to
do fsck checks. That _may_ have an extra NUL (if we read from a pipe
into a strbuf), but it might not (if we read the contents from the
file). Nor can we just teach it to always add a NUL. We may mmap the
on-disk file, which will not have any extra bytes (if it's a multiple of
the page size). Not to mention that this is a rather subtle assumption
for the fsck code to make.

Instead, let's make sure that the fsck parsers don't ever look past the
size of the buffer they've been given. This _almost_ works already,
thanks to earlier work in 4d0d89755e (Make sure fsck_commit_buffer()
does not run out of the buffer, 2014-09-11). The theory there is that we
check up front whether we have the end of header double-newline
separator. And then any left-to-right scanning we do is OK as long as it
stops when it hits that boundary.

However, we later softened that in 84d18c0bcf (fsck: it is OK for a tag
and a commit to lack the body, 2015-06-28), which allows the
double-newline header to be missing, but does require that the header
ends in a newline. That was OK back then, because of the NUL-termination
guarantees (including the one from a1e920a0a7 mentioned above).

Because 84d18c0bcf guarantees that any header line does end in a
newline, we are still OK with most of the left-to-right scanning. We
only need to take care after completing a line, to check that there is
another line (and we didn't run out of buffer).

Most of these checks are just need to check "buffer < buffer_end" (where
buffer is advanced as we parse) before scanning for the next header
line. But here are a few notes:

  - we don't technically need to check for remaining buffer before
    parsing the very first line ("tree" for a commit, or "object" for a
    tag), because verify_headers() rejects a totally empty buffer. But
    we'll do so in the name of consistency and defensiveness.

  - there are some calls to strchr('\n'). These are actually OK by the
    "the final header line must end in a newline" guarantee from
    verify_headers(). They will always find that rather than run off the
    end of the buffer. Curiously, they do check for a NULL return and
    complain, but I believe that condition can never be reached.

    However, I converted them to use memchr() with a proper size and
    retained the NULL checks. Using memchr() is not much longer and
    makes it more obvious what is going on. Likewise, retaining the NULL
    checks serves as a defensive measure in case my analysis is wrong.

  - commit 9a1a3a4d4c (mktag: allow omitting the header/body \n
    separator, 2021-01-05), does check for the end-of-buffer condition,
    but does so with "!*buffer", relying explicitly on the NUL
    termination. We can accomplish the same thing with a pointer
    comparison. I also folded it into the follow-on conditional that
    checks the contents of the buffer, for consistency with the other
    checks.

  - fsck_ident() uses parse_timestamp(), which is based on strtoumax().
    That function will happily skip past leading whitespace, including
    newlines, which makes it a risk. We can fix this by scanning to the
    first digit ourselves, and then using parse_timestamp() to do the
    actual numeric conversion.

    Note that as a side effect this fixes the fact that we missed
    zero-padded timestamps like "<email>   0123" (whereas we would
    complain about "<email> 0123"). I doubt anybody cares, but I
    mention it here for completeness.

  - fsck_tree() does not need any modifications. It relies on
    decode_tree_entry() to do the actual parsing, and that function
    checks both that there are enough bytes in the buffer to represent
    an entry, and that there is a NUL at the appropriate spot (one
    hash-length from the end; this may not be the NUL for the entry we
    are parsing, but we know that in the worst case, everything from our
    current position to that NUL is a filename, so we won't run out of
    bytes).

In addition to fixing the code itself, we'd like to make sure our rather
subtle assumptions are not violated in the future. So this patch does
two more things:

  - add comments around verify_headers() documenting the link between
    what it checks and the memory safety of the callers. I don't expect
    this code to be modified frequently, but this may help somebody from
    accidentally breaking things.

  - add a thorough set of tests covering truncations at various key
    spots (e.g., for a "tree $oid" line, in the middle of the word
    "tree", right after it, after the space, in the middle of the $oid,
    and right at the end of the line. Most of these are fine already (it
    is only truncating right at the end of the line that is currently
    broken). And some of them are not even possible with the current
    code (we parse "tree " as a unit, so truncating before the space is
    equivalent). But I aimed here to consider the code a black box and
    look for any truncations that would be a problem for a left-to-right
    parser.

Signed-off-by: Jeff King <peff@peff.net>
---
 fsck.c                 |  67 ++++++++++++++++----
 t/t1451-fsck-buffer.sh | 140 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 194 insertions(+), 13 deletions(-)
 create mode 100755 t/t1451-fsck-buffer.sh

diff --git a/fsck.c b/fsck.c
index c2c8facd2d..2b18717ee8 100644
--- a/fsck.c
+++ b/fsck.c
@@ -748,6 +748,23 @@ static int fsck_tree(const struct object_id *tree_oid,
 	return retval;
 }
 
+/*
+ * Confirm that the headers of a commit or tag object end in a reasonable way,
+ * either with the usual "\n\n" separator, or at least with a trailing newline
+ * on the final header line.
+ *
+ * This property is important for the memory safety of our callers. It allows
+ * them to scan the buffer linewise without constantly checking the remaining
+ * size as long as:
+ *
+ *   - they check that there are bytes left in the buffer at the start of any
+ *     line (i.e., that the last newline they saw was not the final one we
+ *     found here)
+ *
+ *   - any intra-line scanning they do will stop at a newline, which will worst
+ *     case hit the newline we found here as the end-of-header. This makes it
+ *     OK for them to use helpers like parse_oid_hex(), or even skip_prefix().
+ */
 static int verify_headers(const void *data, unsigned long size,
 			  const struct object_id *oid, enum object_type type,
 			  struct fsck_options *options)
@@ -808,6 +825,20 @@ static int fsck_ident(const char **ident,
 	if (*p != ' ')
 		return report(options, oid, type, FSCK_MSG_MISSING_SPACE_BEFORE_DATE, "invalid author/committer line - missing space before date");
 	p++;
+	/*
+	 * Our timestamp parser is based on the C strto*() functions, which
+	 * will happily eat whitespace, including the newline that is supposed
+	 * to prevent us walking past the end of the buffer. So do our own
+	 * scan, skipping linear whitespace but not newlines, and then
+	 * confirming we found a digit. We _could_ be even more strict here,
+	 * as we really expect only a single space, but since we have
+	 * traditionally allowed extra whitespace, we'll continue to do so.
+	 */
+	while (*p == ' ' || *p == '\t')
+		p++;
+	if (!isdigit(*p))
+		return report(options, oid, type, FSCK_MSG_BAD_DATE,
+			      "invalid author/committer line - bad date");
 	if (*p == '0' && p[1] != ' ')
 		return report(options, oid, type, FSCK_MSG_ZERO_PADDED_DATE, "invalid author/committer line - zero-padded date");
 	if (date_overflows(parse_timestamp(p, &end, 10)))
@@ -834,20 +865,26 @@ static int fsck_commit(const struct object_id *oid,
 	unsigned author_count;
 	int err;
 	const char *buffer_begin = buffer;
+	const char *buffer_end = buffer + size;
 	const char *p;
 
+	/*
+	 * We _must_ stop parsing immediately if this reports failure, as the
+	 * memory safety of the rest of the function depends on it. See the
+	 * comment above the definition of verify_headers() for more details.
+	 */
 	if (verify_headers(buffer, size, oid, OBJ_COMMIT, options))
 		return -1;
 
-	if (!skip_prefix(buffer, "tree ", &buffer))
+	if (buffer >= buffer_end || !skip_prefix(buffer, "tree ", &buffer))
 		return report(options, oid, OBJ_COMMIT, FSCK_MSG_MISSING_TREE, "invalid format - expected 'tree' line");
 	if (parse_oid_hex(buffer, &tree_oid, &p) || *p != '\n') {
 		err = report(options, oid, OBJ_COMMIT, FSCK_MSG_BAD_TREE_SHA1, "invalid 'tree' line format - bad sha1");
 		if (err)
 			return err;
 	}
 	buffer = p + 1;
-	while (skip_prefix(buffer, "parent ", &buffer)) {
+	while (buffer < buffer_end && skip_prefix(buffer, "parent ", &buffer)) {
 		if (parse_oid_hex(buffer, &parent_oid, &p) || *p != '\n') {
 			err = report(options, oid, OBJ_COMMIT, FSCK_MSG_BAD_PARENT_SHA1, "invalid 'parent' line format - bad sha1");
 			if (err)
@@ -856,7 +893,7 @@ static int fsck_commit(const struct object_id *oid,
 		buffer = p + 1;
 	}
 	author_count = 0;
-	while (skip_prefix(buffer, "author ", &buffer)) {
+	while (buffer < buffer_end && skip_prefix(buffer, "author ", &buffer)) {
 		author_count++;
 		err = fsck_ident(&buffer, oid, OBJ_COMMIT, options);
 		if (err)
@@ -868,7 +905,7 @@ static int fsck_commit(const struct object_id *oid,
 		err = report(options, oid, OBJ_COMMIT, FSCK_MSG_MULTIPLE_AUTHORS, "invalid format - multiple 'author' lines");
 	if (err)
 		return err;
-	if (!skip_prefix(buffer, "committer ", &buffer))
+	if (buffer >= buffer_end || !skip_prefix(buffer, "committer ", &buffer))
 		return report(options, oid, OBJ_COMMIT, FSCK_MSG_MISSING_COMMITTER, "invalid format - expected 'committer' line");
 	err = fsck_ident(&buffer, oid, OBJ_COMMIT, options);
 	if (err)
@@ -899,13 +936,19 @@ int fsck_tag_standalone(const struct object_id *oid, const char *buffer,
 	int ret = 0;
 	char *eol;
 	struct strbuf sb = STRBUF_INIT;
+	const char *buffer_end = buffer + size;
 	const char *p;
 
+	/*
+	 * We _must_ stop parsing immediately if this reports failure, as the
+	 * memory safety of the rest of the function depends on it. See the
+	 * comment above the definition of verify_headers() for more details.
+	 */
 	ret = verify_headers(buffer, size, oid, OBJ_TAG, options);
 	if (ret)
 		goto done;
 
-	if (!skip_prefix(buffer, "object ", &buffer)) {
+	if (buffer >= buffer_end || !skip_prefix(buffer, "object ", &buffer)) {
 		ret = report(options, oid, OBJ_TAG, FSCK_MSG_MISSING_OBJECT, "invalid format - expected 'object' line");
 		goto done;
 	}
@@ -916,11 +959,11 @@ int fsck_tag_standalone(const struct object_id *oid, const char *buffer,
 	}
 	buffer = p + 1;
 
-	if (!skip_prefix(buffer, "type ", &buffer)) {
+	if (buffer >= buffer_end || !skip_prefix(buffer, "type ", &buffer)) {
 		ret = report(options, oid, OBJ_TAG, FSCK_MSG_MISSING_TYPE_ENTRY, "invalid format - expected 'type' line");
 		goto done;
 	}
-	eol = strchr(buffer, '\n');
+	eol = memchr(buffer, '\n', buffer_end - buffer);
 	if (!eol) {
 		ret = report(options, oid, OBJ_TAG, FSCK_MSG_MISSING_TYPE, "invalid format - unexpected end after 'type' line");
 		goto done;
@@ -932,11 +975,11 @@ int fsck_tag_standalone(const struct object_id *oid, const char *buffer,
 		goto done;
 	buffer = eol + 1;
 
-	if (!skip_prefix(buffer, "tag ", &buffer)) {
+	if (buffer >= buffer_end || !skip_prefix(buffer, "tag ", &buffer)) {
 		ret = report(options, oid, OBJ_TAG, FSCK_MSG_MISSING_TAG_ENTRY, "invalid format - expected 'tag' line");
 		goto done;
 	}
-	eol = strchr(buffer, '\n');
+	eol = memchr(buffer, '\n', buffer_end - buffer);
 	if (!eol) {
 		ret = report(options, oid, OBJ_TAG, FSCK_MSG_MISSING_TAG, "invalid format - unexpected end after 'type' line");
 		goto done;
@@ -952,18 +995,16 @@ int fsck_tag_standalone(const struct object_id *oid, const char *buffer,
 	}
 	buffer = eol + 1;
 
-	if (!skip_prefix(buffer, "tagger ", &buffer)) {
+	if (buffer >= buffer_end || !skip_prefix(buffer, "tagger ", &buffer)) {
 		/* early tags do not contain 'tagger' lines; warn only */
 		ret = report(options, oid, OBJ_TAG, FSCK_MSG_MISSING_TAGGER_ENTRY, "invalid format - expected 'tagger' line");
 		if (ret)
 			goto done;
 	}
 	else
 		ret = fsck_ident(&buffer, oid, OBJ_TAG, options);
-	if (!*buffer)
-		goto done;
 
-	if (!starts_with(buffer, "\n")) {
+	if (buffer < buffer_end && !starts_with(buffer, "\n")) {
 		/*
 		 * The verify_headers() check will allow
 		 * e.g. "[...]tagger <tagger>\nsome
diff --git a/t/t1451-fsck-buffer.sh b/t/t1451-fsck-buffer.sh
new file mode 100755
index 0000000000..9ac270abab
--- /dev/null
+++ b/t/t1451-fsck-buffer.sh
@@ -0,0 +1,140 @@
+#!/bin/sh
+
+test_description='fsck on buffers without NUL termination
+
+The goal here is to make sure that the various fsck parsers never look
+past the end of the buffer they are given, even when encountering broken
+or truncated objects.
+
+We have to use "hash-object" for this because most code paths that read objects
+append an extra NUL for safety after the buffer. But hash-object, since it is
+reading straight from a file (and possibly even mmap-ing it) cannot always do
+so.
+
+These tests _might_ catch such overruns in normal use, but should be run with
+ASan or valgrind for more confidence.
+'
+. ./test-lib.sh
+
+# the general idea for tags and commits is to build up the "base" file
+# progressively, and then test new truncations on top of it.
+reset () {
+	test_expect_success 'reset input to empty' '
+		>base
+	'
+}
+
+add () {
+	content="$1"
+	type=${content%% *}
+	test_expect_success "add $type line" '
+		echo "$content" >>base
+	'
+}
+
+check () {
+	type=$1
+	fsck=$2
+	content=$3
+	test_expect_success "truncated $type ($fsck, \"$content\")" '
+		# do not pipe into hash-object here; we want to increase
+		# the chance that it uses a fixed-size buffer or mmap,
+		# and a pipe would be read into a strbuf.
+		{
+			cat base &&
+			echo "$content"
+		} >input &&
+		test_must_fail git hash-object -t "$type" input 2>err &&
+		grep "$fsck" err
+	'
+}
+
+test_expect_success 'create valid objects' '
+	git commit --allow-empty -m foo &&
+	commit=$(git rev-parse --verify HEAD) &&
+	tree=$(git rev-parse --verify HEAD^{tree})
+'
+
+reset
+check commit missingTree ""
+check commit missingTree "tr"
+check commit missingTree "tree"
+check commit badTreeSha1 "tree "
+check commit badTreeSha1 "tree 1234"
+add "tree $tree"
+
+# these expect missingAuthor because "parent" is optional
+check commit missingAuthor ""
+check commit missingAuthor "par"
+check commit missingAuthor "parent"
+check commit badParentSha1 "parent "
+check commit badParentSha1 "parent 1234"
+add "parent $commit"
+
+check commit missingAuthor ""
+check commit missingAuthor "au"
+check commit missingAuthor "author"
+ident_checks () {
+	check $1 missingEmail "$2 "
+	check $1 missingEmail "$2 name"
+	check $1 badEmail "$2 name <"
+	check $1 badEmail "$2 name <email"
+	check $1 missingSpaceBeforeDate "$2 name <email>"
+	check $1 badDate "$2 name <email> "
+	check $1 badDate "$2 name <email> 1234"
+	check $1 badTimezone "$2 name <email> 1234 "
+	check $1 badTimezone "$2 name <email> 1234 +"
+}
+ident_checks commit author
+add "author name <email> 1234 +0000"
+
+check commit missingCommitter ""
+check commit missingCommitter "co"
+check commit missingCommitter "committer"
+ident_checks commit committer
+add "committer name <email> 1234 +0000"
+
+reset
+check tag missingObject ""
+check tag missingObject "obj"
+check tag missingObject "object"
+check tag badObjectSha1 "object "
+check tag badObjectSha1 "object 1234"
+add "object $commit"
+
+check tag missingType ""
+check tag missingType "ty"
+check tag missingType "type"
+check tag badType "type "
+check tag badType "type com"
+add "type commit"
+
+check tag missingTagEntry ""
+check tag missingTagEntry "ta"
+check tag missingTagEntry "tag"
+check tag badTagName "tag "
+add "tag foo"
+
+check tag missingTagger ""
+check tag missingTagger "ta"
+check tag missingTagger "tagger"
+ident_checks tag tagger
+
+# trees are a binary format and can't use our earlier helpers
+test_expect_success 'truncated tree (short hash)' '
+	printf "100644 foo\0\1\1\1\1" >input &&
+	test_must_fail git hash-object -t tree input 2>err &&
+	grep badTree err
+'
+
+test_expect_success 'truncated tree (missing nul)' '
+	# these two things are indistinguishable to the parser. The important
+	# thing about this is example is that there are enough bytes to
+	# make up a hash, and that there is no NUL (and we confirm that the
+	# parser does not walk past the end of the buffer).
+	printf "100644 a long filename, or a hash with missing nul?" >input &&
+	test_must_fail git hash-object -t tree input 2>err &&
+	grep badTree err
+'
+
+test_done