diff mbox series

fsck: detect bare repos in trees and warn

Message ID 20220406232231.47714-1-chooglen@google.com (mailing list archive)
State New, archived
Headers show
Series fsck: detect bare repos in trees and warn | expand

Commit Message

Glen Choo April 6, 2022, 11:22 p.m. UTC
Git tries not to distribute configs in-repo because they are a security
risk. However, an attacker can do exactly this if they embed a bare
repo inside of another repo.

Teach fsck to detect whether a tree object contains a bare repo (as
determined by setup.c) and warn. This will help hosting sites detect and
prevent transmission of such malicious repos.

See [1] for a more in-depth discussion, including future steps and
alternatives.

[1] https://lore.kernel.org/git/kl6lsfqpygsj.fsf@chooglen-macbookpro.roam.corp.google.com/

Signed-off-by: Glen Choo <chooglen@google.com>
---
 fsck.c          | 19 +++++++++++++++++++
 fsck.h          |  1 +
 setup.c         |  4 ++++
 t/t1450-fsck.sh | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 60 insertions(+)


base-commit: 805e0a68082a217f0112db9ee86a022227a9c81b

Comments

Johannes Schindelin April 7, 2022, 12:42 p.m. UTC | #1
Hi Glen,

On Wed, 6 Apr 2022, Glen Choo wrote:

> Git tries not to distribute configs in-repo because they are a security
> risk. However, an attacker can do exactly this if they embed a bare
> repo inside of another repo.
>
> Teach fsck to detect whether a tree object contains a bare repo (as
> determined by setup.c) and warn. This will help hosting sites detect and
> prevent transmission of such malicious repos.
>
> See [1] for a more in-depth discussion, including future steps and
> alternatives.
>
> [1] https://lore.kernel.org/git/kl6lsfqpygsj.fsf@chooglen-macbookpro.roam.corp.google.com/

Out of curiosity: does this new check trigger with
https://github.com/libgit2/libgit2? AFAIR it has embedded repositories
that are used in its test suite. In other words, libgit2 has a legitimate
use case for embedded bare repositories, I believe.

Thank you,
Johannes

>
> Signed-off-by: Glen Choo <chooglen@google.com>
> ---
>  fsck.c          | 19 +++++++++++++++++++
>  fsck.h          |  1 +
>  setup.c         |  4 ++++
>  t/t1450-fsck.sh | 36 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 60 insertions(+)
>
> diff --git a/fsck.c b/fsck.c
> index 3ec500d707..11c11c348a 100644
> --- a/fsck.c
> +++ b/fsck.c
> @@ -573,6 +573,9 @@ static int fsck_tree(const struct object_id *tree_oid,
>  	int has_bad_modes = 0;
>  	int has_dup_entries = 0;
>  	int not_properly_sorted = 0;
> +	int has_head = 0;
> +	int has_refs_entry = 0;
> +	int has_objects_entry = 0;
>  	struct tree_desc desc;
>  	unsigned o_mode;
>  	const char *o_name;
> @@ -602,6 +605,12 @@ static int fsck_tree(const struct object_id *tree_oid,
>  		has_dotdot |= !strcmp(name, "..");
>  		has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name);
>  		has_zero_pad |= *(char *)desc.buffer == '0';
> +		has_head |= !strcasecmp(name, "HEAD")
> +			&& (S_ISLNK(mode) || S_ISREG(mode));
> +		has_refs_entry |= !strcasecmp(name, "refs")
> +			&& (S_ISLNK(mode) || S_ISDIR(mode));
> +		has_objects_entry |= !strcasecmp(name, "objects")
> +			&& (S_ISLNK(mode) || S_ISDIR(mode));
>
>  		if (is_hfs_dotgitmodules(name) || is_ntfs_dotgitmodules(name)) {
>  			if (!S_ISLNK(mode))
> @@ -739,6 +748,16 @@ static int fsck_tree(const struct object_id *tree_oid,
>  		retval += report(options, tree_oid, OBJ_TREE,
>  				 FSCK_MSG_TREE_NOT_SORTED,
>  				 "not properly sorted");
> +	/*
> +	 * Determine if this tree looks like a bare repository according
> +	 * to the rules of setup.c. If those are changed, this should be
> +	 * changed too.
> +	 */
> +	if (has_head && has_refs_entry && has_objects_entry)
> +		retval += report(options, tree_oid, OBJ_TREE,
> +				 FSCK_MSG_EMBEDDED_BARE_REPO,
> +				 "contains bare repository");
> +
>  	return retval;
>  }
>
> diff --git a/fsck.h b/fsck.h
> index d07f7a2459..3f0f73b0f3 100644
> --- a/fsck.h
> +++ b/fsck.h
> @@ -65,6 +65,7 @@ enum fsck_msg_type {
>  	FUNC(NULL_SHA1, WARN) \
>  	FUNC(ZERO_PADDED_FILEMODE, WARN) \
>  	FUNC(NUL_IN_COMMIT, WARN) \
> +	FUNC(EMBEDDED_BARE_REPO, WARN) \
>  	/* infos (reported as warnings, but ignored by default) */ \
>  	FUNC(GITMODULES_PARSE, INFO) \
>  	FUNC(GITIGNORE_SYMLINK, INFO) \
> diff --git a/setup.c b/setup.c
> index 04ce33cdcd..2600548776 100644
> --- a/setup.c
> +++ b/setup.c
> @@ -336,6 +336,10 @@ int get_common_dir_noenv(struct strbuf *sb, const char *gitdir)
>   *  - either a HEAD symlink or a HEAD file that is formatted as
>   *    a proper "ref:", or a regular file HEAD that has a properly
>   *    formatted sha1 object name.
> + *
> + * fsck.c checks for bare repositories in trees using similar rules, but a
> + * duplicated implementation. If these are changed, the correspnding code in
> + * fsck.c should change too.
>   */
>  int is_git_directory(const char *suspect)
>  {
> diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh
> index de50c0ea01..a65827bc03 100755
> --- a/t/t1450-fsck.sh
> +++ b/t/t1450-fsck.sh
> @@ -563,6 +563,42 @@ dot-backslash-case .\\\\.GIT\\\\foobar
>  dotgit-case-backslash .git\\\\foobar
>  EOF
>
> +test_expect_success "fsck notices bare repo" '
> +(
> +	mkdir -p embedded-bare-repo/bare &&
> +	git init embedded-bare-repo &&
> +	(
> +		cd embedded-bare-repo/bare &&
> +		echo content >HEAD &&
> +		mkdir refs/ objects/ &&
> +		echo content >refs/foo &&
> +		echo content >objects/foo &&
> +		git add . &&
> +		git commit -m base &&
> +		bad_tree=$(git rev-parse HEAD:bare) &&
> +		git fsck 2>out &&
> +		test_i18ngrep "warning.*tree $bad_tree: embeddedBareRepo: contains bare repository" out
> +	)
> +)'
> +
> +test_expect_success "fsck notices bare repo with odd casing" '
> +(
> +	mkdir -p embedded-bare-repo-case/bare &&
> +	git init embedded-bare-repo-case &&
> +	(
> +		cd embedded-bare-repo-case/bare &&
> +		echo content >heAD &&
> +		mkdir Refs/ objectS/ &&
> +		echo content >Refs/foo &&
> +		echo content >objectS/foo &&
> +		git add . &&
> +		git commit -m base &&
> +		bad_tree=$(git rev-parse HEAD:bare) &&
> +		git fsck 2>out &&
> +		test_i18ngrep "warning.*tree $bad_tree: embeddedBareRepo: contains bare repository" out
> +	)
> +)'
> +
>  test_expect_success 'fsck allows .??it' '
>  	(
>  		git init not-dotgit &&
>
> base-commit: 805e0a68082a217f0112db9ee86a022227a9c81b
> --
> 2.33.GIT
>
>
Ævar Arnfjörð Bjarmason April 7, 2022, 1:12 p.m. UTC | #2
On Wed, Apr 06 2022, Glen Choo wrote:

> @@ -602,6 +605,12 @@ static int fsck_tree(const struct object_id *tree_oid,
>  		has_dotdot |= !strcmp(name, "..");
>  		has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name);
>  		has_zero_pad |= *(char *)desc.buffer == '0';
> +		has_head |= !strcasecmp(name, "HEAD")
> +			&& (S_ISLNK(mode) || S_ISREG(mode));
> +		has_refs_entry |= !strcasecmp(name, "refs")
> +			&& (S_ISLNK(mode) || S_ISDIR(mode));
> +		has_objects_entry |= !strcasecmp(name, "objects")
> +			&& (S_ISLNK(mode) || S_ISDIR(mode));

Doesn't this code need to use is_hfs_dot_str() instead of strcasecmp()
like the other similar checks?

> @@ -336,6 +336,10 @@ int get_common_dir_noenv(struct strbuf *sb, const char *gitdir)
>   *  - either a HEAD symlink or a HEAD file that is formatted as
>   *    a proper "ref:", or a regular file HEAD that has a properly
>   *    formatted sha1 object name.
> + *
> + * fsck.c checks for bare repositories in trees using similar rules, but a
> + * duplicated implementation. If these are changed, the correspnding code in
> + * fsck.c should change too.
>   */

Probably took much hassle to factor these so it can be re-used. Typo:
correspnding.

> +		test_i18ngrep "warning.*tree $bad_tree: embeddedBareRepo: contains bare repository" out

s/test_i18ngrep/grep/
Derrick Stolee April 7, 2022, 1:21 p.m. UTC | #3
On 4/7/2022 8:42 AM, Johannes Schindelin wrote:
> Hi Glen,
> 
> On Wed, 6 Apr 2022, Glen Choo wrote:
> 
>> Git tries not to distribute configs in-repo because they are a security
>> risk. However, an attacker can do exactly this if they embed a bare
>> repo inside of another repo.
>>
>> Teach fsck to detect whether a tree object contains a bare repo (as
>> determined by setup.c) and warn. This will help hosting sites detect and
>> prevent transmission of such malicious repos.
>>
>> See [1] for a more in-depth discussion, including future steps and
>> alternatives.
>>
>> [1] https://lore.kernel.org/git/kl6lsfqpygsj.fsf@chooglen-macbookpro.roam.corp.google.com/
> 
> Out of curiosity: does this new check trigger with
> https://github.com/libgit2/libgit2? AFAIR it has embedded repositories
> that are used in its test suite. In other words, libgit2 has a legitimate
> use case for embedded bare repositories, I believe.

It is definitely good to keep in mind that other repositories have
included bare repositories for convenience. I'm not sure that the behavior
of some good actors should outweigh the benefits of protecting against
this attack vector.

The trouble here is: how could the libgit2 repo change their project to
not trigger this warning? These bare repos are in their history forever if
they don't do go through significant work and pain to remove them from
their history. We would want to have a way to make the warnings less
severe for special cases like this.

Simultaneously, we wouldn't want to bless all _forks_ of libgit2.

Finally, the real thing we want to avoid is having the Git client write
these trees to disk, for example during a 'git checkout', unless the user
gives an override. (We would want 'git bisect' to still work on the
libgit2 repo, for example.)

A more complete protection here would be:

 1. Warn when finding a bare repo as a tree (this patch).

 2. Suppress warnings on trusted repos, scoped to a specific set of known
    trees _or_ based on some set of known commits (in case the known trees
    are too large).

 3. Prevent writing a bare repo to the worktree, unless the user provided
    an opt-in to that behavior.

Since your patch is moving in the right direction here, I don't think
steps (2) and (3) are required to move forward with your patch. However,
it is a good opportunity to discuss the full repercussions of this issue.

Thanks,
-Stolee
Ævar Arnfjörð Bjarmason April 7, 2022, 2:14 p.m. UTC | #4
On Thu, Apr 07 2022, Derrick Stolee wrote:

> A more complete protection here would be:
>
>  1. Warn when finding a bare repo as a tree (this patch).
>
>  2. Suppress warnings on trusted repos, scoped to a specific set of known
>     trees _or_ based on some set of known commits (in case the known trees
>     are too large).
>
>  3. Prevent writing a bare repo to the worktree, unless the user provided
>     an opt-in to that behavior.
>
> Since your patch is moving in the right direction here, I don't think
> steps (2) and (3) are required to move forward with your patch. However,
> it is a good opportunity to discuss the full repercussions of this issue.

Isn't a gentler solution here to:

 1. In setup.c, we detect a repo
 2. Walk up a directory
 3. Do we find a repo?
 4. Does that repo "contain" the first one?
    If yes: die on setup
    If no: it's OK

It also seems to me that there's pretty much perfect overlap between
this and the long-discussed topic of marking a submodule with config
v.s. detecting it on the fly.

Since we're already discussing breaking long-standing repos in the wild
here I think it's good to take a step back and think of a potential more
narrow solution.

There's nothing wrong with having tracked content per-se, it's just that
we ourselves have other code that'll shoot itself in the foot in certain
scenarios, like this one.

But for a newer git that needs to run this fsck check it'll already need
to detect that something "looks like a repo", but if it can do that and
setup.c can walk up from the parent directory & find a repository we can
combine the two.

Of course we'll probably still want an opt-in fsck check for hosting
providers who'll want to protect older clients, but as long as the two
are separated that'll only need to last as long as such clients are
potentially there in the wild.
Junio C Hamano April 7, 2022, 3:11 p.m. UTC | #5
Derrick Stolee <derrickstolee@github.com> writes:

> It is definitely good to keep in mind that other repositories have
> included bare repositories for convenience. I'm not sure that the behavior
> of some good actors should outweigh the benefits of protecting against
> this attack vector.

Good line of thinking.

>  2. Suppress warnings on trusted repos, scoped to a specific set of known
>     trees _or_ based on some set of known commits (in case the known trees
>     are too large).

Is "It is OK to check out an embedded repository from this commit or
any of its ancestors" the kind of suppression you meant by "known
commits"?

>  3. Prevent writing a bare repo to the worktree, unless the user provided
>     an opt-in to that behavior.
>
> Since your patch is moving in the right direction here, I don't think
> steps (2) and (3) are required to move forward with your patch. However,
> it is a good opportunity to discuss the full repercussions of this issue.

We can definitely start without (3).  Unleashing (1) before (2) is
ready would mean folks cannot clone projects like libgit2 until
later, which takes us back to the first point you made above.

Those who use embedded bare repositories as test vector can easily
switch to storing an equivalent of "a tarball that is expanded while
building and/or testing", and as long as the user trusts the project
enough to run its build procedure or tests, we are not adding much
to the attack surface, I would guess.
Junio C Hamano April 7, 2022, 3:20 p.m. UTC | #6
Glen Choo <chooglen@google.com> writes:

> @@ -602,6 +605,12 @@ static int fsck_tree(const struct object_id *tree_oid,
>  		has_dotdot |= !strcmp(name, "..");
>  		has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name);
>  		has_zero_pad |= *(char *)desc.buffer == '0';
> +		has_head |= !strcasecmp(name, "HEAD")
> +			&& (S_ISLNK(mode) || S_ISREG(mode));
> +		has_refs_entry |= !strcasecmp(name, "refs")
> +			&& (S_ISLNK(mode) || S_ISDIR(mode));

Sorry, I know I am to blame, but I think it is wrong to insist
"refs" (and others) to be a directory.  setup.c::is_git_directory()
only checks with access(X_OK).  Also, if I am not mistaken, we plan
to take advantage of it and may make refs a regular file, for
example, to signal that the ref-files backend is unwelcome in the
repository.  HEAD can be a symbolic link, not necessarily a regular
file.  We do not create "refs" and "objects" as symbolic links
ourselves, but it is good to see that the code protects us against
such a directory with these entries being one.

> +		has_objects_entry |= !strcasecmp(name, "objects")
> +			&& (S_ISLNK(mode) || S_ISDIR(mode));
>  
>  		if (is_hfs_dotgitmodules(name) || is_ntfs_dotgitmodules(name)) {
>  			if (!S_ISLNK(mode))
> @@ -739,6 +748,16 @@ static int fsck_tree(const struct object_id *tree_oid,
>  		retval += report(options, tree_oid, OBJ_TREE,
>  				 FSCK_MSG_TREE_NOT_SORTED,
>  				 "not properly sorted");
> +	/*
> +	 * Determine if this tree looks like a bare repository according
> +	 * to the rules of setup.c. If those are changed, this should be
> +	 * changed too.
> +	 */
> +	if (has_head && has_refs_entry && has_objects_entry)
> +		retval += report(options, tree_oid, OBJ_TREE,
> +				 FSCK_MSG_EMBEDDED_BARE_REPO,
> +				 "contains bare repository");
> +
>  	return retval;
>  }
Glen Choo April 13, 2022, 10:24 p.m. UTC | #7
Derrick Stolee <derrickstolee@github.com> writes:

> On 4/7/2022 8:42 AM, Johannes Schindelin wrote:
>> Hi Glen,
>> 
>> On Wed, 6 Apr 2022, Glen Choo wrote:
>> 
>>> Git tries not to distribute configs in-repo because they are a security
>>> risk. However, an attacker can do exactly this if they embed a bare
>>> repo inside of another repo.
>>>
>>> Teach fsck to detect whether a tree object contains a bare repo (as
>>> determined by setup.c) and warn. This will help hosting sites detect and
>>> prevent transmission of such malicious repos.
>>>
>>> See [1] for a more in-depth discussion, including future steps and
>>> alternatives.
>>>
>>> [1] https://lore.kernel.org/git/kl6lsfqpygsj.fsf@chooglen-macbookpro.roam.corp.google.com/
>> 
>> Out of curiosity: does this new check trigger with
>> https://github.com/libgit2/libgit2? AFAIR it has embedded repositories
>> that are used in its test suite. In other words, libgit2 has a legitimate
>> use case for embedded bare repositories, I believe.
>
> It is definitely good to keep in mind that other repositories have
> included bare repositories for convenience. I'm not sure that the behavior
> of some good actors should outweigh the benefits of protecting against
> this attack vector.
>
> The trouble here is: how could the libgit2 repo change their project to
> not trigger this warning? These bare repos are in their history forever if
> they don't do go through significant work and pain to remove them from
> their history. We would want to have a way to make the warnings less
> severe for special cases like this.
>
> Simultaneously, we wouldn't want to bless all _forks_ of libgit2.

Yes, that makes sense. Thanks for the thoughtful reply.

>  2. Suppress warnings on trusted repos, scoped to a specific set of known
>     trees _or_ based on some set of known commits (in case the known trees
>     are too large).

Since Junio mentioned downthread that we'd need (2), I'll focus on this.
I'm not sure I follow, though, so let me try to verbalize my thought
process to see what I'm not understanding...

By "Suppress warnings on trusted repos", I assume this is done on the
hosting side? (Since I can't imagine a built-in Git feature that could
selectively trust repos.)

"scoped to a specific set of known trees" sounds like fsck.skipList
i.e. as a host, I can configure a list of "good" libgit2 trees that I
will trust and those will be skipped by fsck.

So from my _very_ naive reading of (2), it seems like we already have
all of the pieces in place for hosts to do (2) on their own, _unless_
we think that fsck.skipList is inadequate for this use case. e.g. I
personally can't imagine any way to list every "good" tree and still
have a cloneable fork of libgit2, so we might to teach fsck to do
something smarter like "skip any objects reachable by these commits".
Glen Choo April 14, 2022, 8:02 p.m. UTC | #8
Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> On Thu, Apr 07 2022, Derrick Stolee wrote:
>
>> A more complete protection here would be:
>>
>>  1. Warn when finding a bare repo as a tree (this patch).
>>
>>  2. Suppress warnings on trusted repos, scoped to a specific set of known
>>     trees _or_ based on some set of known commits (in case the known trees
>>     are too large).
>>
>>  3. Prevent writing a bare repo to the worktree, unless the user provided
>>     an opt-in to that behavior.
>>
>> Since your patch is moving in the right direction here, I don't think
>> steps (2) and (3) are required to move forward with your patch. However,
>> it is a good opportunity to discuss the full repercussions of this issue.
>
> Isn't a gentler solution here to:
>
>  1. In setup.c, we detect a repo
>  2. Walk up a directory
>  3. Do we find a repo?
>  4. Does that repo "contain" the first one?
>     If yes: die on setup
>     If no: it's OK
>
> It also seems to me that there's pretty much perfect overlap between
> this and the long-discussed topic of marking a submodule with config
> v.s. detecting it on the fly.

Your suggestion seems similar to:

  == 3. Detect that we are in an embedded bare repo and ignore the embedded bare
  repository in favor of the containing repo.

which I also think is a simple, robust mitigation if we put aside the
problem of walking up to the root in too many situations. I seem to
recall that this problem has come up before in [1] (and possibly other
topics? I wasn't really able to locate them through a cursory search..),
so I assume that's what you're referring to by "long-discussed topic".

(Forgive me if I'm asking you to repeat yourself yet another time) I
seem to recall that we weren't able to reach consensus on whether it's
okay for Git to opportunistically walk up the directory hierarchy during
setup, especially since There are some situations where this is
extremely expensive (VFS, network mount).

I actually like this option quite a lot, but I don't see how we could
implement this without imposing a big penalty to all bare repo users -
they'd either be forced to set GIT_DIR or GIT_CEILING_DIRECTORIES, or
take a (potentially big) performance hit. Hopefully I'm just framing
this too narrowly and you're approaching this differently.

PS: As an aside, wouldn't this also break libgit2? We could make this
opt-out behavior, though that requires us to read system config _before_
discovering the gitdir (as I discussed in [2]).

[1] https://lore.kernel.org/git/211109.86v912dtfw.gmgdl@evledraar.gmail.com/
[2] https://lore.kernel.org/git/kl6lv8vc90ts.fsf@chooglen-macbookpro.roam.corp.google.com
Ævar Arnfjörð Bjarmason April 15, 2022, 12:46 p.m. UTC | #9
On Thu, Apr 14 2022, Glen Choo wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> On Thu, Apr 07 2022, Derrick Stolee wrote:
>>
>>> A more complete protection here would be:
>>>
>>>  1. Warn when finding a bare repo as a tree (this patch).
>>>
>>>  2. Suppress warnings on trusted repos, scoped to a specific set of known
>>>     trees _or_ based on some set of known commits (in case the known trees
>>>     are too large).
>>>
>>>  3. Prevent writing a bare repo to the worktree, unless the user provided
>>>     an opt-in to that behavior.
>>>
>>> Since your patch is moving in the right direction here, I don't think
>>> steps (2) and (3) are required to move forward with your patch. However,
>>> it is a good opportunity to discuss the full repercussions of this issue.
>>
>> Isn't a gentler solution here to:
>>
>>  1. In setup.c, we detect a repo
>>  2. Walk up a directory
>>  3. Do we find a repo?
>>  4. Does that repo "contain" the first one?
>>     If yes: die on setup
>>     If no: it's OK
>>
>> It also seems to me that there's pretty much perfect overlap between
>> this and the long-discussed topic of marking a submodule with config
>> v.s. detecting it on the fly.
>
> Your suggestion seems similar to:
>
>   == 3. Detect that we are in an embedded bare repo and ignore the embedded bare
>   repository in favor of the containing repo.
>
> which I also think is a simple, robust mitigation if we put aside the
> problem of walking up to the root in too many situations. I seem to
> recall that this problem has come up before in [1] (and possibly other
> topics? I wasn't really able to locate them through a cursory search..),
> so I assume that's what you're referring to by "long-discussed topic".

Yes, I mean the submodule.superprojectGitDir topic.

> (Forgive me if I'm asking you to repeat yourself yet another time) I
> seem to recall that we weren't able to reach consensus on whether it's
> okay for Git to opportunistically walk up the directory hierarchy during
> setup, especially since There are some situations where this is
> extremely expensive (VFS, network mount).

I'm not sure, but I think per the later
https://lore.kernel.org/git/220204.86pmo34d2m.gmgdl@evledraar.gmail.com/
and
https://lore.kernel.org/git/220311.8635joj0lf.gmgdl@evledraar.gmail.com/
that any optimization concerns were likely just "this is slow in
shellscript" and not at the FS level.

There were also passing references to some internal Google-specific
NFS-ish implementation that I know nothing about (but you might),
i.e. what I asked about in:
https://lore.kernel.org/git/220212.864k53yfws.gmgdl@evledraar.gmail.com/

But given the v9 superprojectGitDir becoming a boolean instead of a path
in v9 I'm not sure/have no idea.

The only thing I'm sure of is if past iterations of the series were
addressing such a problem as an optimization that doesn't seem to be a
current goal.

As noted in those past exchanges I have tested this method on e.g. AIX
whose FS is unbelievably slow, and I couldn't even tell the differenc.

That's because if you look at the total FS syscalls even for an
uninitialized repo just traversing .git, getting config etc. is going to
dwarf "walking up" in terms of number of calls.

Of course not all calls are going to be equal, and there's that
potential "I'm not NFS-y, but a parent is" case etc.

In any case, I think even *if* we had such a case somewhere that this
plan would still make sense. Such users could simply set
GIT_CEILING_DIRECTORIES or something similar if they cared about the
performance.

But for everyone else we'd do the right thing, and not prematurely
optimize. I.e. we actually *are* concerned not with "does it look like a
bare repo?" but "is this thing that looks like a bare repo within our
current actual repo or not?".

> I actually like this option quite a lot, but I don't see how we could
> implement this without imposing a big penalty to all bare repo users -
> they'd either be forced to set GIT_DIR or GIT_CEILING_DIRECTORIES, or
> take a (potentially big) performance hit. Hopefully I'm just framing
> this too narrowly and you're approaching this differently.

As noted in the [1] you quoted (link below) I tried to quantify that
potential penalty, and it seems to be a complete non-issue.

Of course there may be other scenarios where it matters, but I haven't
seen any concrete data to support that.

Doesn't pretty everyone who cares about the performance of bare in any
capacity do so because they're running a server that's using
git-upload-pack and the like? Those require you to specify the exact
.git directory you want.

I.e. wouldn't this *only* apply to those doing the equivalent of "git -C
some-dir" to "cd" to a bare repo?

> PS: As an aside, wouldn't this also break libgit2? We could make this
> opt-out behavior, though that requires us to read system config _before_
> discovering the gitdir (as I discussed in [2]).

No it wouldn't? I don't use libgit2, but upthread there's concern that
banning things that look-like-a-repo from being tracked would break it.

Whereas I'm pointing out that we don't need to do that, we can just keep
searching upwards.

But yes, it would "break" anything that assumed you could cd to that
tracked-looks-like-or-is--a-gitdir and have e.g. "git config" pick up
its config instead of our "real repo" config, but that's exactly what we
want in this case isn't it?

I'm just pointing out that we can do it on the fly in setup.c, instead
of forbidding such content from ever being tracked within the
repository, which we'd be doing because we know we're doing the wrong
thing in that setup.c codepath.

Let's just fix that bit in setup.c instead.

> [1] https://lore.kernel.org/git/211109.86v912dtfw.gmgdl@evledraar.gmail.com/
> [2] https://lore.kernel.org/git/kl6lv8vc90ts.fsf@chooglen-macbookpro.roam.corp.google.com
diff mbox series

Patch

diff --git a/fsck.c b/fsck.c
index 3ec500d707..11c11c348a 100644
--- a/fsck.c
+++ b/fsck.c
@@ -573,6 +573,9 @@  static int fsck_tree(const struct object_id *tree_oid,
 	int has_bad_modes = 0;
 	int has_dup_entries = 0;
 	int not_properly_sorted = 0;
+	int has_head = 0;
+	int has_refs_entry = 0;
+	int has_objects_entry = 0;
 	struct tree_desc desc;
 	unsigned o_mode;
 	const char *o_name;
@@ -602,6 +605,12 @@  static int fsck_tree(const struct object_id *tree_oid,
 		has_dotdot |= !strcmp(name, "..");
 		has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name);
 		has_zero_pad |= *(char *)desc.buffer == '0';
+		has_head |= !strcasecmp(name, "HEAD")
+			&& (S_ISLNK(mode) || S_ISREG(mode));
+		has_refs_entry |= !strcasecmp(name, "refs")
+			&& (S_ISLNK(mode) || S_ISDIR(mode));
+		has_objects_entry |= !strcasecmp(name, "objects")
+			&& (S_ISLNK(mode) || S_ISDIR(mode));
 
 		if (is_hfs_dotgitmodules(name) || is_ntfs_dotgitmodules(name)) {
 			if (!S_ISLNK(mode))
@@ -739,6 +748,16 @@  static int fsck_tree(const struct object_id *tree_oid,
 		retval += report(options, tree_oid, OBJ_TREE,
 				 FSCK_MSG_TREE_NOT_SORTED,
 				 "not properly sorted");
+	/*
+	 * Determine if this tree looks like a bare repository according
+	 * to the rules of setup.c. If those are changed, this should be
+	 * changed too.
+	 */
+	if (has_head && has_refs_entry && has_objects_entry)
+		retval += report(options, tree_oid, OBJ_TREE,
+				 FSCK_MSG_EMBEDDED_BARE_REPO,
+				 "contains bare repository");
+
 	return retval;
 }
 
diff --git a/fsck.h b/fsck.h
index d07f7a2459..3f0f73b0f3 100644
--- a/fsck.h
+++ b/fsck.h
@@ -65,6 +65,7 @@  enum fsck_msg_type {
 	FUNC(NULL_SHA1, WARN) \
 	FUNC(ZERO_PADDED_FILEMODE, WARN) \
 	FUNC(NUL_IN_COMMIT, WARN) \
+	FUNC(EMBEDDED_BARE_REPO, WARN) \
 	/* infos (reported as warnings, but ignored by default) */ \
 	FUNC(GITMODULES_PARSE, INFO) \
 	FUNC(GITIGNORE_SYMLINK, INFO) \
diff --git a/setup.c b/setup.c
index 04ce33cdcd..2600548776 100644
--- a/setup.c
+++ b/setup.c
@@ -336,6 +336,10 @@  int get_common_dir_noenv(struct strbuf *sb, const char *gitdir)
  *  - either a HEAD symlink or a HEAD file that is formatted as
  *    a proper "ref:", or a regular file HEAD that has a properly
  *    formatted sha1 object name.
+ *
+ * fsck.c checks for bare repositories in trees using similar rules, but a
+ * duplicated implementation. If these are changed, the correspnding code in
+ * fsck.c should change too.
  */
 int is_git_directory(const char *suspect)
 {
diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh
index de50c0ea01..a65827bc03 100755
--- a/t/t1450-fsck.sh
+++ b/t/t1450-fsck.sh
@@ -563,6 +563,42 @@  dot-backslash-case .\\\\.GIT\\\\foobar
 dotgit-case-backslash .git\\\\foobar
 EOF
 
+test_expect_success "fsck notices bare repo" '
+(
+	mkdir -p embedded-bare-repo/bare &&
+	git init embedded-bare-repo &&
+	(
+		cd embedded-bare-repo/bare &&
+		echo content >HEAD &&
+		mkdir refs/ objects/ &&
+		echo content >refs/foo &&
+		echo content >objects/foo &&
+		git add . &&
+		git commit -m base &&
+		bad_tree=$(git rev-parse HEAD:bare) &&
+		git fsck 2>out &&
+		test_i18ngrep "warning.*tree $bad_tree: embeddedBareRepo: contains bare repository" out
+	)
+)'
+
+test_expect_success "fsck notices bare repo with odd casing" '
+(
+	mkdir -p embedded-bare-repo-case/bare &&
+	git init embedded-bare-repo-case &&
+	(
+		cd embedded-bare-repo-case/bare &&
+		echo content >heAD &&
+		mkdir Refs/ objectS/ &&
+		echo content >Refs/foo &&
+		echo content >objectS/foo &&
+		git add . &&
+		git commit -m base &&
+		bad_tree=$(git rev-parse HEAD:bare) &&
+		git fsck 2>out &&
+		test_i18ngrep "warning.*tree $bad_tree: embeddedBareRepo: contains bare repository" out
+	)
+)'
+
 test_expect_success 'fsck allows .Ňit' '
 	(
 		git init not-dotgit &&