diff mbox series

[v3,04/11] unpack-trees: introduce preserve_ignored to unpack_trees_options

Message ID f1a0700e598e52d6cdb507fe8a09c4fa9291c982.1632760428.git.gitgitgadget@gmail.com (mailing list archive)
State Accepted
Commit 04988c8d182da945cd9420274f33487157c5636f
Headers show
Series Fix various issues around removal of untracked files/directories | expand

Commit Message

Elijah Newren Sept. 27, 2021, 4:33 p.m. UTC
From: Elijah Newren <newren@gmail.com>

Currently, every caller of unpack_trees() that wants to ensure ignored
files are overwritten by default needs to:
   * allocate unpack_trees_options.dir
   * flip the DIR_SHOW_IGNORED flag in unpack_trees_options.dir->flags
   * call setup_standard_excludes
AND then after the call to unpack_trees() needs to
   * call dir_clear()
   * deallocate unpack_trees_options.dir
That's a fair amount of boilerplate, and every caller uses identical
code.  Make this easier by instead introducing a new boolean value where
the default value (0) does what we want so that new callers of
unpack_trees() automatically get the appropriate behavior.  And move all
the handling of unpack_trees_options.dir into unpack_trees() itself.

While preserve_ignored = 0 is the behavior we feel is the appropriate
default, we defer fixing commands to use the appropriate default until a
later commit.  So, this commit introduces several locations where we
manually set preserve_ignored=1.  This makes it clear where code paths
were previously preserving ignored files when they should not have been;
a future commit will flip these to instead use a value of 0 to get the
behavior we want.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 builtin/am.c        |  3 +++
 builtin/checkout.c  | 11 ++---------
 builtin/clone.c     |  2 ++
 builtin/merge.c     |  2 ++
 builtin/read-tree.c | 13 +++----------
 builtin/reset.c     |  2 ++
 builtin/stash.c     |  3 +++
 merge-ort.c         |  8 +-------
 merge-recursive.c   |  8 +-------
 merge.c             |  8 +-------
 reset.c             |  2 ++
 sequencer.c         |  2 ++
 unpack-trees.c      | 10 ++++++++++
 unpack-trees.h      |  1 +
 14 files changed, 35 insertions(+), 40 deletions(-)

Comments

Ævar Arnfjörð Bjarmason Sept. 29, 2021, 9:22 a.m. UTC | #1
On Mon, Sep 27 2021, Elijah Newren via GitGitGadget wrote:

> From: Elijah Newren <newren@gmail.com>
>
> Currently, every caller of unpack_trees() that wants to ensure ignored
> files are overwritten by default needs to:
>    * allocate unpack_trees_options.dir
>    * flip the DIR_SHOW_IGNORED flag in unpack_trees_options.dir->flags
>    * call setup_standard_excludes
> AND then after the call to unpack_trees() needs to
>    * call dir_clear()
>    * deallocate unpack_trees_options.dir
> That's a fair amount of boilerplate, and every caller uses identical
> code.  Make this easier by instead introducing a new boolean value where
> the default value (0) does what we want so that new callers of
> unpack_trees() automatically get the appropriate behavior.  And move all
> the handling of unpack_trees_options.dir into unpack_trees() itself.
>
> While preserve_ignored = 0 is the behavior we feel is the appropriate
> default, we defer fixing commands to use the appropriate default until a
> later commit.  So, this commit introduces several locations where we
> manually set preserve_ignored=1.  This makes it clear where code paths
> were previously preserving ignored files when they should not have been;
> a future commit will flip these to instead use a value of 0 to get the
> behavior we want.
>
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
>  builtin/am.c        |  3 +++
>  builtin/checkout.c  | 11 ++---------
>  builtin/clone.c     |  2 ++
>  builtin/merge.c     |  2 ++
>  builtin/read-tree.c | 13 +++----------
>  builtin/reset.c     |  2 ++
>  builtin/stash.c     |  3 +++
>  merge-ort.c         |  8 +-------
>  merge-recursive.c   |  8 +-------
>  merge.c             |  8 +-------
>  reset.c             |  2 ++
>  sequencer.c         |  2 ++
>  unpack-trees.c      | 10 ++++++++++
>  unpack-trees.h      |  1 +
>  14 files changed, 35 insertions(+), 40 deletions(-)
>
> diff --git a/builtin/am.c b/builtin/am.c
> index e4a0ff9cd7c..1ee70692bc3 100644
> --- a/builtin/am.c
> +++ b/builtin/am.c
> @@ -1918,6 +1918,9 @@ static int fast_forward_to(struct tree *head, struct tree *remote, int reset)
>  	opts.update = 1;
>  	opts.merge = 1;
>  	opts.reset = reset;
> +	if (!reset)
> +		/* FIXME: Default should be to remove ignored files */
> +		opts.preserve_ignored = 1;
>  	opts.fn = twoway_merge;
>  	init_tree_desc(&t[0], head->buffer, head->size);
>  	init_tree_desc(&t[1], remote->buffer, remote->size);
> diff --git a/builtin/checkout.c b/builtin/checkout.c
> index 5335435d616..5e7957dd068 100644
> --- a/builtin/checkout.c
> +++ b/builtin/checkout.c
> @@ -648,6 +648,7 @@ static int reset_tree(struct tree *tree, const struct checkout_opts *o,
>  	opts.skip_unmerged = !worktree;
>  	opts.reset = 1;
>  	opts.merge = 1;
> +	opts.preserve_ignored = 0;
>  	opts.fn = oneway_merge;
>  	opts.verbose_update = o->show_progress;
>  	opts.src_index = &the_index;
> @@ -746,11 +747,7 @@ static int merge_working_tree(const struct checkout_opts *opts,
>  				       new_branch_info->commit ?
>  				       &new_branch_info->commit->object.oid :
>  				       &new_branch_info->oid, NULL);
> -		if (opts->overwrite_ignore) {
> -			topts.dir = xcalloc(1, sizeof(*topts.dir));
> -			topts.dir->flags |= DIR_SHOW_IGNORED;
> -			setup_standard_excludes(topts.dir);
> -		}
> +		topts.preserve_ignored = !opts->overwrite_ignore;
>  		tree = parse_tree_indirect(old_branch_info->commit ?
>  					   &old_branch_info->commit->object.oid :
>  					   the_hash_algo->empty_tree);
> @@ -760,10 +757,6 @@ static int merge_working_tree(const struct checkout_opts *opts,
>  		init_tree_desc(&trees[1], tree->buffer, tree->size);
>  
>  		ret = unpack_trees(2, trees, &topts);
> -		if (topts.dir) {
> -			dir_clear(topts.dir);
> -			FREE_AND_NULL(topts.dir);
> -		}
>  		clear_unpack_trees_porcelain(&topts);
>  		if (ret == -1) {
>  			/*
> diff --git a/builtin/clone.c b/builtin/clone.c
> index ff1d3d447a3..be1c3840d62 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -687,6 +687,8 @@ static int checkout(int submodule_progress)
>  	opts.update = 1;
>  	opts.merge = 1;
>  	opts.clone = 1;
> +	/* FIXME: Default should be to remove ignored files */
> +	opts.preserve_ignored = 1;
>  	opts.fn = oneway_merge;
>  	opts.verbose_update = (option_verbosity >= 0);
>  	opts.src_index = &the_index;
> diff --git a/builtin/merge.c b/builtin/merge.c
> index 3fbdacc7db4..1e5fff095fc 100644
> --- a/builtin/merge.c
> +++ b/builtin/merge.c
> @@ -680,6 +680,8 @@ static int read_tree_trivial(struct object_id *common, struct object_id *head,
>  	opts.verbose_update = 1;
>  	opts.trivial_merges_only = 1;
>  	opts.merge = 1;
> +	/* FIXME: Default should be to remove ignored files */
> +	opts.preserve_ignored = 1;
>  	trees[nr_trees] = parse_tree_indirect(common);
>  	if (!trees[nr_trees++])
>  		return -1;
> diff --git a/builtin/read-tree.c b/builtin/read-tree.c
> index 73cb957a69b..443d206eca6 100644
> --- a/builtin/read-tree.c
> +++ b/builtin/read-tree.c
> @@ -201,11 +201,9 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
>  	if ((opts.update || opts.index_only) && !opts.merge)
>  		die("%s is meaningless without -m, --reset, or --prefix",
>  		    opts.update ? "-u" : "-i");
> -	if (opts.update && !opts.reset) {
> -		CALLOC_ARRAY(opts.dir, 1);
> -		opts.dir->flags |= DIR_SHOW_IGNORED;
> -		setup_standard_excludes(opts.dir);
> -	}
> +	if (opts.update && !opts.reset)
> +		opts.preserve_ignored = 0;
> +	/* otherwise, opts.preserve_ignored is irrelevant */
>  	if (opts.merge && !opts.index_only)
>  		setup_work_tree();
>  
> @@ -245,11 +243,6 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
>  	if (unpack_trees(nr_trees, t, &opts))
>  		return 128;
>  
> -	if (opts.dir) {
> -		dir_clear(opts.dir);
> -		FREE_AND_NULL(opts.dir);
> -	}
> -
>  	if (opts.debug_unpack || opts.dry_run)
>  		return 0; /* do not write the index out */
>  
> diff --git a/builtin/reset.c b/builtin/reset.c
> index 51c9e2f43ff..7f38656f018 100644
> --- a/builtin/reset.c
> +++ b/builtin/reset.c
> @@ -67,6 +67,8 @@ static int reset_index(const char *ref, const struct object_id *oid, int reset_t
>  	case KEEP:
>  	case MERGE:
>  		opts.update = 1;
> +		/* FIXME: Default should be to remove ignored files */
> +		opts.preserve_ignored = 1;
>  		break;
>  	case HARD:
>  		opts.update = 1;
> diff --git a/builtin/stash.c b/builtin/stash.c
> index 8f42360ca91..88287b890d5 100644
> --- a/builtin/stash.c
> +++ b/builtin/stash.c
> @@ -258,6 +258,9 @@ static int reset_tree(struct object_id *i_tree, int update, int reset)
>  	opts.merge = 1;
>  	opts.reset = reset;
>  	opts.update = update;
> +	if (update && !reset)
> +		/* FIXME: Default should be to remove ignored files */
> +		opts.preserve_ignored = 1;
>  	opts.fn = oneway_merge;
>  
>  	if (unpack_trees(nr_trees, t, &opts))
> diff --git a/merge-ort.c b/merge-ort.c
> index 35aa979c3a4..0d64ec716bd 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -4045,11 +4045,7 @@ static int checkout(struct merge_options *opt,
>  	unpack_opts.quiet = 0; /* FIXME: sequencer might want quiet? */
>  	unpack_opts.verbose_update = (opt->verbosity > 2);
>  	unpack_opts.fn = twoway_merge;
> -	if (1/* FIXME: opts->overwrite_ignore*/) {
> -		CALLOC_ARRAY(unpack_opts.dir, 1);
> -		unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
> -		setup_standard_excludes(unpack_opts.dir);
> -	}
> +	unpack_opts.preserve_ignored = 0; /* FIXME: !opts->overwrite_ignore*/
>  	parse_tree(prev);
>  	init_tree_desc(&trees[0], prev->buffer, prev->size);
>  	parse_tree(next);
> @@ -4057,8 +4053,6 @@ static int checkout(struct merge_options *opt,
>  
>  	ret = unpack_trees(2, trees, &unpack_opts);
>  	clear_unpack_trees_porcelain(&unpack_opts);
> -	dir_clear(unpack_opts.dir);
> -	FREE_AND_NULL(unpack_opts.dir);
>  	return ret;
>  }
>  
> diff --git a/merge-recursive.c b/merge-recursive.c
> index 233d9f686ad..2be3f5d4044 100644
> --- a/merge-recursive.c
> +++ b/merge-recursive.c
> @@ -411,9 +411,7 @@ static int unpack_trees_start(struct merge_options *opt,
>  	else {
>  		opt->priv->unpack_opts.update = 1;
>  		/* FIXME: should only do this if !overwrite_ignore */
> -		CALLOC_ARRAY(opt->priv->unpack_opts.dir, 1);
> -		opt->priv->unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
> -		setup_standard_excludes(opt->priv->unpack_opts.dir);
> +		opt->priv->unpack_opts.preserve_ignored = 0;
>  	}
>  	opt->priv->unpack_opts.merge = 1;
>  	opt->priv->unpack_opts.head_idx = 2;
> @@ -428,10 +426,6 @@ static int unpack_trees_start(struct merge_options *opt,
>  	init_tree_desc_from_tree(t+2, merge);
>  
>  	rc = unpack_trees(3, t, &opt->priv->unpack_opts);
> -	if (opt->priv->unpack_opts.dir) {
> -		dir_clear(opt->priv->unpack_opts.dir);
> -		FREE_AND_NULL(opt->priv->unpack_opts.dir);
> -	}
>  	cache_tree_free(&opt->repo->index->cache_tree);
>  
>  	/*
> diff --git a/merge.c b/merge.c
> index 6e736881d90..2382ff66d35 100644
> --- a/merge.c
> +++ b/merge.c
> @@ -53,7 +53,6 @@ int checkout_fast_forward(struct repository *r,
>  	struct unpack_trees_options opts;
>  	struct tree_desc t[MAX_UNPACK_TREES];
>  	int i, nr_trees = 0;
> -	struct dir_struct dir = DIR_INIT;
>  	struct lock_file lock_file = LOCK_INIT;
>  
>  	refresh_index(r->index, REFRESH_QUIET, NULL, NULL, NULL);
> @@ -80,11 +79,7 @@ int checkout_fast_forward(struct repository *r,
>  	}
>  
>  	memset(&opts, 0, sizeof(opts));
> -	if (overwrite_ignore) {
> -		dir.flags |= DIR_SHOW_IGNORED;
> -		setup_standard_excludes(&dir);
> -		opts.dir = &dir;
> -	}
> +	opts.preserve_ignored = !overwrite_ignore;
>  
>  	opts.head_idx = 1;
>  	opts.src_index = r->index;
> @@ -101,7 +96,6 @@ int checkout_fast_forward(struct repository *r,
>  		clear_unpack_trees_porcelain(&opts);
>  		return -1;
>  	}
> -	dir_clear(&dir);
>  	clear_unpack_trees_porcelain(&opts);
>  
>  	if (write_locked_index(r->index, &lock_file, COMMIT_LOCK))
> diff --git a/reset.c b/reset.c
> index 79310ae071b..41b3e2d88de 100644
> --- a/reset.c
> +++ b/reset.c
> @@ -56,6 +56,8 @@ int reset_head(struct repository *r, struct object_id *oid, const char *action,
>  	unpack_tree_opts.fn = reset_hard ? oneway_merge : twoway_merge;
>  	unpack_tree_opts.update = 1;
>  	unpack_tree_opts.merge = 1;
> +	/* FIXME: Default should be to remove ignored files */
> +	unpack_tree_opts.preserve_ignored = 1;
>  	init_checkout_metadata(&unpack_tree_opts.meta, switch_to_branch, oid, NULL);
>  	if (!detach_head)
>  		unpack_tree_opts.reset = 1;
> diff --git a/sequencer.c b/sequencer.c
> index 614d56f5e21..098566c68d9 100644
> --- a/sequencer.c
> +++ b/sequencer.c
> @@ -3699,6 +3699,8 @@ static int do_reset(struct repository *r,
>  	unpack_tree_opts.fn = oneway_merge;
>  	unpack_tree_opts.merge = 1;
>  	unpack_tree_opts.update = 1;
> +	/* FIXME: Default should be to remove ignored files */
> +	unpack_tree_opts.preserve_ignored = 1;
>  	init_checkout_metadata(&unpack_tree_opts.meta, name, &oid, NULL);
>  
>  	if (repo_read_index_unmerged(r)) {
> diff --git a/unpack-trees.c b/unpack-trees.c
> index 8ea0a542da8..1e4eae1dc7d 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -1707,6 +1707,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>  		ensure_full_index(o->dst_index);
>  	}
>  
> +	if (!o->preserve_ignored) {
> +		CALLOC_ARRAY(o->dir, 1);
> +		o->dir->flags |= DIR_SHOW_IGNORED;
> +		setup_standard_excludes(o->dir);
> +	}
> +
>  	if (!core_apply_sparse_checkout || !o->update)
>  		o->skip_sparse_checkout = 1;
>  	if (!o->skip_sparse_checkout && !o->pl) {
> @@ -1868,6 +1874,10 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>  done:
>  	if (free_pattern_list)
>  		clear_pattern_list(&pl);
> +	if (o->dir) {
> +		dir_clear(o->dir);
> +		FREE_AND_NULL(o->dir);
> +	}
>  	trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
>  	trace_performance_leave("unpack_trees");
>  	return ret;
> diff --git a/unpack-trees.h b/unpack-trees.h
> index 2d88b19dca7..f98cfd49d7b 100644
> --- a/unpack-trees.h
> +++ b/unpack-trees.h
> @@ -49,6 +49,7 @@ struct unpack_trees_options {
>  	unsigned int reset,
>  		     merge,
>  		     update,
> +		     preserve_ignored,
>  		     clone,
>  		     index_only,
>  		     nontrivial_merge,

I think getting rid of the boilerplate makes sense, but it doesn't sound
from the commit message like you've considered just making that "struct
dir*" member a "struct dir" instead.

That simplifies things a lot, i.e. we can just DIR_INIT it, and don't
need every caller to malloc/free it.

Sometimes a pointer makes sense, but in this case the "struct
unpack_trees_options" can just own it.

As part of WIP leak fixes I have unsubmitted I'd implemented that, patch
follows below.

I think the part of it that deals with managing the "struct dir" is much
nicer, but you might still want to keep the "preserve_ignored" you've
added.

Oh, and I noticed I removed the dir_clear() here but didn't add it to
clear_unpack_trees_porcelain(), that also needs to be done (and I did it
in a later fix that I should squash in), but I can't be bothered to
re-do the below diff just for that, and since the point is how we manage
the struct itself (the freeing is rather trivial...).

diff --git a/builtin/checkout.c b/builtin/checkout.c
index 8c69dcdf72a..632da036717 100644
--- a/builtin/checkout.c
+++ b/builtin/checkout.c
@@ -747,9 +747,8 @@ static int merge_working_tree(const struct checkout_opts *opts,
 				       &new_branch_info->commit->object.oid :
 				       &new_branch_info->oid, NULL);
 		if (opts->overwrite_ignore) {
-			topts.dir = xcalloc(1, sizeof(*topts.dir));
-			topts.dir->flags |= DIR_SHOW_IGNORED;
-			setup_standard_excludes(topts.dir);
+			topts.dir.flags |= DIR_SHOW_IGNORED;
+			setup_standard_excludes(&topts.dir);
 		}
 		tree = parse_tree_indirect(old_branch_info->commit ?
 					   &old_branch_info->commit->object.oid :
diff --git a/builtin/read-tree.c b/builtin/read-tree.c
index 485e7b04794..6d529c77c49 100644
--- a/builtin/read-tree.c
+++ b/builtin/read-tree.c
@@ -53,20 +53,17 @@ static int index_output_cb(const struct option *opt, const char *arg,
 static int exclude_per_directory_cb(const struct option *opt, const char *arg,
 				    int unset)
 {
-	struct dir_struct *dir;
 	struct unpack_trees_options *opts;
 
 	BUG_ON_OPT_NEG(unset);
 
 	opts = (struct unpack_trees_options *)opt->value;
 
-	if (opts->dir)
+	if (opts->dir.exclude_per_dir)
 		die("more than one --exclude-per-directory given.");
 
-	dir = xcalloc(1, sizeof(*opts->dir));
-	dir->flags |= DIR_SHOW_IGNORED;
-	dir->exclude_per_dir = arg;
-	opts->dir = dir;
+	opts->dir.flags |= DIR_SHOW_IGNORED;
+	opts->dir.exclude_per_dir = arg;
 	/* We do not need to nor want to do read-directory
 	 * here; we are merely interested in reusing the
 	 * per directory ignore stack mechanism.
@@ -209,7 +206,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
 	if ((opts.update || opts.index_only) && !opts.merge)
 		die("%s is meaningless without -m, --reset, or --prefix",
 		    opts.update ? "-u" : "-i");
-	if ((opts.dir && !opts.update))
+	if ((opts.dir.exclude_per_dir && !opts.update))
 		die("--exclude-per-directory is meaningless unless -u");
 	if (opts.merge && !opts.index_only)
 		setup_work_tree();
diff --git a/merge-ort.c b/merge-ort.c
index 35aa979c3a4..e526b78b88d 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -4021,9 +4021,8 @@ static int checkout(struct merge_options *opt,
 	/* Switch the index/working copy from old to new */
 	int ret;
 	struct tree_desc trees[2];
-	struct unpack_trees_options unpack_opts;
+	struct unpack_trees_options unpack_opts = UNPACK_TREES_OPTIONS_INIT;
 
-	memset(&unpack_opts, 0, sizeof(unpack_opts));
 	unpack_opts.head_idx = -1;
 	unpack_opts.src_index = opt->repo->index;
 	unpack_opts.dst_index = opt->repo->index;
@@ -4046,9 +4045,8 @@ static int checkout(struct merge_options *opt,
 	unpack_opts.verbose_update = (opt->verbosity > 2);
 	unpack_opts.fn = twoway_merge;
 	if (1/* FIXME: opts->overwrite_ignore*/) {
-		CALLOC_ARRAY(unpack_opts.dir, 1);
-		unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(unpack_opts.dir);
+		unpack_opts.dir.flags |= DIR_SHOW_IGNORED;
+		setup_standard_excludes(&unpack_opts.dir);
 	}
 	parse_tree(prev);
 	init_tree_desc(&trees[0], prev->buffer, prev->size);
@@ -4057,8 +4055,6 @@ static int checkout(struct merge_options *opt,
 
 	ret = unpack_trees(2, trees, &unpack_opts);
 	clear_unpack_trees_porcelain(&unpack_opts);
-	dir_clear(unpack_opts.dir);
-	FREE_AND_NULL(unpack_opts.dir);
 	return ret;
 }
 
diff --git a/merge.c b/merge.c
index 6e736881d90..9cb32990dd9 100644
--- a/merge.c
+++ b/merge.c
@@ -50,10 +50,9 @@ int checkout_fast_forward(struct repository *r,
 			  int overwrite_ignore)
 {
 	struct tree *trees[MAX_UNPACK_TREES];
-	struct unpack_trees_options opts;
+	struct unpack_trees_options opts = UNPACK_TREES_OPTIONS_INIT;
 	struct tree_desc t[MAX_UNPACK_TREES];
 	int i, nr_trees = 0;
-	struct dir_struct dir = DIR_INIT;
 	struct lock_file lock_file = LOCK_INIT;
 
 	refresh_index(r->index, REFRESH_QUIET, NULL, NULL, NULL);
@@ -79,11 +78,9 @@ int checkout_fast_forward(struct repository *r,
 		init_tree_desc(t+i, trees[i]->buffer, trees[i]->size);
 	}
 
-	memset(&opts, 0, sizeof(opts));
 	if (overwrite_ignore) {
-		dir.flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(&dir);
-		opts.dir = &dir;
+		opts.dir.flags |= DIR_SHOW_IGNORED;
+		setup_standard_excludes(&opts.dir);
 	}
 
 	opts.head_idx = 1;
@@ -101,7 +98,6 @@ int checkout_fast_forward(struct repository *r,
 		clear_unpack_trees_porcelain(&opts);
 		return -1;
 	}
-	dir_clear(&dir);
 	clear_unpack_trees_porcelain(&opts);
 
 	if (write_locked_index(r->index, &lock_file, COMMIT_LOCK))
diff --git a/unpack-trees.c b/unpack-trees.c
index 8ea0a542da8..33a2dc23ffc 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -2081,7 +2081,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
 	 */
 	int namelen;
 	int i;
-	struct dir_struct d;
+	struct dir_struct d = DIR_INIT;
 	char *pathbuf;
 	int cnt = 0;
 
@@ -2132,9 +2132,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
 	 */
 	pathbuf = xstrfmt("%.*s/", namelen, ce->name);
 
-	memset(&d, 0, sizeof(d));
-	if (o->dir)
-		d.exclude_per_dir = o->dir->exclude_per_dir;
+	d.exclude_per_dir = o->dir.exclude_per_dir;
 	i = read_directory(&d, o->src_index, pathbuf, namelen+1, NULL);
 	if (i)
 		return add_rejected_path(o, ERROR_NOT_UPTODATE_DIR, ce->name);
@@ -2175,8 +2173,7 @@ static int check_ok_to_remove(const char *name, int len, int dtype,
 	if (ignore_case && icase_exists(o, name, len, st))
 		return 0;
 
-	if (o->dir &&
-	    is_excluded(o->dir, o->src_index, name, &dtype))
+	if (is_excluded(&o->dir, o->src_index, name, &dtype))
 		/*
 		 * ce->name is explicitly excluded, so it is Ok to
 		 * overwrite it.
diff --git a/unpack-trees.h b/unpack-trees.h
index 2d88b19dca7..6fa6a4dfc3e 100644
--- a/unpack-trees.h
+++ b/unpack-trees.h
@@ -5,6 +5,7 @@
 #include "strvec.h"
 #include "string-list.h"
 #include "tree-walk.h"
+#include "dir.h"
 
 #define MAX_UNPACK_TREES MAX_TRAVERSE_TREES
 
@@ -66,7 +67,7 @@ struct unpack_trees_options {
 		     dry_run;
 	const char *prefix;
 	int cache_bottom;
-	struct dir_struct *dir;
+	struct dir_struct dir;
 	struct pathspec *pathspec;
 	merge_fn_t fn;
 	const char *msgs[NB_UNPACK_TREES_WARNING_TYPES];
@@ -90,6 +91,9 @@ struct unpack_trees_options {
 	struct pattern_list *pl; /* for internal use */
 	struct checkout_metadata meta;
 };
+#define UNPACK_TREES_OPTIONS_INIT { \
+	.dir = DIR_INIT, \
+}
 
 int unpack_trees(unsigned n, struct tree_desc *t,
 		 struct unpack_trees_options *options);
Elijah Newren Sept. 29, 2021, 3:35 p.m. UTC | #2
On Wed, Sep 29, 2021 at 2:27 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Mon, Sep 27 2021, Elijah Newren via GitGitGadget wrote:
>
> > From: Elijah Newren <newren@gmail.com>
> >
> > Currently, every caller of unpack_trees() that wants to ensure ignored
> > files are overwritten by default needs to:
> >    * allocate unpack_trees_options.dir
> >    * flip the DIR_SHOW_IGNORED flag in unpack_trees_options.dir->flags
> >    * call setup_standard_excludes
> > AND then after the call to unpack_trees() needs to
> >    * call dir_clear()
> >    * deallocate unpack_trees_options.dir
> > That's a fair amount of boilerplate, and every caller uses identical
> > code.  Make this easier by instead introducing a new boolean value where
> > the default value (0) does what we want so that new callers of
> > unpack_trees() automatically get the appropriate behavior.  And move all
> > the handling of unpack_trees_options.dir into unpack_trees() itself.
> >
> > While preserve_ignored = 0 is the behavior we feel is the appropriate
> > default, we defer fixing commands to use the appropriate default until a
> > later commit.  So, this commit introduces several locations where we
> > manually set preserve_ignored=1.  This makes it clear where code paths
> > were previously preserving ignored files when they should not have been;
> > a future commit will flip these to instead use a value of 0 to get the
> > behavior we want.
> >
> > Signed-off-by: Elijah Newren <newren@gmail.com>
> > ---
> >  builtin/am.c        |  3 +++
> >  builtin/checkout.c  | 11 ++---------
> >  builtin/clone.c     |  2 ++
> >  builtin/merge.c     |  2 ++
> >  builtin/read-tree.c | 13 +++----------
> >  builtin/reset.c     |  2 ++
> >  builtin/stash.c     |  3 +++
> >  merge-ort.c         |  8 +-------
> >  merge-recursive.c   |  8 +-------
> >  merge.c             |  8 +-------
> >  reset.c             |  2 ++
> >  sequencer.c         |  2 ++
> >  unpack-trees.c      | 10 ++++++++++
> >  unpack-trees.h      |  1 +
> >  14 files changed, 35 insertions(+), 40 deletions(-)
> >
> > diff --git a/builtin/am.c b/builtin/am.c
> > index e4a0ff9cd7c..1ee70692bc3 100644
> > --- a/builtin/am.c
> > +++ b/builtin/am.c
> > @@ -1918,6 +1918,9 @@ static int fast_forward_to(struct tree *head, struct tree *remote, int reset)
> >       opts.update = 1;
> >       opts.merge = 1;
> >       opts.reset = reset;
> > +     if (!reset)
> > +             /* FIXME: Default should be to remove ignored files */
> > +             opts.preserve_ignored = 1;
> >       opts.fn = twoway_merge;
> >       init_tree_desc(&t[0], head->buffer, head->size);
> >       init_tree_desc(&t[1], remote->buffer, remote->size);
> > diff --git a/builtin/checkout.c b/builtin/checkout.c
> > index 5335435d616..5e7957dd068 100644
> > --- a/builtin/checkout.c
> > +++ b/builtin/checkout.c
> > @@ -648,6 +648,7 @@ static int reset_tree(struct tree *tree, const struct checkout_opts *o,
> >       opts.skip_unmerged = !worktree;
> >       opts.reset = 1;
> >       opts.merge = 1;
> > +     opts.preserve_ignored = 0;
> >       opts.fn = oneway_merge;
> >       opts.verbose_update = o->show_progress;
> >       opts.src_index = &the_index;
> > @@ -746,11 +747,7 @@ static int merge_working_tree(const struct checkout_opts *opts,
> >                                      new_branch_info->commit ?
> >                                      &new_branch_info->commit->object.oid :
> >                                      &new_branch_info->oid, NULL);
> > -             if (opts->overwrite_ignore) {
> > -                     topts.dir = xcalloc(1, sizeof(*topts.dir));
> > -                     topts.dir->flags |= DIR_SHOW_IGNORED;
> > -                     setup_standard_excludes(topts.dir);
> > -             }
> > +             topts.preserve_ignored = !opts->overwrite_ignore;
> >               tree = parse_tree_indirect(old_branch_info->commit ?
> >                                          &old_branch_info->commit->object.oid :
> >                                          the_hash_algo->empty_tree);
> > @@ -760,10 +757,6 @@ static int merge_working_tree(const struct checkout_opts *opts,
> >               init_tree_desc(&trees[1], tree->buffer, tree->size);
> >
> >               ret = unpack_trees(2, trees, &topts);
> > -             if (topts.dir) {
> > -                     dir_clear(topts.dir);
> > -                     FREE_AND_NULL(topts.dir);
> > -             }
> >               clear_unpack_trees_porcelain(&topts);
> >               if (ret == -1) {
> >                       /*
> > diff --git a/builtin/clone.c b/builtin/clone.c
> > index ff1d3d447a3..be1c3840d62 100644
> > --- a/builtin/clone.c
> > +++ b/builtin/clone.c
> > @@ -687,6 +687,8 @@ static int checkout(int submodule_progress)
> >       opts.update = 1;
> >       opts.merge = 1;
> >       opts.clone = 1;
> > +     /* FIXME: Default should be to remove ignored files */
> > +     opts.preserve_ignored = 1;
> >       opts.fn = oneway_merge;
> >       opts.verbose_update = (option_verbosity >= 0);
> >       opts.src_index = &the_index;
> > diff --git a/builtin/merge.c b/builtin/merge.c
> > index 3fbdacc7db4..1e5fff095fc 100644
> > --- a/builtin/merge.c
> > +++ b/builtin/merge.c
> > @@ -680,6 +680,8 @@ static int read_tree_trivial(struct object_id *common, struct object_id *head,
> >       opts.verbose_update = 1;
> >       opts.trivial_merges_only = 1;
> >       opts.merge = 1;
> > +     /* FIXME: Default should be to remove ignored files */
> > +     opts.preserve_ignored = 1;
> >       trees[nr_trees] = parse_tree_indirect(common);
> >       if (!trees[nr_trees++])
> >               return -1;
> > diff --git a/builtin/read-tree.c b/builtin/read-tree.c
> > index 73cb957a69b..443d206eca6 100644
> > --- a/builtin/read-tree.c
> > +++ b/builtin/read-tree.c
> > @@ -201,11 +201,9 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
> >       if ((opts.update || opts.index_only) && !opts.merge)
> >               die("%s is meaningless without -m, --reset, or --prefix",
> >                   opts.update ? "-u" : "-i");
> > -     if (opts.update && !opts.reset) {
> > -             CALLOC_ARRAY(opts.dir, 1);
> > -             opts.dir->flags |= DIR_SHOW_IGNORED;
> > -             setup_standard_excludes(opts.dir);
> > -     }
> > +     if (opts.update && !opts.reset)
> > +             opts.preserve_ignored = 0;
> > +     /* otherwise, opts.preserve_ignored is irrelevant */
> >       if (opts.merge && !opts.index_only)
> >               setup_work_tree();
> >
> > @@ -245,11 +243,6 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
> >       if (unpack_trees(nr_trees, t, &opts))
> >               return 128;
> >
> > -     if (opts.dir) {
> > -             dir_clear(opts.dir);
> > -             FREE_AND_NULL(opts.dir);
> > -     }
> > -
> >       if (opts.debug_unpack || opts.dry_run)
> >               return 0; /* do not write the index out */
> >
> > diff --git a/builtin/reset.c b/builtin/reset.c
> > index 51c9e2f43ff..7f38656f018 100644
> > --- a/builtin/reset.c
> > +++ b/builtin/reset.c
> > @@ -67,6 +67,8 @@ static int reset_index(const char *ref, const struct object_id *oid, int reset_t
> >       case KEEP:
> >       case MERGE:
> >               opts.update = 1;
> > +             /* FIXME: Default should be to remove ignored files */
> > +             opts.preserve_ignored = 1;
> >               break;
> >       case HARD:
> >               opts.update = 1;
> > diff --git a/builtin/stash.c b/builtin/stash.c
> > index 8f42360ca91..88287b890d5 100644
> > --- a/builtin/stash.c
> > +++ b/builtin/stash.c
> > @@ -258,6 +258,9 @@ static int reset_tree(struct object_id *i_tree, int update, int reset)
> >       opts.merge = 1;
> >       opts.reset = reset;
> >       opts.update = update;
> > +     if (update && !reset)
> > +             /* FIXME: Default should be to remove ignored files */
> > +             opts.preserve_ignored = 1;
> >       opts.fn = oneway_merge;
> >
> >       if (unpack_trees(nr_trees, t, &opts))
> > diff --git a/merge-ort.c b/merge-ort.c
> > index 35aa979c3a4..0d64ec716bd 100644
> > --- a/merge-ort.c
> > +++ b/merge-ort.c
> > @@ -4045,11 +4045,7 @@ static int checkout(struct merge_options *opt,
> >       unpack_opts.quiet = 0; /* FIXME: sequencer might want quiet? */
> >       unpack_opts.verbose_update = (opt->verbosity > 2);
> >       unpack_opts.fn = twoway_merge;
> > -     if (1/* FIXME: opts->overwrite_ignore*/) {
> > -             CALLOC_ARRAY(unpack_opts.dir, 1);
> > -             unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
> > -             setup_standard_excludes(unpack_opts.dir);
> > -     }
> > +     unpack_opts.preserve_ignored = 0; /* FIXME: !opts->overwrite_ignore*/
> >       parse_tree(prev);
> >       init_tree_desc(&trees[0], prev->buffer, prev->size);
> >       parse_tree(next);
> > @@ -4057,8 +4053,6 @@ static int checkout(struct merge_options *opt,
> >
> >       ret = unpack_trees(2, trees, &unpack_opts);
> >       clear_unpack_trees_porcelain(&unpack_opts);
> > -     dir_clear(unpack_opts.dir);
> > -     FREE_AND_NULL(unpack_opts.dir);
> >       return ret;
> >  }
> >
> > diff --git a/merge-recursive.c b/merge-recursive.c
> > index 233d9f686ad..2be3f5d4044 100644
> > --- a/merge-recursive.c
> > +++ b/merge-recursive.c
> > @@ -411,9 +411,7 @@ static int unpack_trees_start(struct merge_options *opt,
> >       else {
> >               opt->priv->unpack_opts.update = 1;
> >               /* FIXME: should only do this if !overwrite_ignore */
> > -             CALLOC_ARRAY(opt->priv->unpack_opts.dir, 1);
> > -             opt->priv->unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
> > -             setup_standard_excludes(opt->priv->unpack_opts.dir);
> > +             opt->priv->unpack_opts.preserve_ignored = 0;
> >       }
> >       opt->priv->unpack_opts.merge = 1;
> >       opt->priv->unpack_opts.head_idx = 2;
> > @@ -428,10 +426,6 @@ static int unpack_trees_start(struct merge_options *opt,
> >       init_tree_desc_from_tree(t+2, merge);
> >
> >       rc = unpack_trees(3, t, &opt->priv->unpack_opts);
> > -     if (opt->priv->unpack_opts.dir) {
> > -             dir_clear(opt->priv->unpack_opts.dir);
> > -             FREE_AND_NULL(opt->priv->unpack_opts.dir);
> > -     }
> >       cache_tree_free(&opt->repo->index->cache_tree);
> >
> >       /*
> > diff --git a/merge.c b/merge.c
> > index 6e736881d90..2382ff66d35 100644
> > --- a/merge.c
> > +++ b/merge.c
> > @@ -53,7 +53,6 @@ int checkout_fast_forward(struct repository *r,
> >       struct unpack_trees_options opts;
> >       struct tree_desc t[MAX_UNPACK_TREES];
> >       int i, nr_trees = 0;
> > -     struct dir_struct dir = DIR_INIT;
> >       struct lock_file lock_file = LOCK_INIT;
> >
> >       refresh_index(r->index, REFRESH_QUIET, NULL, NULL, NULL);
> > @@ -80,11 +79,7 @@ int checkout_fast_forward(struct repository *r,
> >       }
> >
> >       memset(&opts, 0, sizeof(opts));
> > -     if (overwrite_ignore) {
> > -             dir.flags |= DIR_SHOW_IGNORED;
> > -             setup_standard_excludes(&dir);
> > -             opts.dir = &dir;
> > -     }
> > +     opts.preserve_ignored = !overwrite_ignore;
> >
> >       opts.head_idx = 1;
> >       opts.src_index = r->index;
> > @@ -101,7 +96,6 @@ int checkout_fast_forward(struct repository *r,
> >               clear_unpack_trees_porcelain(&opts);
> >               return -1;
> >       }
> > -     dir_clear(&dir);
> >       clear_unpack_trees_porcelain(&opts);
> >
> >       if (write_locked_index(r->index, &lock_file, COMMIT_LOCK))
> > diff --git a/reset.c b/reset.c
> > index 79310ae071b..41b3e2d88de 100644
> > --- a/reset.c
> > +++ b/reset.c
> > @@ -56,6 +56,8 @@ int reset_head(struct repository *r, struct object_id *oid, const char *action,
> >       unpack_tree_opts.fn = reset_hard ? oneway_merge : twoway_merge;
> >       unpack_tree_opts.update = 1;
> >       unpack_tree_opts.merge = 1;
> > +     /* FIXME: Default should be to remove ignored files */
> > +     unpack_tree_opts.preserve_ignored = 1;
> >       init_checkout_metadata(&unpack_tree_opts.meta, switch_to_branch, oid, NULL);
> >       if (!detach_head)
> >               unpack_tree_opts.reset = 1;
> > diff --git a/sequencer.c b/sequencer.c
> > index 614d56f5e21..098566c68d9 100644
> > --- a/sequencer.c
> > +++ b/sequencer.c
> > @@ -3699,6 +3699,8 @@ static int do_reset(struct repository *r,
> >       unpack_tree_opts.fn = oneway_merge;
> >       unpack_tree_opts.merge = 1;
> >       unpack_tree_opts.update = 1;
> > +     /* FIXME: Default should be to remove ignored files */
> > +     unpack_tree_opts.preserve_ignored = 1;
> >       init_checkout_metadata(&unpack_tree_opts.meta, name, &oid, NULL);
> >
> >       if (repo_read_index_unmerged(r)) {
> > diff --git a/unpack-trees.c b/unpack-trees.c
> > index 8ea0a542da8..1e4eae1dc7d 100644
> > --- a/unpack-trees.c
> > +++ b/unpack-trees.c
> > @@ -1707,6 +1707,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
> >               ensure_full_index(o->dst_index);
> >       }
> >
> > +     if (!o->preserve_ignored) {
> > +             CALLOC_ARRAY(o->dir, 1);
> > +             o->dir->flags |= DIR_SHOW_IGNORED;
> > +             setup_standard_excludes(o->dir);
> > +     }
> > +
> >       if (!core_apply_sparse_checkout || !o->update)
> >               o->skip_sparse_checkout = 1;
> >       if (!o->skip_sparse_checkout && !o->pl) {
> > @@ -1868,6 +1874,10 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
> >  done:
> >       if (free_pattern_list)
> >               clear_pattern_list(&pl);
> > +     if (o->dir) {
> > +             dir_clear(o->dir);
> > +             FREE_AND_NULL(o->dir);
> > +     }
> >       trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
> >       trace_performance_leave("unpack_trees");
> >       return ret;
> > diff --git a/unpack-trees.h b/unpack-trees.h
> > index 2d88b19dca7..f98cfd49d7b 100644
> > --- a/unpack-trees.h
> > +++ b/unpack-trees.h
> > @@ -49,6 +49,7 @@ struct unpack_trees_options {
> >       unsigned int reset,
> >                    merge,
> >                    update,
> > +                  preserve_ignored,
> >                    clone,
> >                    index_only,
> >                    nontrivial_merge,
>
> I think getting rid of the boilerplate makes sense, but it doesn't sound
> from the commit message like you've considered just making that "struct
> dir*" member a "struct dir" instead.
>
> That simplifies things a lot, i.e. we can just DIR_INIT it, and don't
> need every caller to malloc/free it.

See the next patch in the series.  :-)

> Sometimes a pointer makes sense, but in this case the "struct
> unpack_trees_options" can just own it.

I did make it internal to unpack_trees_options in the next patch, but
kept it as a pointer just because that let me know whether it was used
or not.  I guess I could have added a boolean as well.  But I don't
actually allocate anything, because it's either a NULL pointer, or a
pointer to something on the stack.  So, I do get to just use DIR_INIT.
Ævar Arnfjörð Bjarmason Sept. 29, 2021, 6:30 p.m. UTC | #3
On Wed, Sep 29 2021, Elijah Newren wrote:

> On Wed, Sep 29, 2021 at 2:27 AM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> On Mon, Sep 27 2021, Elijah Newren via GitGitGadget wrote:
>>
>> > From: Elijah Newren <newren@gmail.com>
>> >
>> > Currently, every caller of unpack_trees() that wants to ensure ignored
>> > files are overwritten by default needs to:
>> >    * allocate unpack_trees_options.dir
>> >    * flip the DIR_SHOW_IGNORED flag in unpack_trees_options.dir->flags
>> >    * call setup_standard_excludes
>> > AND then after the call to unpack_trees() needs to
>> >    * call dir_clear()
>> >    * deallocate unpack_trees_options.dir
>> > That's a fair amount of boilerplate, and every caller uses identical
>> > code.  Make this easier by instead introducing a new boolean value where
>> > the default value (0) does what we want so that new callers of
>> > unpack_trees() automatically get the appropriate behavior.  And move all
>> > the handling of unpack_trees_options.dir into unpack_trees() itself.
>> >
>> > While preserve_ignored = 0 is the behavior we feel is the appropriate
>> > default, we defer fixing commands to use the appropriate default until a
>> > later commit.  So, this commit introduces several locations where we
>> > manually set preserve_ignored=1.  This makes it clear where code paths
>> > were previously preserving ignored files when they should not have been;
>> > a future commit will flip these to instead use a value of 0 to get the
>> > behavior we want.
>> >
>> > Signed-off-by: Elijah Newren <newren@gmail.com>
>> > ---
>> >  builtin/am.c        |  3 +++
>> >  builtin/checkout.c  | 11 ++---------
>> >  builtin/clone.c     |  2 ++
>> >  builtin/merge.c     |  2 ++
>> >  builtin/read-tree.c | 13 +++----------
>> >  builtin/reset.c     |  2 ++
>> >  builtin/stash.c     |  3 +++
>> >  merge-ort.c         |  8 +-------
>> >  merge-recursive.c   |  8 +-------
>> >  merge.c             |  8 +-------
>> >  reset.c             |  2 ++
>> >  sequencer.c         |  2 ++
>> >  unpack-trees.c      | 10 ++++++++++
>> >  unpack-trees.h      |  1 +
>> >  14 files changed, 35 insertions(+), 40 deletions(-)
>> >
>> > diff --git a/builtin/am.c b/builtin/am.c
>> > index e4a0ff9cd7c..1ee70692bc3 100644
>> > --- a/builtin/am.c
>> > +++ b/builtin/am.c
>> > @@ -1918,6 +1918,9 @@ static int fast_forward_to(struct tree *head, struct tree *remote, int reset)
>> >       opts.update = 1;
>> >       opts.merge = 1;
>> >       opts.reset = reset;
>> > +     if (!reset)
>> > +             /* FIXME: Default should be to remove ignored files */
>> > +             opts.preserve_ignored = 1;
>> >       opts.fn = twoway_merge;
>> >       init_tree_desc(&t[0], head->buffer, head->size);
>> >       init_tree_desc(&t[1], remote->buffer, remote->size);
>> > diff --git a/builtin/checkout.c b/builtin/checkout.c
>> > index 5335435d616..5e7957dd068 100644
>> > --- a/builtin/checkout.c
>> > +++ b/builtin/checkout.c
>> > @@ -648,6 +648,7 @@ static int reset_tree(struct tree *tree, const struct checkout_opts *o,
>> >       opts.skip_unmerged = !worktree;
>> >       opts.reset = 1;
>> >       opts.merge = 1;
>> > +     opts.preserve_ignored = 0;
>> >       opts.fn = oneway_merge;
>> >       opts.verbose_update = o->show_progress;
>> >       opts.src_index = &the_index;
>> > @@ -746,11 +747,7 @@ static int merge_working_tree(const struct checkout_opts *opts,
>> >                                      new_branch_info->commit ?
>> >                                      &new_branch_info->commit->object.oid :
>> >                                      &new_branch_info->oid, NULL);
>> > -             if (opts->overwrite_ignore) {
>> > -                     topts.dir = xcalloc(1, sizeof(*topts.dir));
>> > -                     topts.dir->flags |= DIR_SHOW_IGNORED;
>> > -                     setup_standard_excludes(topts.dir);
>> > -             }
>> > +             topts.preserve_ignored = !opts->overwrite_ignore;
>> >               tree = parse_tree_indirect(old_branch_info->commit ?
>> >                                          &old_branch_info->commit->object.oid :
>> >                                          the_hash_algo->empty_tree);
>> > @@ -760,10 +757,6 @@ static int merge_working_tree(const struct checkout_opts *opts,
>> >               init_tree_desc(&trees[1], tree->buffer, tree->size);
>> >
>> >               ret = unpack_trees(2, trees, &topts);
>> > -             if (topts.dir) {
>> > -                     dir_clear(topts.dir);
>> > -                     FREE_AND_NULL(topts.dir);
>> > -             }
>> >               clear_unpack_trees_porcelain(&topts);
>> >               if (ret == -1) {
>> >                       /*
>> > diff --git a/builtin/clone.c b/builtin/clone.c
>> > index ff1d3d447a3..be1c3840d62 100644
>> > --- a/builtin/clone.c
>> > +++ b/builtin/clone.c
>> > @@ -687,6 +687,8 @@ static int checkout(int submodule_progress)
>> >       opts.update = 1;
>> >       opts.merge = 1;
>> >       opts.clone = 1;
>> > +     /* FIXME: Default should be to remove ignored files */
>> > +     opts.preserve_ignored = 1;
>> >       opts.fn = oneway_merge;
>> >       opts.verbose_update = (option_verbosity >= 0);
>> >       opts.src_index = &the_index;
>> > diff --git a/builtin/merge.c b/builtin/merge.c
>> > index 3fbdacc7db4..1e5fff095fc 100644
>> > --- a/builtin/merge.c
>> > +++ b/builtin/merge.c
>> > @@ -680,6 +680,8 @@ static int read_tree_trivial(struct object_id *common, struct object_id *head,
>> >       opts.verbose_update = 1;
>> >       opts.trivial_merges_only = 1;
>> >       opts.merge = 1;
>> > +     /* FIXME: Default should be to remove ignored files */
>> > +     opts.preserve_ignored = 1;
>> >       trees[nr_trees] = parse_tree_indirect(common);
>> >       if (!trees[nr_trees++])
>> >               return -1;
>> > diff --git a/builtin/read-tree.c b/builtin/read-tree.c
>> > index 73cb957a69b..443d206eca6 100644
>> > --- a/builtin/read-tree.c
>> > +++ b/builtin/read-tree.c
>> > @@ -201,11 +201,9 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
>> >       if ((opts.update || opts.index_only) && !opts.merge)
>> >               die("%s is meaningless without -m, --reset, or --prefix",
>> >                   opts.update ? "-u" : "-i");
>> > -     if (opts.update && !opts.reset) {
>> > -             CALLOC_ARRAY(opts.dir, 1);
>> > -             opts.dir->flags |= DIR_SHOW_IGNORED;
>> > -             setup_standard_excludes(opts.dir);
>> > -     }
>> > +     if (opts.update && !opts.reset)
>> > +             opts.preserve_ignored = 0;
>> > +     /* otherwise, opts.preserve_ignored is irrelevant */
>> >       if (opts.merge && !opts.index_only)
>> >               setup_work_tree();
>> >
>> > @@ -245,11 +243,6 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
>> >       if (unpack_trees(nr_trees, t, &opts))
>> >               return 128;
>> >
>> > -     if (opts.dir) {
>> > -             dir_clear(opts.dir);
>> > -             FREE_AND_NULL(opts.dir);
>> > -     }
>> > -
>> >       if (opts.debug_unpack || opts.dry_run)
>> >               return 0; /* do not write the index out */
>> >
>> > diff --git a/builtin/reset.c b/builtin/reset.c
>> > index 51c9e2f43ff..7f38656f018 100644
>> > --- a/builtin/reset.c
>> > +++ b/builtin/reset.c
>> > @@ -67,6 +67,8 @@ static int reset_index(const char *ref, const struct object_id *oid, int reset_t
>> >       case KEEP:
>> >       case MERGE:
>> >               opts.update = 1;
>> > +             /* FIXME: Default should be to remove ignored files */
>> > +             opts.preserve_ignored = 1;
>> >               break;
>> >       case HARD:
>> >               opts.update = 1;
>> > diff --git a/builtin/stash.c b/builtin/stash.c
>> > index 8f42360ca91..88287b890d5 100644
>> > --- a/builtin/stash.c
>> > +++ b/builtin/stash.c
>> > @@ -258,6 +258,9 @@ static int reset_tree(struct object_id *i_tree, int update, int reset)
>> >       opts.merge = 1;
>> >       opts.reset = reset;
>> >       opts.update = update;
>> > +     if (update && !reset)
>> > +             /* FIXME: Default should be to remove ignored files */
>> > +             opts.preserve_ignored = 1;
>> >       opts.fn = oneway_merge;
>> >
>> >       if (unpack_trees(nr_trees, t, &opts))
>> > diff --git a/merge-ort.c b/merge-ort.c
>> > index 35aa979c3a4..0d64ec716bd 100644
>> > --- a/merge-ort.c
>> > +++ b/merge-ort.c
>> > @@ -4045,11 +4045,7 @@ static int checkout(struct merge_options *opt,
>> >       unpack_opts.quiet = 0; /* FIXME: sequencer might want quiet? */
>> >       unpack_opts.verbose_update = (opt->verbosity > 2);
>> >       unpack_opts.fn = twoway_merge;
>> > -     if (1/* FIXME: opts->overwrite_ignore*/) {
>> > -             CALLOC_ARRAY(unpack_opts.dir, 1);
>> > -             unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
>> > -             setup_standard_excludes(unpack_opts.dir);
>> > -     }
>> > +     unpack_opts.preserve_ignored = 0; /* FIXME: !opts->overwrite_ignore*/
>> >       parse_tree(prev);
>> >       init_tree_desc(&trees[0], prev->buffer, prev->size);
>> >       parse_tree(next);
>> > @@ -4057,8 +4053,6 @@ static int checkout(struct merge_options *opt,
>> >
>> >       ret = unpack_trees(2, trees, &unpack_opts);
>> >       clear_unpack_trees_porcelain(&unpack_opts);
>> > -     dir_clear(unpack_opts.dir);
>> > -     FREE_AND_NULL(unpack_opts.dir);
>> >       return ret;
>> >  }
>> >
>> > diff --git a/merge-recursive.c b/merge-recursive.c
>> > index 233d9f686ad..2be3f5d4044 100644
>> > --- a/merge-recursive.c
>> > +++ b/merge-recursive.c
>> > @@ -411,9 +411,7 @@ static int unpack_trees_start(struct merge_options *opt,
>> >       else {
>> >               opt->priv->unpack_opts.update = 1;
>> >               /* FIXME: should only do this if !overwrite_ignore */
>> > -             CALLOC_ARRAY(opt->priv->unpack_opts.dir, 1);
>> > -             opt->priv->unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
>> > -             setup_standard_excludes(opt->priv->unpack_opts.dir);
>> > +             opt->priv->unpack_opts.preserve_ignored = 0;
>> >       }
>> >       opt->priv->unpack_opts.merge = 1;
>> >       opt->priv->unpack_opts.head_idx = 2;
>> > @@ -428,10 +426,6 @@ static int unpack_trees_start(struct merge_options *opt,
>> >       init_tree_desc_from_tree(t+2, merge);
>> >
>> >       rc = unpack_trees(3, t, &opt->priv->unpack_opts);
>> > -     if (opt->priv->unpack_opts.dir) {
>> > -             dir_clear(opt->priv->unpack_opts.dir);
>> > -             FREE_AND_NULL(opt->priv->unpack_opts.dir);
>> > -     }
>> >       cache_tree_free(&opt->repo->index->cache_tree);
>> >
>> >       /*
>> > diff --git a/merge.c b/merge.c
>> > index 6e736881d90..2382ff66d35 100644
>> > --- a/merge.c
>> > +++ b/merge.c
>> > @@ -53,7 +53,6 @@ int checkout_fast_forward(struct repository *r,
>> >       struct unpack_trees_options opts;
>> >       struct tree_desc t[MAX_UNPACK_TREES];
>> >       int i, nr_trees = 0;
>> > -     struct dir_struct dir = DIR_INIT;
>> >       struct lock_file lock_file = LOCK_INIT;
>> >
>> >       refresh_index(r->index, REFRESH_QUIET, NULL, NULL, NULL);
>> > @@ -80,11 +79,7 @@ int checkout_fast_forward(struct repository *r,
>> >       }
>> >
>> >       memset(&opts, 0, sizeof(opts));
>> > -     if (overwrite_ignore) {
>> > -             dir.flags |= DIR_SHOW_IGNORED;
>> > -             setup_standard_excludes(&dir);
>> > -             opts.dir = &dir;
>> > -     }
>> > +     opts.preserve_ignored = !overwrite_ignore;
>> >
>> >       opts.head_idx = 1;
>> >       opts.src_index = r->index;
>> > @@ -101,7 +96,6 @@ int checkout_fast_forward(struct repository *r,
>> >               clear_unpack_trees_porcelain(&opts);
>> >               return -1;
>> >       }
>> > -     dir_clear(&dir);
>> >       clear_unpack_trees_porcelain(&opts);
>> >
>> >       if (write_locked_index(r->index, &lock_file, COMMIT_LOCK))
>> > diff --git a/reset.c b/reset.c
>> > index 79310ae071b..41b3e2d88de 100644
>> > --- a/reset.c
>> > +++ b/reset.c
>> > @@ -56,6 +56,8 @@ int reset_head(struct repository *r, struct object_id *oid, const char *action,
>> >       unpack_tree_opts.fn = reset_hard ? oneway_merge : twoway_merge;
>> >       unpack_tree_opts.update = 1;
>> >       unpack_tree_opts.merge = 1;
>> > +     /* FIXME: Default should be to remove ignored files */
>> > +     unpack_tree_opts.preserve_ignored = 1;
>> >       init_checkout_metadata(&unpack_tree_opts.meta, switch_to_branch, oid, NULL);
>> >       if (!detach_head)
>> >               unpack_tree_opts.reset = 1;
>> > diff --git a/sequencer.c b/sequencer.c
>> > index 614d56f5e21..098566c68d9 100644
>> > --- a/sequencer.c
>> > +++ b/sequencer.c
>> > @@ -3699,6 +3699,8 @@ static int do_reset(struct repository *r,
>> >       unpack_tree_opts.fn = oneway_merge;
>> >       unpack_tree_opts.merge = 1;
>> >       unpack_tree_opts.update = 1;
>> > +     /* FIXME: Default should be to remove ignored files */
>> > +     unpack_tree_opts.preserve_ignored = 1;
>> >       init_checkout_metadata(&unpack_tree_opts.meta, name, &oid, NULL);
>> >
>> >       if (repo_read_index_unmerged(r)) {
>> > diff --git a/unpack-trees.c b/unpack-trees.c
>> > index 8ea0a542da8..1e4eae1dc7d 100644
>> > --- a/unpack-trees.c
>> > +++ b/unpack-trees.c
>> > @@ -1707,6 +1707,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>> >               ensure_full_index(o->dst_index);
>> >       }
>> >
>> > +     if (!o->preserve_ignored) {
>> > +             CALLOC_ARRAY(o->dir, 1);
>> > +             o->dir->flags |= DIR_SHOW_IGNORED;
>> > +             setup_standard_excludes(o->dir);
>> > +     }
>> > +
>> >       if (!core_apply_sparse_checkout || !o->update)
>> >               o->skip_sparse_checkout = 1;
>> >       if (!o->skip_sparse_checkout && !o->pl) {
>> > @@ -1868,6 +1874,10 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>> >  done:
>> >       if (free_pattern_list)
>> >               clear_pattern_list(&pl);
>> > +     if (o->dir) {
>> > +             dir_clear(o->dir);
>> > +             FREE_AND_NULL(o->dir);
>> > +     }
>> >       trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
>> >       trace_performance_leave("unpack_trees");
>> >       return ret;
>> > diff --git a/unpack-trees.h b/unpack-trees.h
>> > index 2d88b19dca7..f98cfd49d7b 100644
>> > --- a/unpack-trees.h
>> > +++ b/unpack-trees.h
>> > @@ -49,6 +49,7 @@ struct unpack_trees_options {
>> >       unsigned int reset,
>> >                    merge,
>> >                    update,
>> > +                  preserve_ignored,
>> >                    clone,
>> >                    index_only,
>> >                    nontrivial_merge,
>>
>> I think getting rid of the boilerplate makes sense, but it doesn't sound
>> from the commit message like you've considered just making that "struct
>> dir*" member a "struct dir" instead.
>>
>> That simplifies things a lot, i.e. we can just DIR_INIT it, and don't
>> need every caller to malloc/free it.
>
> See the next patch in the series.  :-)

Ah!

>> Sometimes a pointer makes sense, but in this case the "struct
>> unpack_trees_options" can just own it.
>
> I did make it internal to unpack_trees_options in the next patch, but
> kept it as a pointer just because that let me know whether it was used
> or not.  I guess I could have added a boolean as well.  But I don't
> actually allocate anything, because it's either a NULL pointer, or a
> pointer to something on the stack.  So, I do get to just use DIR_INIT.

I think I'm probably missing something. I just made it allocated on the
stack by the caller using "struct unpack_trees_options", but then you
end up having a dir* in the struct, but that's only filled in as a
pointer to the stack variable? Maybe there's some subtlety I'm missing
here...
Elijah Newren Sept. 30, 2021, 4:25 a.m. UTC | #4
On Wed, Sep 29, 2021 at 11:32 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Wed, Sep 29 2021, Elijah Newren wrote:
>
> > On Wed, Sep 29, 2021 at 2:27 AM Ævar Arnfjörð Bjarmason
> > <avarab@gmail.com> wrote:
> >>
...
> >>
> >> I think getting rid of the boilerplate makes sense, but it doesn't sound
> >> from the commit message like you've considered just making that "struct
> >> dir*" member a "struct dir" instead.
> >>
> >> That simplifies things a lot, i.e. we can just DIR_INIT it, and don't
> >> need every caller to malloc/free it.
> >
> > See the next patch in the series.  :-)
>
> Ah!
>
> >> Sometimes a pointer makes sense, but in this case the "struct
> >> unpack_trees_options" can just own it.
> >
> > I did make it internal to unpack_trees_options in the next patch, but
> > kept it as a pointer just because that let me know whether it was used
> > or not.  I guess I could have added a boolean as well.  But I don't
> > actually allocate anything, because it's either a NULL pointer, or a
> > pointer to something on the stack.  So, I do get to just use DIR_INIT.
>
> I think I'm probably missing something. I just made it allocated on the
> stack by the caller using "struct unpack_trees_options", but then you
> end up having a dir* in the struct, but that's only filled in as a
> pointer to the stack variable? Maybe there's some subtlety I'm missing
> here...

As per the next patch:

int unpack_trees(..., struct unpack_trees_options *o)
{
    struct dir_struct dir = DIR_INIT;
    ...
    if (!o->preserve_ignored) {
        /* Setup 'dir', make o->dir point to it */
        ....
        o->dir = &dir;
    }
    ...
    if (o->dir)
        /* cleanup */
    ....
}

The caller doesn't touch o->dir (other than initializing it to zeros);
unpack_trees() is wholly responsible for it.  I'd kind of like to
entirely remove dir from unpack_trees_options(), but I need a way of
passing it down through all the other functions in unpack-trees.c, and
leaving it in unpack_trees_options seems the easiest way to do so.  So
I just marked it as "for internal use only".
Ævar Arnfjörð Bjarmason Sept. 30, 2021, 2:04 p.m. UTC | #5
On Wed, Sep 29 2021, Elijah Newren wrote:

> On Wed, Sep 29, 2021 at 11:32 AM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> On Wed, Sep 29 2021, Elijah Newren wrote:
>>
>> > On Wed, Sep 29, 2021 at 2:27 AM Ævar Arnfjörð Bjarmason
>> > <avarab@gmail.com> wrote:
>> >>
> ...
>> >>
>> >> I think getting rid of the boilerplate makes sense, but it doesn't sound
>> >> from the commit message like you've considered just making that "struct
>> >> dir*" member a "struct dir" instead.
>> >>
>> >> That simplifies things a lot, i.e. we can just DIR_INIT it, and don't
>> >> need every caller to malloc/free it.
>> >
>> > See the next patch in the series.  :-)
>>
>> Ah!
>>
>> >> Sometimes a pointer makes sense, but in this case the "struct
>> >> unpack_trees_options" can just own it.
>> >
>> > I did make it internal to unpack_trees_options in the next patch, but
>> > kept it as a pointer just because that let me know whether it was used
>> > or not.  I guess I could have added a boolean as well.  But I don't
>> > actually allocate anything, because it's either a NULL pointer, or a
>> > pointer to something on the stack.  So, I do get to just use DIR_INIT.
>>
>> I think I'm probably missing something. I just made it allocated on the
>> stack by the caller using "struct unpack_trees_options", but then you
>> end up having a dir* in the struct, but that's only filled in as a
>> pointer to the stack variable? Maybe there's some subtlety I'm missing
>> here...
>
> As per the next patch:
>
> int unpack_trees(..., struct unpack_trees_options *o)
> {
>     struct dir_struct dir = DIR_INIT;
>     ...
>     if (!o->preserve_ignored) {
>         /* Setup 'dir', make o->dir point to it */
>         ....
>         o->dir = &dir;
>     }
>     ...
>     if (o->dir)
>         /* cleanup */
>     ....
> }
>
> The caller doesn't touch o->dir (other than initializing it to zeros);
> unpack_trees() is wholly responsible for it.  I'd kind of like to
> entirely remove dir from unpack_trees_options(), but I need a way of
> passing it down through all the other functions in unpack-trees.c, and
> leaving it in unpack_trees_options seems the easiest way to do so.  So
> I just marked it as "for internal use only".

I think I understand *how* it works, I'm puzzled by why you went for
this whole level of indirection when you're using a struct on the stack
in the end anyway, just ... put that in "struct unpack_trees_options"?

Anyway, I see I have only myself to blame here, as you added these leak
fixes in the v2 in response to some of my offhand comments.

FWIW I then went on to do some deeper fixes not just on these leaks but
the surrounding leaks, which will be blocked by 2/11 & 05/11 of this
topic for a while. I suppose I only have myself to blame :)

Below is a patch-on-top that I think makes this whole thing much simpler
by doing away with the pointer entirely.

I suppose this is also a partial reply to
https://lore.kernel.org/git/CABPp-BG_qigBoirMGR-Yk9Niyxt0UmYCEqojsYxbSEarLAmraA@mail.gmail.com/;
but I quite dislike this pattern of including a pointer like this where
it's not needed just for the practicalities of memory management.

I.e. here you use DIR_INIT. In my local patches to fix up the wider
memory leaks in this area I've got DIR_INIT also using a STRBUF_INIT,
and DIR_INIT will in turn be referenced by a
UNPACK_TREES_OPTIONS_INIT. It's quite nice if you're having to
initialize with "UNPACK_TREES_OPTIONS_INIT" have that initialization
work all the way down the chain, and not need e.g. a manual
strbuf_init(), dir_init() etc.

I removed the dir_init() in ce93a4c6127 (dir.[ch]: replace dir_init()
with DIR_INIT, 2021-07-01), but would probably need to bring it back, of
course you need some "release()" method for the
UNPACK_TREES_OPTIONS_INIT, which in turn needs to call the dir_release()
(well, "dir_clear()" in that case), and it needs to call
"strbuf_release()". It's just nicer if that boilerplate is all on
destruction, but not also on struct/object setup.

We do need that setup in some cases (although a lot could just be
replaced by lazy initialization), but if we don't....

diff --git a/unpack-trees.c b/unpack-trees.c
index a7e1712d236..de5cc6cd025 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1694,15 +1694,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 	static struct cache_entry *dfc;
 	struct pattern_list pl;
 	int free_pattern_list = 0;
-	struct dir_struct dir = DIR_INIT;
 
 	if (o->reset == UNPACK_RESET_INVALID)
 		BUG("o->reset had a value of 1; should be UNPACK_TREES_*_UNTRACKED");
 
 	if (len > MAX_UNPACK_TREES)
 		die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
-	if (o->dir)
-		BUG("o->dir is for internal use only");
 
 	trace_performance_enter();
 	trace2_region_enter("unpack_trees", "unpack_trees", the_repository);
@@ -1718,9 +1715,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		BUG("UNPACK_RESET_OVERWRITE_UNTRACKED incompatible with preserved ignored files");
 
 	if (!o->preserve_ignored) {
-		o->dir = &dir;
-		o->dir->flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(o->dir);
+		o->dir.flags |= DIR_SHOW_IGNORED;
+		setup_standard_excludes(&o->dir);
 	}
 
 	if (!core_apply_sparse_checkout || !o->update)
@@ -1884,10 +1880,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 done:
 	if (free_pattern_list)
 		clear_pattern_list(&pl);
-	if (o->dir) {
-		dir_clear(o->dir);
-		o->dir = NULL;
-	}
+	dir_clear(&o->dir);
 	trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
 	trace_performance_leave("unpack_trees");
 	return ret;
@@ -2153,8 +2146,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
 	pathbuf = xstrfmt("%.*s/", namelen, ce->name);
 
 	memset(&d, 0, sizeof(d));
-	if (o->dir)
-		d.exclude_per_dir = o->dir->exclude_per_dir;
+	d.exclude_per_dir = o->dir.exclude_per_dir;
 	i = read_directory(&d, o->src_index, pathbuf, namelen+1, NULL);
 	if (i)
 		return add_rejected_path(o, ERROR_NOT_UPTODATE_DIR, ce->name);
@@ -2201,8 +2193,7 @@ static int check_ok_to_remove(const char *name, int len, int dtype,
 	if (ignore_case && icase_exists(o, name, len, st))
 		return 0;
 
-	if (o->dir &&
-	    is_excluded(o->dir, o->src_index, name, &dtype))
+	if (is_excluded(&o->dir, o->src_index, name, &dtype))
 		/*
 		 * ce->name is explicitly excluded, so it is Ok to
 		 * overwrite it.
diff --git a/unpack-trees.h b/unpack-trees.h
index 71ffb7eeb0c..a8afbb20170 100644
--- a/unpack-trees.h
+++ b/unpack-trees.h
@@ -5,6 +5,7 @@
 #include "strvec.h"
 #include "string-list.h"
 #include "tree-walk.h"
+#include "dir.h"
 
 #define MAX_UNPACK_TREES MAX_TRAVERSE_TREES
 
@@ -95,7 +96,7 @@ struct unpack_trees_options {
 	struct index_state result;
 
 	struct pattern_list *pl; /* for internal use */
-	struct dir_struct *dir; /* for internal use only */
+	struct dir_struct dir; /* for internal use only */
 	struct checkout_metadata meta;
 };
Elijah Newren Oct. 1, 2021, 1:53 a.m. UTC | #6
On Thu, Sep 30, 2021 at 7:15 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Wed, Sep 29 2021, Elijah Newren wrote:
>
> > On Wed, Sep 29, 2021 at 11:32 AM Ævar Arnfjörð Bjarmason
> > <avarab@gmail.com> wrote:
> >>
> >> On Wed, Sep 29 2021, Elijah Newren wrote:
> >>
...
> > As per the next patch:
> >
> > int unpack_trees(..., struct unpack_trees_options *o)
> > {
> >     struct dir_struct dir = DIR_INIT;
> >     ...
> >     if (!o->preserve_ignored) {
> >         /* Setup 'dir', make o->dir point to it */
> >         ....
> >         o->dir = &dir;
> >     }
> >     ...
> >     if (o->dir)
> >         /* cleanup */
> >     ....
> > }
> >
> > The caller doesn't touch o->dir (other than initializing it to zeros);
> > unpack_trees() is wholly responsible for it.  I'd kind of like to
> > entirely remove dir from unpack_trees_options(), but I need a way of
> > passing it down through all the other functions in unpack-trees.c, and
> > leaving it in unpack_trees_options seems the easiest way to do so.  So
> > I just marked it as "for internal use only".
>
> I think I understand *how* it works, I'm puzzled by why you went for
> this whole level of indirection when you're using a struct on the stack
> in the end anyway, just ... put that in "struct unpack_trees_options"?
>
> Anyway, I see I have only myself to blame here, as you added these leak
> fixes in the v2 in response to some of my offhand comments.
>
> FWIW I then went on to do some deeper fixes not just on these leaks but
> the surrounding leaks, which will be blocked by 2/11 & 05/11 of this
> topic for a while. I suppose I only have myself to blame :)
>
> Below is a patch-on-top that I think makes this whole thing much simpler
> by doing away with the pointer entirely.
>
> I suppose this is also a partial reply to
> https://lore.kernel.org/git/CABPp-BG_qigBoirMGR-Yk9Niyxt0UmYCEqojsYxbSEarLAmraA@mail.gmail.com/;
> but I quite dislike this pattern of including a pointer like this where
> it's not needed just for the practicalities of memory management.
>
> I.e. here you use DIR_INIT. In my local patches to fix up the wider
> memory leaks in this area I've got DIR_INIT also using a STRBUF_INIT,
> and DIR_INIT will in turn be referenced by a
> UNPACK_TREES_OPTIONS_INIT. It's quite nice if you're having to
> initialize with "UNPACK_TREES_OPTIONS_INIT" have that initialization
> work all the way down the chain, and not need e.g. a manual
> strbuf_init(), dir_init() etc.

And you can keep using UNPACK_TREES_OPTIONS_INIT, because the
unpack_trees_opts->dir should be initialized to NULL.

> I removed the dir_init() in ce93a4c6127 (dir.[ch]: replace dir_init()
> with DIR_INIT, 2021-07-01),

I might be going on a tangent here, but looking at that patch, I'm
worried that dir_init() was buggy and that you perpetuated that bug
with DIR_INIT.  Note that dir_struct has a struct strbuf basebuf
member, which neither dir_init() or DIR_INIT initialize properly
(using either strbuf_init() or STRBUF_INIT).  As far as I can tell,
dir.c relies on either strbuf_add() calls to just happen to work with
this incorrectly initialized strbuf, or else use the strbuf_init()
call in prep_exclude() to do so, using the following snippet:

    if (!dir->basebuf.buf)
        strbuf_init(&dir->basebuf, PATH_MAX);

However, earlier in that same function we see

    if (stk->baselen <= baselen &&
        !strncmp(dir->basebuf.buf, base, stk->baselen))
            break;

So either that function can never have dir->basebuf.buf be NULL and
the strbuf_init() is dead code, or else it's possible for us to
trigger a segfault.  If it's the former, it may just be a ticking time
bomb that will transform into the latter with some other change,
because it's not at all obvious to me how dir->basebuf gets
initialized appropriately to avoid that strncmp call.  Perhaps there
is some invariant where exclude_stack is only set up by previous calls
to prep_exclude() and those won't set up exclude_stack until first
initializing basebuf.  But that really at least deserves a comment
about how we're abusing basebuf, and would probably be cleaner if we
initialized basebuf to STRBUF_INIT.

> but would probably need to bring it back, of

If you need to bring it back, it's unrelated to my changes here, and
would only be because of the lack of basebuf initialization above.

> course you need some "release()" method for the
> UNPACK_TREES_OPTIONS_INIT, which in turn needs to call the dir_release()
> (well, "dir_clear()" in that case), and it needs to call
> "strbuf_release()". It's just nicer if that boilerplate is all on
> destruction, but not also on struct/object setup.

The caller should *not* be initializing or tearing down
unpack_trees_options->dir beyond setting that field to NULL; it should
then leave it alone.

> We do need that setup in some cases (although a lot could just be
> replaced by lazy initialization), but if we don't....
>
> diff --git a/unpack-trees.c b/unpack-trees.c
> index a7e1712d236..de5cc6cd025 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -1694,15 +1694,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>         static struct cache_entry *dfc;
>         struct pattern_list pl;
>         int free_pattern_list = 0;
> -       struct dir_struct dir = DIR_INIT;
>
>         if (o->reset == UNPACK_RESET_INVALID)
>                 BUG("o->reset had a value of 1; should be UNPACK_TREES_*_UNTRACKED");
>
>         if (len > MAX_UNPACK_TREES)
>                 die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
> -       if (o->dir)
> -               BUG("o->dir is for internal use only");

I think this was an important check that you've tossed without
replacement.  Historically, callers set up and tweaked o->dir with
various values.  With my patch, we are no longer allowing that..which
introduces a transition problem -- people might have written or are
now writing patches that make new calls of unpack_trees() previous to
this change of mine, but submit them after this change of mine gets
merged.  Without this check I added, they'd probably just do a
mechanical `o->dir->` change to `o->dir.` and assume it's good...and
then possibly have ugly bugs to hunt down.

So, I think it's helpful to have a check that provides an early
warning that tweaking o->dir is not only no longer required, but also
no longer allowed.

>         trace_performance_enter();
>         trace2_region_enter("unpack_trees", "unpack_trees", the_repository);
> @@ -1718,9 +1715,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>                 BUG("UNPACK_RESET_OVERWRITE_UNTRACKED incompatible with preserved ignored files");
>
>         if (!o->preserve_ignored) {
> -               o->dir = &dir;
> -               o->dir->flags |= DIR_SHOW_IGNORED;
> -               setup_standard_excludes(o->dir);
> +               o->dir.flags |= DIR_SHOW_IGNORED;
> +               setup_standard_excludes(&o->dir);
>         }
>
>         if (!core_apply_sparse_checkout || !o->update)
> @@ -1884,10 +1880,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>  done:
>         if (free_pattern_list)
>                 clear_pattern_list(&pl);
> -       if (o->dir) {
> -               dir_clear(o->dir);
> -               o->dir = NULL;
> -       }
> +       dir_clear(&o->dir);

Unconditionally calling dir_clear()...

>         trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
>         trace_performance_leave("unpack_trees");
>         return ret;
> @@ -2153,8 +2146,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
>         pathbuf = xstrfmt("%.*s/", namelen, ce->name);
>
>         memset(&d, 0, sizeof(d));
> -       if (o->dir)
> -               d.exclude_per_dir = o->dir->exclude_per_dir;
> +       d.exclude_per_dir = o->dir.exclude_per_dir;
>         i = read_directory(&d, o->src_index, pathbuf, namelen+1, NULL);
>         if (i)
>                 return add_rejected_path(o, ERROR_NOT_UPTODATE_DIR, ce->name);
> @@ -2201,8 +2193,7 @@ static int check_ok_to_remove(const char *name, int len, int dtype,
>         if (ignore_case && icase_exists(o, name, len, st))
>                 return 0;
>
> -       if (o->dir &&
> -           is_excluded(o->dir, o->src_index, name, &dtype))
> +       if (is_excluded(&o->dir, o->src_index, name, &dtype))

Unconditionally calling is_excluded()...

>                 /*
>                  * ce->name is explicitly excluded, so it is Ok to
>                  * overwrite it.
> diff --git a/unpack-trees.h b/unpack-trees.h
> index 71ffb7eeb0c..a8afbb20170 100644
> --- a/unpack-trees.h
> +++ b/unpack-trees.h
> @@ -5,6 +5,7 @@
>  #include "strvec.h"
>  #include "string-list.h"
>  #include "tree-walk.h"
> +#include "dir.h"
>
>  #define MAX_UNPACK_TREES MAX_TRAVERSE_TREES
>
> @@ -95,7 +96,7 @@ struct unpack_trees_options {
>         struct index_state result;
>
>         struct pattern_list *pl; /* for internal use */
> -       struct dir_struct *dir; /* for internal use only */
> +       struct dir_struct dir; /* for internal use only */
>         struct checkout_metadata meta;
>  };
>

Not only did you drop the important safety check that o->dir not be
setup by the caller (which needs to be reinstated in some form), your
solution also involves unconditionally calling dir_clear() and
is_excluded().  It is not clear to me that those calls are safe...and
that they will continue to be safe in the future.  Even if it is safe
and will continue to be, I don't think this should be squashed into my
patches.  I think it should be a separate patch with its own commit
message that explicitly calls out this assumption.  Especially since
this is dir.c, which is an area where attempting to fix one very
simple little bug results in years of refactoring and fixing all kinds
of historical messes, sometimes waiting a year and a half for
responses to RFCs/review requests, and where we have to sometimes just
give up on attempting to understand the purpose of various bits of
code and instead rely on the regression tests and hope they are good
enough.  I still think that dir.c deserves a little warning at the
top, like the one I suggested in [1].

[1] https://lore.kernel.org/git/CABPp-BFiwzzUgiTj_zu+vF5x20L0=1cf25cHwk7KZQj2YkVzXw@mail.gmail.com/
Ævar Arnfjörð Bjarmason Oct. 1, 2021, 8:15 a.m. UTC | #7
On Thu, Sep 30 2021, Elijah Newren wrote:

> On Thu, Sep 30, 2021 at 7:15 AM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> On Wed, Sep 29 2021, Elijah Newren wrote:
>>
>> > On Wed, Sep 29, 2021 at 11:32 AM Ævar Arnfjörð Bjarmason
>> > <avarab@gmail.com> wrote:
>> >>
>> >> On Wed, Sep 29 2021, Elijah Newren wrote:
>> >>
> ...
>> > As per the next patch:
>> >
>> > int unpack_trees(..., struct unpack_trees_options *o)
>> > {
>> >     struct dir_struct dir = DIR_INIT;
>> >     ...
>> >     if (!o->preserve_ignored) {
>> >         /* Setup 'dir', make o->dir point to it */
>> >         ....
>> >         o->dir = &dir;
>> >     }
>> >     ...
>> >     if (o->dir)
>> >         /* cleanup */
>> >     ....
>> > }
>> >
>> > The caller doesn't touch o->dir (other than initializing it to zeros);
>> > unpack_trees() is wholly responsible for it.  I'd kind of like to
>> > entirely remove dir from unpack_trees_options(), but I need a way of
>> > passing it down through all the other functions in unpack-trees.c, and
>> > leaving it in unpack_trees_options seems the easiest way to do so.  So
>> > I just marked it as "for internal use only".
>>
>> I think I understand *how* it works, I'm puzzled by why you went for
>> this whole level of indirection when you're using a struct on the stack
>> in the end anyway, just ... put that in "struct unpack_trees_options"?
>>
>> Anyway, I see I have only myself to blame here, as you added these leak
>> fixes in the v2 in response to some of my offhand comments.
>>
>> FWIW I then went on to do some deeper fixes not just on these leaks but
>> the surrounding leaks, which will be blocked by 2/11 & 05/11 of this
>> topic for a while. I suppose I only have myself to blame :)
>>
>> Below is a patch-on-top that I think makes this whole thing much simpler
>> by doing away with the pointer entirely.
>>
>> I suppose this is also a partial reply to
>> https://lore.kernel.org/git/CABPp-BG_qigBoirMGR-Yk9Niyxt0UmYCEqojsYxbSEarLAmraA@mail.gmail.com/;
>> but I quite dislike this pattern of including a pointer like this where
>> it's not needed just for the practicalities of memory management.
>>
>> I.e. here you use DIR_INIT. In my local patches to fix up the wider
>> memory leaks in this area I've got DIR_INIT also using a STRBUF_INIT,
>> and DIR_INIT will in turn be referenced by a
>> UNPACK_TREES_OPTIONS_INIT. It's quite nice if you're having to
>> initialize with "UNPACK_TREES_OPTIONS_INIT" have that initialization
>> work all the way down the chain, and not need e.g. a manual
>> strbuf_init(), dir_init() etc.
>
> And you can keep using UNPACK_TREES_OPTIONS_INIT, because the
> unpack_trees_opts->dir should be initialized to NULL.

But I don't want it initialized to NULL, I want DIR_INIT....

>> I removed the dir_init() in ce93a4c6127 (dir.[ch]: replace dir_init()
>> with DIR_INIT, 2021-07-01),
>
> I might be going on a tangent here, but looking at that patch, I'm
> worried that dir_init() was buggy and that you perpetuated that bug
> with DIR_INIT.  Note that dir_struct has a struct strbuf basebuf
> member, which neither dir_init() or DIR_INIT initialize properly
> (using either strbuf_init() or STRBUF_INIT).  As far as I can tell,
> dir.c relies on either strbuf_add() calls to just happen to work with
> this incorrectly initialized strbuf, or else use the strbuf_init()
> call in prep_exclude() to do so, using the following snippet:
>
>     if (!dir->basebuf.buf)
>         strbuf_init(&dir->basebuf, PATH_MAX);
>
> However, earlier in that same function we see
>
>     if (stk->baselen <= baselen &&
>         !strncmp(dir->basebuf.buf, base, stk->baselen))
>             break;
>
> So either that function can never have dir->basebuf.buf be NULL and
> the strbuf_init() is dead code, or else it's possible for us to
> trigger a segfault.  If it's the former, it may just be a ticking time
> bomb that will transform into the latter with some other change,
> because it's not at all obvious to me how dir->basebuf gets
> initialized appropriately to avoid that strncmp call.  Perhaps there
> is some invariant where exclude_stack is only set up by previous calls
> to prep_exclude() and those won't set up exclude_stack until first
> initializing basebuf.  But that really at least deserves a comment
> about how we're abusing basebuf, and would probably be cleaner if we
> initialized basebuf to STRBUF_INIT.

...because yes, I forgot about that when sending you the diff-on-top,
sorry. Yes that's buggy with the diff-on-top I sent you.

I've got that fixed in the version I have. I.e. first I add a
UNPACK_TREES_OPTIONS_INIT macro, then deal with that lazy initialization
case (at which point DIR_INIT starts initializing that strbuf), then
change the "dir_struct" from a pointer to embedding it, and finally fix
a memory leak with that new API.

WIP patches here:
https://github.com/avar/git/compare/avar/post-sanitize-leak-test-mode-add-and-use-revisions-release...avar/post-sanitize-leak-test-mode-unpack-trees-and-dir

>> but would probably need to bring it back, of
>
> If you need to bring it back, it's unrelated to my changes here, and
> would only be because of the lack of basebuf initialization above.

Yes, in this case. I mean that generally speaking I think it's a good
pattern to use to have structs be initialized by macros like this,
because it means you can embed them N levels deep (as we sometimes do)
without having to call functions to initialize them.

So yes, in this case as long as DIR_INIT is { 0 } it doesn't matter, but
it does as soon as it has a member that needs initialization, and
generally speaking for any FOO_INIT that needs a BAR_INIT ....

>> course you need some "release()" method for the
>> UNPACK_TREES_OPTIONS_INIT, which in turn needs to call the dir_release()
>> (well, "dir_clear()" in that case), and it needs to call
>> "strbuf_release()". It's just nicer if that boilerplate is all on
>> destruction, but not also on struct/object setup.
>
> The caller should *not* be initializing or tearing down
> unpack_trees_options->dir beyond setting that field to NULL; it should
> then leave it alone.

s/NULL/DIR_INIT/ in my version, but yes.

>> We do need that setup in some cases (although a lot could just be
>> replaced by lazy initialization), but if we don't....
>>
>> diff --git a/unpack-trees.c b/unpack-trees.c
>> index a7e1712d236..de5cc6cd025 100644
>> --- a/unpack-trees.c
>> +++ b/unpack-trees.c
>> @@ -1694,15 +1694,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>>         static struct cache_entry *dfc;
>>         struct pattern_list pl;
>>         int free_pattern_list = 0;
>> -       struct dir_struct dir = DIR_INIT;
>>
>>         if (o->reset == UNPACK_RESET_INVALID)
>>                 BUG("o->reset had a value of 1; should be UNPACK_TREES_*_UNTRACKED");
>>
>>         if (len > MAX_UNPACK_TREES)
>>                 die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
>> -       if (o->dir)
>> -               BUG("o->dir is for internal use only");
>
> I think this was an important check that you've tossed without
> replacement.  Historically, callers set up and tweaked o->dir with
> various values.  With my patch, we are no longer allowing that..which
> introduces a transition problem -- people might have written or are
> now writing patches that make new calls of unpack_trees() previous to
> this change of mine, but submit them after this change of mine gets
> merged.  Without this check I added, they'd probably just do a
> mechanical `o->dir->` change to `o->dir.` and assume it's good...and
> then possibly have ugly bugs to hunt down.
>
> So, I think it's helpful to have a check that provides an early
> warning that tweaking o->dir is not only no longer required, but also
> no longer allowed.

The compiler will catch any such use of the pointer version on a
mis-merge, or do you just mean that the person running into that might
get the resolution wrong? I.e. before we could check o->dir being NULL
for "do we have an exclude", but &o->dir will always be true?

>>         trace_performance_enter();
>>         trace2_region_enter("unpack_trees", "unpack_trees", the_repository);
>> @@ -1718,9 +1715,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>>                 BUG("UNPACK_RESET_OVERWRITE_UNTRACKED incompatible with preserved ignored files");
>>
>>         if (!o->preserve_ignored) {
>> -               o->dir = &dir;
>> -               o->dir->flags |= DIR_SHOW_IGNORED;
>> -               setup_standard_excludes(o->dir);
>> +               o->dir.flags |= DIR_SHOW_IGNORED;
>> +               setup_standard_excludes(&o->dir);
>>         }
>>
>>         if (!core_apply_sparse_checkout || !o->update)
>> @@ -1884,10 +1880,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>>  done:
>>         if (free_pattern_list)
>>                 clear_pattern_list(&pl);
>> -       if (o->dir) {
>> -               dir_clear(o->dir);
>> -               o->dir = NULL;
>> -       }
>> +       dir_clear(&o->dir);
>
> Unconditionally calling dir_clear()...

As before I'm not sure about bugs in the ad-hoc patch on top, but I
don't think this is a bug in my version linked above.

I.e. it's zero'd out, and the dir_clear() either ends up calling
free(NULL) or tries to loop over 0..N where N will be 0, no?

>>         trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
>>         trace_performance_leave("unpack_trees");
>>         return ret;
>> @@ -2153,8 +2146,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
>>         pathbuf = xstrfmt("%.*s/", namelen, ce->name);
>>
>>         memset(&d, 0, sizeof(d));
>> -       if (o->dir)
>> -               d.exclude_per_dir = o->dir->exclude_per_dir;
>> +       d.exclude_per_dir = o->dir.exclude_per_dir;
>>         i = read_directory(&d, o->src_index, pathbuf, namelen+1, NULL);
>>         if (i)
>>                 return add_rejected_path(o, ERROR_NOT_UPTODATE_DIR, ce->name);
>> @@ -2201,8 +2193,7 @@ static int check_ok_to_remove(const char *name, int len, int dtype,
>>         if (ignore_case && icase_exists(o, name, len, st))
>>                 return 0;
>>
>> -       if (o->dir &&
>> -           is_excluded(o->dir, o->src_index, name, &dtype))
>> +       if (is_excluded(&o->dir, o->src_index, name, &dtype))
>
> Unconditionally calling is_excluded()...

Which will just return "it's not", won't it? Just lik dir_clear() deals
with an "empty" dir_struct. There's existing callers of both with that
pattern in e.g. builtin/{add,clean}.c.

Maybe I've missed an edge case, but I think the only reason that "o->dir
&&" was there was because it was dynamically malloc'd before, but in my
version where we'll always have it initialized...

>>                 /*
>>                  * ce->name is explicitly excluded, so it is Ok to
>>                  * overwrite it.
>> diff --git a/unpack-trees.h b/unpack-trees.h
>> index 71ffb7eeb0c..a8afbb20170 100644
>> --- a/unpack-trees.h
>> +++ b/unpack-trees.h
>> @@ -5,6 +5,7 @@
>>  #include "strvec.h"
>>  #include "string-list.h"
>>  #include "tree-walk.h"
>> +#include "dir.h"
>>
>>  #define MAX_UNPACK_TREES MAX_TRAVERSE_TREES
>>
>> @@ -95,7 +96,7 @@ struct unpack_trees_options {
>>         struct index_state result;
>>
>>         struct pattern_list *pl; /* for internal use */
>> -       struct dir_struct *dir; /* for internal use only */
>> +       struct dir_struct dir; /* for internal use only */
>>         struct checkout_metadata meta;
>>  };
>>
>
> Not only did you drop the important safety check that o->dir not be
> setup by the caller (which needs to be reinstated in some form), your
> solution also involves unconditionally calling dir_clear() and
> is_excluded().  It is not clear to me that those calls are safe...and
> that they will continue to be safe in the future.

It is a common pattern we rely on, e.g. strbuf_release() and various
other custom free-like functions generally act as NOOP if they've got
nothing to do, just like free()...

> Even if it is safe
> and will continue to be, I don't think this should be squashed into my
> patches.  I think it should be a separate patch with its own commit
> message that explicitly calls out this assumption.  Especially since
> this is dir.c, which is an area where attempting to fix one very
> simple little bug results in years of refactoring and fixing all kinds
> of historical messes, sometimes waiting a year and a half for
> responses to RFCs/review requests, and where we have to sometimes just
> give up on attempting to understand the purpose of various bits of
> code and instead rely on the regression tests and hope they are good
> enough.  I still think that dir.c deserves a little warning at the
> top, like the one I suggested in [1].
>
> [1] https://lore.kernel.org/git/CABPp-BFiwzzUgiTj_zu+vF5x20L0=1cf25cHwk7KZQj2YkVzXw@mail.gmail.com/

*nod* I can always submit something like this afterwards.

Just on this series: Perhaps this discussion is a sign that this memory
leak fixing should be its own cleanup series where we could hash out any
approaches to doing that? I.e. as noted before I realize I'm to blame
for suggesting it in the first place, but those parts of these changes
don't seem like they're needed by other parts of the series (I tried
compiling with the two relevant patches ejected out).

Having a humongous set of memory leak fixes locally at this point, I
think it's generally not very worth the effort to fix a leak in b()
where a() calls b() and b() calls c(), and all of [abc]() are
leaking. I.e. often narrowly fixing leaks in b() will lead to different
solutions than if you're trying to resolve all of [abc](), as their
interaction comes into play.

Aside about safety: One thing I'll sometimes do when I'm unsure about
those sorts of fixes is to have my new INIT set a new "sentinel" field
to "12345" or whatever, then just BUG() out in an entry point in the API
that you can't avoid calling if it's not set like that, e.g. dir_clear()
or whatever the setup/work function is.

We don't have 100% test coverage, but we usually have at least *some*,
and doing that is good about catching e.g. a memset() at a distance, as
happens in this code with the merge code embedding the relevant struct
and memsetting it, which might be missed in some migration of just a
grep for "struct dir_struct" or whatever...
Ævar Arnfjörð Bjarmason Oct. 1, 2021, 9:53 a.m. UTC | #8
On Fri, Oct 01 2021, Ævar Arnfjörð Bjarmason wrote:

> Aside about safety: One thing I'll sometimes do when I'm unsure about
> those sorts of fixes is to have my new INIT set a new "sentinel" field
> to "12345" or whatever, then just BUG() out in an entry point in the API
> that you can't avoid calling if it's not set like that, e.g. dir_clear()
> or whatever the setup/work function is.

For reference: Something like the below, which passes with my WIP
patches. Showing that no non-static entry point can reach the code in
unpack-trees.c without "foo" being 12345, which can only be the case if
callers have used the macro (and the code internal to unpack-trees.c is
easy enough to audit).

 unpack-trees.c | 25 +++++++++++++++++++++++++
 unpack-trees.h |  2 ++
 2 files changed, 27 insertions(+)

diff --git a/unpack-trees.c b/unpack-trees.c
index d40af221e1c..f2365ecf215 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -199,6 +199,8 @@ void clear_unpack_trees_porcelain(struct unpack_trees_options *opts)
 {
 	strvec_clear(&opts->msgs_to_free);
 	dir_clear(&opts->dir);
+	if (opts->foo != 12345)
+		BUG("noes");
 	memset(opts->msgs, 0, sizeof(opts->msgs));
 }
 
@@ -1702,6 +1704,9 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 	struct pattern_list pl;
 	int free_pattern_list = 0;
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	if (len > MAX_UNPACK_TREES)
 		die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
 
@@ -1903,6 +1908,9 @@ enum update_sparsity_result update_sparsity(struct unpack_trees_options *o)
 	unsigned old_show_all_errors;
 	int free_pattern_list = 0;
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	old_show_all_errors = o->show_all_errors;
 	o->show_all_errors = 1;
 
@@ -2033,6 +2041,8 @@ static int verify_uptodate_1(const struct cache_entry *ce,
 int verify_uptodate(const struct cache_entry *ce,
 		    struct unpack_trees_options *o)
 {
+	if (o->foo != 12345)
+		BUG("noes");
 	if (!o->skip_sparse_checkout && (ce->ce_flags & CE_NEW_SKIP_WORKTREE))
 		return 0;
 	return verify_uptodate_1(ce, o, ERROR_NOT_UPTODATE_FILE);
@@ -2417,6 +2427,9 @@ int threeway_merge(const struct cache_entry * const *stages,
 	int no_anc_exists = 1;
 	int i;
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	for (i = 1; i < o->head_idx; i++) {
 		if (!stages[i] || stages[i] == o->df_conflict_entry)
 			any_anc_missing = 1;
@@ -2580,6 +2593,9 @@ int twoway_merge(const struct cache_entry * const *src,
 	const struct cache_entry *oldtree = src[1];
 	const struct cache_entry *newtree = src[2];
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	if (o->merge_size != 2)
 		return error("Cannot do a twoway merge of %d trees",
 			     o->merge_size);
@@ -2654,6 +2670,9 @@ int bind_merge(const struct cache_entry * const *src,
 	const struct cache_entry *old = src[0];
 	const struct cache_entry *a = src[1];
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	if (o->merge_size != 1)
 		return error("Cannot do a bind merge of %d trees",
 			     o->merge_size);
@@ -2680,6 +2699,9 @@ int oneway_merge(const struct cache_entry * const *src,
 	const struct cache_entry *old = src[0];
 	const struct cache_entry *a = src[1];
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	if (o->merge_size != 1)
 		return error("Cannot do a oneway merge of %d trees",
 			     o->merge_size);
@@ -2717,6 +2739,9 @@ int stash_worktree_untracked_merge(const struct cache_entry * const *src,
 	const struct cache_entry *worktree = src[1];
 	const struct cache_entry *untracked = src[2];
 
+	if (o->foo != 12345)
+		BUG("noes");
+
 	if (o->merge_size != 2)
 		BUG("invalid merge_size: %d", o->merge_size);
 
diff --git a/unpack-trees.h b/unpack-trees.h
index 75b67f90ccd..8dae0938ad1 100644
--- a/unpack-trees.h
+++ b/unpack-trees.h
@@ -90,10 +90,12 @@ struct unpack_trees_options {
 
 	struct pattern_list *pl; /* for internal use */
 	struct checkout_metadata meta;
+	int foo;
 };
 #define UNPACK_TREES_OPTIONS_INIT { \
 	.msgs_to_free = STRVEC_INIT, \
 	.dir = DIR_INIT, \
+	.foo = 12345, \
 }
 
 void unpack_trees_init(struct unpack_trees_options *options);
Elijah Newren Oct. 1, 2021, 6:50 p.m. UTC | #9
On Fri, Oct 1, 2021 at 1:47 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
> On Thu, Sep 30 2021, Elijah Newren wrote:
>
> > On Thu, Sep 30, 2021 at 7:15 AM Ævar Arnfjörð Bjarmason
> > <avarab@gmail.com> wrote:
> >>
> >> On Wed, Sep 29 2021, Elijah Newren wrote:
> >>
> >> > On Wed, Sep 29, 2021 at 11:32 AM Ævar Arnfjörð Bjarmason
> >> > <avarab@gmail.com> wrote:
> >> >>
> >> >> On Wed, Sep 29 2021, Elijah Newren wrote:
> >> >>
> > ...
> >> > As per the next patch:
> >> >
> >> > int unpack_trees(..., struct unpack_trees_options *o)
> >> > {
> >> >     struct dir_struct dir = DIR_INIT;
> >> >     ...
> >> >     if (!o->preserve_ignored) {
> >> >         /* Setup 'dir', make o->dir point to it */
> >> >         ....
> >> >         o->dir = &dir;
> >> >     }
> >> >     ...
> >> >     if (o->dir)
> >> >         /* cleanup */
> >> >     ....
> >> > }
> >> >
> >> > The caller doesn't touch o->dir (other than initializing it to zeros);
> >> > unpack_trees() is wholly responsible for it.  I'd kind of like to
> >> > entirely remove dir from unpack_trees_options(), but I need a way of
> >> > passing it down through all the other functions in unpack-trees.c, and
> >> > leaving it in unpack_trees_options seems the easiest way to do so.  So
> >> > I just marked it as "for internal use only".
> >>
> >> I think I understand *how* it works, I'm puzzled by why you went for
> >> this whole level of indirection when you're using a struct on the stack
> >> in the end anyway, just ... put that in "struct unpack_trees_options"?
> >>
> >> Anyway, I see I have only myself to blame here, as you added these leak
> >> fixes in the v2 in response to some of my offhand comments.
> >>
> >> FWIW I then went on to do some deeper fixes not just on these leaks but
> >> the surrounding leaks, which will be blocked by 2/11 & 05/11 of this
> >> topic for a while. I suppose I only have myself to blame :)
> >>
> >> Below is a patch-on-top that I think makes this whole thing much simpler
> >> by doing away with the pointer entirely.
> >>
> >> I suppose this is also a partial reply to
> >> https://lore.kernel.org/git/CABPp-BG_qigBoirMGR-Yk9Niyxt0UmYCEqojsYxbSEarLAmraA@mail.gmail.com/;
> >> but I quite dislike this pattern of including a pointer like this where
> >> it's not needed just for the practicalities of memory management.
> >>
> >> I.e. here you use DIR_INIT. In my local patches to fix up the wider
> >> memory leaks in this area I've got DIR_INIT also using a STRBUF_INIT,
> >> and DIR_INIT will in turn be referenced by a
> >> UNPACK_TREES_OPTIONS_INIT. It's quite nice if you're having to
> >> initialize with "UNPACK_TREES_OPTIONS_INIT" have that initialization
> >> work all the way down the chain, and not need e.g. a manual
> >> strbuf_init(), dir_init() etc.
> >
> > And you can keep using UNPACK_TREES_OPTIONS_INIT, because the
> > unpack_trees_opts->dir should be initialized to NULL.
>
> But I don't want it initialized to NULL, I want DIR_INIT....

Why?  For what purpose?  How does that help anything?  I've seen you
say that you want it, but I haven't yet seen you state how it helps
you do anything easier.

What I really want, though, is not even to have it be a pointer, but
to avoid exposing internal implementation details inside a struct that
is meant to convey the public API.  Instead unpack-trees should do
something similar to merge-ort, where it hides all those internal-only
details (by e.g. having a void* priv that happens to point to a struct
unpack_trees_options_internal, the latter of which is only defined in
unpack_trees.c).  However, I didn't want to go through that work for
just one member.

But you've inspired me to check if there are other fields that
shouldn't be exposed.  Turns out that there is a lot of cruft in
unpack_trees_options that callers shouldn't be messing with (and which
isn't at all clear to people trying to use the API): cache_bottom,
dir, msgs, msgs_to_free, nontrivial_merge, skip_sparse_checkout,
show_all_errors (!), unpack_rejects, df_conflict_entry, merge_size,
result, and perhaps pl.  A few of those have gotten slightly
entangled.  And there may have been others that people just started
setting because it was an existing field, and now I can't
differentiate between an intentional API usage passing some kind of
interesting value and an accidental setting of something meant to be
internal.

So maybe I'll submit some patches on top that rip these direct members
out of of unpack_trees_options and push them inside some opaque
struct.

> >> I removed the dir_init() in ce93a4c6127 (dir.[ch]: replace dir_init()
> >> with DIR_INIT, 2021-07-01),
> >
> > I might be going on a tangent here, but looking at that patch, I'm
> > worried that dir_init() was buggy and that you perpetuated that bug
> > with DIR_INIT.  Note that dir_struct has a struct strbuf basebuf
> > member, which neither dir_init() or DIR_INIT initialize properly
> > (using either strbuf_init() or STRBUF_INIT).  As far as I can tell,
> > dir.c relies on either strbuf_add() calls to just happen to work with
> > this incorrectly initialized strbuf, or else use the strbuf_init()
> > call in prep_exclude() to do so, using the following snippet:
> >
> >     if (!dir->basebuf.buf)
> >         strbuf_init(&dir->basebuf, PATH_MAX);
> >
> > However, earlier in that same function we see
> >
> >     if (stk->baselen <= baselen &&
> >         !strncmp(dir->basebuf.buf, base, stk->baselen))
> >             break;
> >
> > So either that function can never have dir->basebuf.buf be NULL and
> > the strbuf_init() is dead code, or else it's possible for us to
> > trigger a segfault.  If it's the former, it may just be a ticking time
> > bomb that will transform into the latter with some other change,
> > because it's not at all obvious to me how dir->basebuf gets
> > initialized appropriately to avoid that strncmp call.  Perhaps there
> > is some invariant where exclude_stack is only set up by previous calls
> > to prep_exclude() and those won't set up exclude_stack until first
> > initializing basebuf.  But that really at least deserves a comment
> > about how we're abusing basebuf, and would probably be cleaner if we
> > initialized basebuf to STRBUF_INIT.
>
> ...because yes, I forgot about that when sending you the diff-on-top,
> sorry. Yes that's buggy with the diff-on-top I sent you.

That bug didn't come from the diff-on-top you sent me, it came from
the commit already merged to master -- ce93a4c6127  (dir.[ch]: replace
dir_init() with DIR_INIT, 2021-07-01), merged as part of
ab/struct-init on Jul 16.

> I've got that fixed in the version I have. I.e. first I add a
> UNPACK_TREES_OPTIONS_INIT macro, then deal with that lazy initialization
> case (at which point DIR_INIT starts initializing that strbuf), then
> change the "dir_struct" from a pointer to embedding it, and finally fix
> a memory leak with that new API.
>
> WIP patches here:
> https://github.com/avar/git/compare/avar/post-sanitize-leak-test-mode-add-and-use-revisions-release...avar/post-sanitize-leak-test-mode-unpack-trees-and-dir

Yes, that fixes DIR_INIT nicely.  Looks good!

> >> but would probably need to bring it back, of
> >
> > If you need to bring it back, it's unrelated to my changes here, and
> > would only be because of the lack of basebuf initialization above.
>
> Yes, in this case. I mean that generally speaking I think it's a good
> pattern to use to have structs be initialized by macros like this,
> because it means you can embed them N levels deep (as we sometimes do)
> without having to call functions to initialize them.
>
> So yes, in this case as long as DIR_INIT is { 0 } it doesn't matter, but
> it does as soon as it has a member that needs initialization, and
> generally speaking for any FOO_INIT that needs a BAR_INIT ...

Callers SHOULD NOT call a function to initialize
unpacked_trees_opts->dir in my patches.  It's a ****BUG**** if they do
so.  So if you're complaining that my changes force callers to also
invoke some additional function, then I think you're just not
understanding my patch.

So, I still see no reason given for wanting opts->dir to be a struct.
But maybe we can fix this by just removing 'dir' (and several other
members) from opts, so that callers can't initialize it in any way to
anything.

> >> course you need some "release()" method for the
> >> UNPACK_TREES_OPTIONS_INIT, which in turn needs to call the dir_release()
> >> (well, "dir_clear()" in that case), and it needs to call
> >> "strbuf_release()". It's just nicer if that boilerplate is all on
> >> destruction, but not also on struct/object setup.
> >
> > The caller should *not* be initializing or tearing down
> > unpack_trees_options->dir beyond setting that field to NULL; it should
> > then leave it alone.
>
> s/NULL/DIR_INIT/ in my version, but yes.
>
> >> We do need that setup in some cases (although a lot could just be
> >> replaced by lazy initialization), but if we don't....
> >>
> >> diff --git a/unpack-trees.c b/unpack-trees.c
> >> index a7e1712d236..de5cc6cd025 100644
> >> --- a/unpack-trees.c
> >> +++ b/unpack-trees.c
> >> @@ -1694,15 +1694,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
> >>         static struct cache_entry *dfc;
> >>         struct pattern_list pl;
> >>         int free_pattern_list = 0;
> >> -       struct dir_struct dir = DIR_INIT;
> >>
> >>         if (o->reset == UNPACK_RESET_INVALID)
> >>                 BUG("o->reset had a value of 1; should be UNPACK_TREES_*_UNTRACKED");
> >>
> >>         if (len > MAX_UNPACK_TREES)
> >>                 die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
> >> -       if (o->dir)
> >> -               BUG("o->dir is for internal use only");
> >
> > I think this was an important check that you've tossed without
> > replacement.  Historically, callers set up and tweaked o->dir with
> > various values.  With my patch, we are no longer allowing that..which
> > introduces a transition problem -- people might have written or are
> > now writing patches that make new calls of unpack_trees() previous to
> > this change of mine, but submit them after this change of mine gets
> > merged.  Without this check I added, they'd probably just do a
> > mechanical `o->dir->` change to `o->dir.` and assume it's good...and
> > then possibly have ugly bugs to hunt down.
> >
> > So, I think it's helpful to have a check that provides an early
> > warning that tweaking o->dir is not only no longer required, but also
> > no longer allowed.
>
> The compiler will catch any such use of the pointer version on a
> mis-merge, or do you just mean that the person running into that might
> get the resolution wrong? I.e. before we could check o->dir being NULL
> for "do we have an exclude", but &o->dir will always be true?

The compiler catches and reports, and then the human sees the error
and just transliterates "o->dir->" to "o->dir.".  And then it
compiles, and the person assumes they fixed it correctly, but the
transliteration was WRONG and has subtle bugs because they had been
setting up o->dir with special values and something needs to warn them
that they shouldn't be touching o->dir anymore.  You removed the
safety check that would have let them know that their straightforward
transliteration was wrong.  I added that safety check intentionally,
and don't like seeing it ripped out without a replacement.

> >>         trace_performance_enter();
> >>         trace2_region_enter("unpack_trees", "unpack_trees", the_repository);
> >> @@ -1718,9 +1715,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
> >>                 BUG("UNPACK_RESET_OVERWRITE_UNTRACKED incompatible with preserved ignored files");
> >>
> >>         if (!o->preserve_ignored) {
> >> -               o->dir = &dir;
> >> -               o->dir->flags |= DIR_SHOW_IGNORED;
> >> -               setup_standard_excludes(o->dir);
> >> +               o->dir.flags |= DIR_SHOW_IGNORED;
> >> +               setup_standard_excludes(&o->dir);
> >>         }
> >>
> >>         if (!core_apply_sparse_checkout || !o->update)
> >> @@ -1884,10 +1880,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
> >>  done:
> >>         if (free_pattern_list)
> >>                 clear_pattern_list(&pl);
> >> -       if (o->dir) {
> >> -               dir_clear(o->dir);
> >> -               o->dir = NULL;
> >> -       }
> >> +       dir_clear(&o->dir);
> >
> > Unconditionally calling dir_clear()...
>
> As before I'm not sure about bugs in the ad-hoc patch on top, but I
> don't think this is a bug in my version linked above.
>
> I.e. it's zero'd out, and the dir_clear() either ends up calling
> free(NULL) or tries to loop over 0..N where N will be 0, no?
>
> >>         trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
> >>         trace_performance_leave("unpack_trees");
> >>         return ret;
> >> @@ -2153,8 +2146,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
> >>         pathbuf = xstrfmt("%.*s/", namelen, ce->name);
> >>
> >>         memset(&d, 0, sizeof(d));
> >> -       if (o->dir)
> >> -               d.exclude_per_dir = o->dir->exclude_per_dir;
> >> +       d.exclude_per_dir = o->dir.exclude_per_dir;
> >>         i = read_directory(&d, o->src_index, pathbuf, namelen+1, NULL);
> >>         if (i)
> >>                 return add_rejected_path(o, ERROR_NOT_UPTODATE_DIR, ce->name);
> >> @@ -2201,8 +2193,7 @@ static int check_ok_to_remove(const char *name, int len, int dtype,
> >>         if (ignore_case && icase_exists(o, name, len, st))
> >>                 return 0;
> >>
> >> -       if (o->dir &&
> >> -           is_excluded(o->dir, o->src_index, name, &dtype))
> >> +       if (is_excluded(&o->dir, o->src_index, name, &dtype))
> >
> > Unconditionally calling is_excluded()...
>
> Which will just return "it's not", won't it? Just lik dir_clear() deals
> with an "empty" dir_struct. There's existing callers of both with that
> pattern in e.g. builtin/{add,clean}.c.

That's a question that someone writing this patch should investigate
and document.  I read through is_excluded() and the functions it
calls, and I _think_ it's possibly correct in the current code.  But
it's not trivial to verify.  And I'd look a bit closer at it to be
sure it's correct before making this change.

Yeah, I'm being pretty picky here, but this is dir.c we are dealing with.

> Maybe I've missed an edge case, but I think the only reason that "o->dir
> &&" was there was because it was dynamically malloc'd before, but in my
> version where we'll always have it initialized...

I think after checking the code to verify this, that it deserves being
mentioned in the commit message (at least given that this is dir.c
we're talking about).

> >>                 /*
> >>                  * ce->name is explicitly excluded, so it is Ok to
> >>                  * overwrite it.
> >> diff --git a/unpack-trees.h b/unpack-trees.h
> >> index 71ffb7eeb0c..a8afbb20170 100644
> >> --- a/unpack-trees.h
> >> +++ b/unpack-trees.h
> >> @@ -5,6 +5,7 @@
> >>  #include "strvec.h"
> >>  #include "string-list.h"
> >>  #include "tree-walk.h"
> >> +#include "dir.h"
> >>
> >>  #define MAX_UNPACK_TREES MAX_TRAVERSE_TREES
> >>
> >> @@ -95,7 +96,7 @@ struct unpack_trees_options {
> >>         struct index_state result;
> >>
> >>         struct pattern_list *pl; /* for internal use */
> >> -       struct dir_struct *dir; /* for internal use only */
> >> +       struct dir_struct dir; /* for internal use only */
> >>         struct checkout_metadata meta;
> >>  };
> >>
> >
> > Not only did you drop the important safety check that o->dir not be
> > setup by the caller (which needs to be reinstated in some form), your
> > solution also involves unconditionally calling dir_clear() and
> > is_excluded().  It is not clear to me that those calls are safe...and
> > that they will continue to be safe in the future.
>
> It is a common pattern we rely on, e.g. strbuf_release() and various
> other custom free-like functions generally act as NOOP if they've got
> nothing to do, just like free()...

Yeah, "common pattern" is a useful way to want things to behave, and
people assuming that in other parts of git's codebase is probably
fine, but I don't trust dir.c to already behave according to common
patterns.  My mistrust of it is deep enough that I think someone
should verify the relevant parts of dir.c do behave that way before
making changes like this, and then explicitly call it out in the
commit message when the change is made.  dir.c had some weird dragons
in it.  And after being repeatedly sucked back into dir.c problems for
years because of weird side effects, and seeing how many piles of
nearly-cancelling bugs it had, I don't trust it anymore.

> > Even if it is safe
> > and will continue to be, I don't think this should be squashed into my
> > patches.  I think it should be a separate patch with its own commit
> > message that explicitly calls out this assumption.  Especially since
> > this is dir.c, which is an area where attempting to fix one very
> > simple little bug results in years of refactoring and fixing all kinds
> > of historical messes, sometimes waiting a year and a half for
> > responses to RFCs/review requests, and where we have to sometimes just
> > give up on attempting to understand the purpose of various bits of
> > code and instead rely on the regression tests and hope they are good
> > enough.  I still think that dir.c deserves a little warning at the
> > top, like the one I suggested in [1].
> >
> > [1] https://lore.kernel.org/git/CABPp-BFiwzzUgiTj_zu+vF5x20L0=1cf25cHwk7KZQj2YkVzXw@mail.gmail.com/
>
> *nod* I can always submit something like this afterwards.

:-)

> Just on this series: Perhaps this discussion is a sign that this memory
> leak fixing should be its own cleanup series where we could hash out any
> approaches to doing that? I.e. as noted before I realize I'm to blame
> for suggesting it in the first place, but those parts of these changes
> don't seem like they're needed by other parts of the series (I tried
> compiling with the two relevant patches ejected out).
>
> Having a humongous set of memory leak fixes locally at this point, I
> think it's generally not very worth the effort to fix a leak in b()
> where a() calls b() and b() calls c(), and all of [abc]() are
> leaking. I.e. often narrowly fixing leaks in b() will lead to different
> solutions than if you're trying to resolve all of [abc](), as their
> interaction comes into play.

The patch we are commenting on isn't a leakfix, and is very much
integral to the series for reasons other than actual leakfix (which
occurred two patches before this one).  It's simplifying the API to
not require a bunch of boilerplate setup of opts->dir and instead
requiring callers to just set a simple boolean INSTEAD (and only if
they want non-default behavior, rather than requiring setting
opts->dir by all those who did want default behavior).  That way, when
I need to make sure several additional callers who should have gotten
the default behavior actually get it, they don't have to call a bunch
of functions to setup and cleanup opts->dir.

> Aside about safety: One thing I'll sometimes do when I'm unsure about
> those sorts of fixes is to have my new INIT set a new "sentinel" field
> to "12345" or whatever, then just BUG() out in an entry point in the API
> that you can't avoid calling if it's not set like that, e.g. dir_clear()
> or whatever the setup/work function is.
>
> We don't have 100% test coverage, but we usually have at least *some*,
> and doing that is good about catching e.g. a memset() at a distance, as
> happens in this code with the merge code embedding the relevant struct
> and memsetting it, which might be missed in some migration of just a
> grep for "struct dir_struct" or whatever...
Ævar Arnfjörð Bjarmason Oct. 2, 2021, 8:44 a.m. UTC | #10
On Fri, Oct 01 2021, Elijah Newren wrote:

> On Fri, Oct 1, 2021 at 1:47 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>>
>> On Thu, Sep 30 2021, Elijah Newren wrote:
>>
>> > On Thu, Sep 30, 2021 at 7:15 AM Ævar Arnfjörð Bjarmason
>> > <avarab@gmail.com> wrote:
>> >>
>> >> On Wed, Sep 29 2021, Elijah Newren wrote:
>> >>
>> >> > On Wed, Sep 29, 2021 at 11:32 AM Ævar Arnfjörð Bjarmason
>> >> > <avarab@gmail.com> wrote:
>> >> >>
>> >> >> On Wed, Sep 29 2021, Elijah Newren wrote:
>> >> >>
>> > ...
>> >> > As per the next patch:
>> >> >
>> >> > int unpack_trees(..., struct unpack_trees_options *o)
>> >> > {
>> >> >     struct dir_struct dir = DIR_INIT;
>> >> >     ...
>> >> >     if (!o->preserve_ignored) {
>> >> >         /* Setup 'dir', make o->dir point to it */
>> >> >         ....
>> >> >         o->dir = &dir;
>> >> >     }
>> >> >     ...
>> >> >     if (o->dir)
>> >> >         /* cleanup */
>> >> >     ....
>> >> > }
>> >> >
>> >> > The caller doesn't touch o->dir (other than initializing it to zeros);
>> >> > unpack_trees() is wholly responsible for it.  I'd kind of like to
>> >> > entirely remove dir from unpack_trees_options(), but I need a way of
>> >> > passing it down through all the other functions in unpack-trees.c, and
>> >> > leaving it in unpack_trees_options seems the easiest way to do so.  So
>> >> > I just marked it as "for internal use only".
>> >>
>> >> I think I understand *how* it works, I'm puzzled by why you went for
>> >> this whole level of indirection when you're using a struct on the stack
>> >> in the end anyway, just ... put that in "struct unpack_trees_options"?
>> >>
>> >> Anyway, I see I have only myself to blame here, as you added these leak
>> >> fixes in the v2 in response to some of my offhand comments.
>> >>
>> >> FWIW I then went on to do some deeper fixes not just on these leaks but
>> >> the surrounding leaks, which will be blocked by 2/11 & 05/11 of this
>> >> topic for a while. I suppose I only have myself to blame :)
>> >>
>> >> Below is a patch-on-top that I think makes this whole thing much simpler
>> >> by doing away with the pointer entirely.
>> >>
>> >> I suppose this is also a partial reply to
>> >> https://lore.kernel.org/git/CABPp-BG_qigBoirMGR-Yk9Niyxt0UmYCEqojsYxbSEarLAmraA@mail.gmail.com/;
>> >> but I quite dislike this pattern of including a pointer like this where
>> >> it's not needed just for the practicalities of memory management.
>> >>
>> >> I.e. here you use DIR_INIT. In my local patches to fix up the wider
>> >> memory leaks in this area I've got DIR_INIT also using a STRBUF_INIT,
>> >> and DIR_INIT will in turn be referenced by a
>> >> UNPACK_TREES_OPTIONS_INIT. It's quite nice if you're having to
>> >> initialize with "UNPACK_TREES_OPTIONS_INIT" have that initialization
>> >> work all the way down the chain, and not need e.g. a manual
>> >> strbuf_init(), dir_init() etc.
>> >
>> > And you can keep using UNPACK_TREES_OPTIONS_INIT, because the
>> > unpack_trees_opts->dir should be initialized to NULL.
>>
>> But I don't want it initialized to NULL, I want DIR_INIT....
>
> Why?  For what purpose?  How does that help anything?  I've seen you
> say that you want it, but I haven't yet seen you state how it helps
> you do anything easier.

Upthread of here in <87k0ixrv23.fsf@evledraar.gmail.com> I linked to WIP
patches that get rid of lazy initialization in dir.c. I.e. this change:
https://github.com/avar/git/commit/5a18133e927f82a9b6ffcb0c43c3f8657f8836af

[I see from reading ahead a bit that you saw that later...]

I.e. if you initialize it to NULL the embedded strbuf won't be in the
right state to be used, which is why we initialize it on the fly in that
part of prep_exclude(). If everything uses DIR_INIT we can just have it
be initialized already.

More generally I think that even if you don't have an immediate use-case
like that it makes sense to consistently use the macros, because once
something *does* need a strbuf or whatever you're not needing to do a
refactoring of ensuring that everything that hardcoded a "{ 0 }" or its
own memset() is converted to use it, or needing to do initialization
on-the-fly as in that code in prep_exclude()>

> What I really want, though, is not even to have it be a pointer, but
> to avoid exposing internal implementation details inside a struct that
> is meant to convey the public API.  Instead unpack-trees should do
> something similar to merge-ort, where it hides all those internal-only
> details (by e.g. having a void* priv that happens to point to a struct
> unpack_trees_options_internal, the latter of which is only defined in
> unpack_trees.c).  However, I didn't want to go through that work for
> just one member.

Yes, and I like the part of your change that's not having callers change
"struct dir_struct" but instead just exposing a field in unpack_trees to
flag the desired behavior, that that code wants to then use a "struct
dir_struct" can just be an implementation detail.

> But you've inspired me to check if there are other fields that
> shouldn't be exposed.  Turns out that there is a lot of cruft in
> unpack_trees_options that callers shouldn't be messing with (and which
> isn't at all clear to people trying to use the API): cache_bottom,
> dir, msgs, msgs_to_free, nontrivial_merge, skip_sparse_checkout,
> show_all_errors (!), unpack_rejects, df_conflict_entry, merge_size,
> result, and perhaps pl.  A few of those have gotten slightly
> entangled.  And there may have been others that people just started
> setting because it was an existing field, and now I can't
> differentiate between an intentional API usage passing some kind of
> interesting value and an accidental setting of something meant to be
> internal.
>
> So maybe I'll submit some patches on top that rip these direct members
> out of of unpack_trees_options and push them inside some opaque
> struct.

Sure, that sounds good. I only had a mild objection to doing it in a way
where you'll need that sort of code I removed in the linked commit in
prep_exclude() because you were trying not to expose that at any cost,
including via some *_INIT macro. I.e. if it's private we can just name
it "priv_*" or have a :

    struct dont_touch_this {
        struct dir_struct dir;
    };

Which are both ways of /messaging/ that it's private, and since the
target audience is just the rest of the git.git codebase I think that
ultimately something that 1) sends the right message 2) makes accidents
pretty much impossible suffices. I.e. you don't accidentally introduce a
new API user accessing a field called "->priv_*" or
"->private_*". Someone will review those patches...

>> >> I removed the dir_init() in ce93a4c6127 (dir.[ch]: replace dir_init()
>> >> with DIR_INIT, 2021-07-01),
>> >
>> > I might be going on a tangent here, but looking at that patch, I'm
>> > worried that dir_init() was buggy and that you perpetuated that bug
>> > with DIR_INIT.  Note that dir_struct has a struct strbuf basebuf
>> > member, which neither dir_init() or DIR_INIT initialize properly
>> > (using either strbuf_init() or STRBUF_INIT).  As far as I can tell,
>> > dir.c relies on either strbuf_add() calls to just happen to work with
>> > this incorrectly initialized strbuf, or else use the strbuf_init()
>> > call in prep_exclude() to do so, using the following snippet:
>> >
>> >     if (!dir->basebuf.buf)
>> >         strbuf_init(&dir->basebuf, PATH_MAX);
>> >
>> > However, earlier in that same function we see
>> >
>> >     if (stk->baselen <= baselen &&
>> >         !strncmp(dir->basebuf.buf, base, stk->baselen))
>> >             break;
>> >
>> > So either that function can never have dir->basebuf.buf be NULL and
>> > the strbuf_init() is dead code, or else it's possible for us to
>> > trigger a segfault.  If it's the former, it may just be a ticking time
>> > bomb that will transform into the latter with some other change,
>> > because it's not at all obvious to me how dir->basebuf gets
>> > initialized appropriately to avoid that strncmp call.  Perhaps there
>> > is some invariant where exclude_stack is only set up by previous calls
>> > to prep_exclude() and those won't set up exclude_stack until first
>> > initializing basebuf.  But that really at least deserves a comment
>> > about how we're abusing basebuf, and would probably be cleaner if we
>> > initialized basebuf to STRBUF_INIT.
>>
>> ...because yes, I forgot about that when sending you the diff-on-top,
>> sorry. Yes that's buggy with the diff-on-top I sent you.
>
> That bug didn't come from the diff-on-top you sent me, it came from
> the commit already merged to master -- ce93a4c6127  (dir.[ch]: replace
> dir_init() with DIR_INIT, 2021-07-01), merged as part of
> ab/struct-init on Jul 16.

Ah, I misunderstood you there. I'll look at that / fix it. Sorry.

>> I've got that fixed in the version I have. I.e. first I add a
>> UNPACK_TREES_OPTIONS_INIT macro, then deal with that lazy initialization
>> case (at which point DIR_INIT starts initializing that strbuf), then
>> change the "dir_struct" from a pointer to embedding it, and finally fix
>> a memory leak with that new API.
>>
>> WIP patches here:
>> https://github.com/avar/git/compare/avar/post-sanitize-leak-test-mode-add-and-use-revisions-release...avar/post-sanitize-leak-test-mode-unpack-trees-and-dir
>
> Yes, that fixes DIR_INIT nicely.  Looks good!
>
>> >> but would probably need to bring it back, of
>> >
>> > If you need to bring it back, it's unrelated to my changes here, and
>> > would only be because of the lack of basebuf initialization above.
>>
>> Yes, in this case. I mean that generally speaking I think it's a good
>> pattern to use to have structs be initialized by macros like this,
>> because it means you can embed them N levels deep (as we sometimes do)
>> without having to call functions to initialize them.
>>
>> So yes, in this case as long as DIR_INIT is { 0 } it doesn't matter, but
>> it does as soon as it has a member that needs initialization, and
>> generally speaking for any FOO_INIT that needs a BAR_INIT ...
>
> Callers SHOULD NOT call a function to initialize
> unpacked_trees_opts->dir in my patches.  It's a ****BUG**** if they do
> so.  So if you're complaining that my changes force callers to also
> invoke some additional function, then I think you're just not
> understanding my patch.
>
> So, I still see no reason given for wanting opts->dir to be a struct.
> But maybe we can fix this by just removing 'dir' (and several other
> members) from opts, so that callers can't initialize it in any way to
> anything.

I think your patches are fine as-is, and yes that would be a bug without
some of the changes I have in that WIP series.

As noted in
https://lore.kernel.org/git/87fstlrumj.fsf@evledraar.gmail.com/ I think
we're just having a side-discussion about init patterns, and what
direction to take these APIs when it comes to that...

>> >> course you need some "release()" method for the
>> >> UNPACK_TREES_OPTIONS_INIT, which in turn needs to call the dir_release()
>> >> (well, "dir_clear()" in that case), and it needs to call
>> >> "strbuf_release()". It's just nicer if that boilerplate is all on
>> >> destruction, but not also on struct/object setup.
>> >
>> > The caller should *not* be initializing or tearing down
>> > unpack_trees_options->dir beyond setting that field to NULL; it should
>> > then leave it alone.
>>
>> s/NULL/DIR_INIT/ in my version, but yes.
>>
>> >> We do need that setup in some cases (although a lot could just be
>> >> replaced by lazy initialization), but if we don't....
>> >>
>> >> diff --git a/unpack-trees.c b/unpack-trees.c
>> >> index a7e1712d236..de5cc6cd025 100644
>> >> --- a/unpack-trees.c
>> >> +++ b/unpack-trees.c
>> >> @@ -1694,15 +1694,12 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>> >>         static struct cache_entry *dfc;
>> >>         struct pattern_list pl;
>> >>         int free_pattern_list = 0;
>> >> -       struct dir_struct dir = DIR_INIT;
>> >>
>> >>         if (o->reset == UNPACK_RESET_INVALID)
>> >>                 BUG("o->reset had a value of 1; should be UNPACK_TREES_*_UNTRACKED");
>> >>
>> >>         if (len > MAX_UNPACK_TREES)
>> >>                 die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
>> >> -       if (o->dir)
>> >> -               BUG("o->dir is for internal use only");
>> >
>> > I think this was an important check that you've tossed without
>> > replacement.  Historically, callers set up and tweaked o->dir with
>> > various values.  With my patch, we are no longer allowing that..which
>> > introduces a transition problem -- people might have written or are
>> > now writing patches that make new calls of unpack_trees() previous to
>> > this change of mine, but submit them after this change of mine gets
>> > merged.  Without this check I added, they'd probably just do a
>> > mechanical `o->dir->` change to `o->dir.` and assume it's good...and
>> > then possibly have ugly bugs to hunt down.
>> >
>> > So, I think it's helpful to have a check that provides an early
>> > warning that tweaking o->dir is not only no longer required, but also
>> > no longer allowed.
>>
>> The compiler will catch any such use of the pointer version on a
>> mis-merge, or do you just mean that the person running into that might
>> get the resolution wrong? I.e. before we could check o->dir being NULL
>> for "do we have an exclude", but &o->dir will always be true?
>
> The compiler catches and reports, and then the human sees the error
> and just transliterates "o->dir->" to "o->dir.".  And then it
> compiles, and the person assumes they fixed it correctly, but the
> transliteration was WRONG and has subtle bugs because they had been
> setting up o->dir with special values and something needs to warn them
> that they shouldn't be touching o->dir anymore.  You removed the
> safety check that would have let them know that their straightforward
> transliteration was wrong.  I added that safety check intentionally,
> and don't like seeing it ripped out without a replacement.

I think that in general we've dealt with that in other places, e.g. if
we say have a "struct strbuf *" and memzero the containing struct we can
check against NULL as an alias for "do we have this at all", but if it's
a "struct strbuf" initialized with STRBUF_INIT we'll either need a
"have_the_string" side-member, or (if unambiguous) to check if .len == 0
(or more nastily perhaps, check .alloc or if it's the slopbuf).

>> >>         trace_performance_enter();
>> >>         trace2_region_enter("unpack_trees", "unpack_trees", the_repository);
>> >> @@ -1718,9 +1715,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>> >>                 BUG("UNPACK_RESET_OVERWRITE_UNTRACKED incompatible with preserved ignored files");
>> >>
>> >>         if (!o->preserve_ignored) {
>> >> -               o->dir = &dir;
>> >> -               o->dir->flags |= DIR_SHOW_IGNORED;
>> >> -               setup_standard_excludes(o->dir);
>> >> +               o->dir.flags |= DIR_SHOW_IGNORED;
>> >> +               setup_standard_excludes(&o->dir);
>> >>         }
>> >>
>> >>         if (!core_apply_sparse_checkout || !o->update)
>> >> @@ -1884,10 +1880,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
>> >>  done:
>> >>         if (free_pattern_list)
>> >>                 clear_pattern_list(&pl);
>> >> -       if (o->dir) {
>> >> -               dir_clear(o->dir);
>> >> -               o->dir = NULL;
>> >> -       }
>> >> +       dir_clear(&o->dir);
>> >
>> > Unconditionally calling dir_clear()...
>>
>> As before I'm not sure about bugs in the ad-hoc patch on top, but I
>> don't think this is a bug in my version linked above.
>>
>> I.e. it's zero'd out, and the dir_clear() either ends up calling
>> free(NULL) or tries to loop over 0..N where N will be 0, no?
>>
>> >>         trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
>> >>         trace_performance_leave("unpack_trees");
>> >>         return ret;
>> >> @@ -2153,8 +2146,7 @@ static int verify_clean_subdirectory(const struct cache_entry *ce,
>> >>         pathbuf = xstrfmt("%.*s/", namelen, ce->name);
>> >>
>> >>         memset(&d, 0, sizeof(d));
>> >> -       if (o->dir)
>> >> -               d.exclude_per_dir = o->dir->exclude_per_dir;
>> >> +       d.exclude_per_dir = o->dir.exclude_per_dir;
>> >>         i = read_directory(&d, o->src_index, pathbuf, namelen+1, NULL);
>> >>         if (i)
>> >>                 return add_rejected_path(o, ERROR_NOT_UPTODATE_DIR, ce->name);
>> >> @@ -2201,8 +2193,7 @@ static int check_ok_to_remove(const char *name, int len, int dtype,
>> >>         if (ignore_case && icase_exists(o, name, len, st))
>> >>                 return 0;
>> >>
>> >> -       if (o->dir &&
>> >> -           is_excluded(o->dir, o->src_index, name, &dtype))
>> >> +       if (is_excluded(&o->dir, o->src_index, name, &dtype))
>> >
>> > Unconditionally calling is_excluded()...
>>
>> Which will just return "it's not", won't it? Just lik dir_clear() deals
>> with an "empty" dir_struct. There's existing callers of both with that
>> pattern in e.g. builtin/{add,clean}.c.
>
> That's a question that someone writing this patch should investigate
> and document.  I read through is_excluded() and the functions it
> calls, and I _think_ it's possibly correct in the current code.  But
> it's not trivial to verify.  And I'd look a bit closer at it to be
> sure it's correct before making this change.
>
> Yeah, I'm being pretty picky here, but this is dir.c we are dealing with.

I'll make sure to make that clear / test it etc. if & when I do end up
submitting that WIP stuff...

>> Maybe I've missed an edge case, but I think the only reason that "o->dir
>> &&" was there was because it was dynamically malloc'd before, but in my
>> version where we'll always have it initialized...
>
> I think after checking the code to verify this, that it deserves being
> mentioned in the commit message (at least given that this is dir.c
> we're talking about).

*nod*. See also my side-thread <877dexrqvg.fsf@evledraar.gmail.com> for
 some exhaustive verification. Will reference that...

>> >>                 /*
>> >>                  * ce->name is explicitly excluded, so it is Ok to
>> >>                  * overwrite it.
>> >> diff --git a/unpack-trees.h b/unpack-trees.h
>> >> index 71ffb7eeb0c..a8afbb20170 100644
>> >> --- a/unpack-trees.h
>> >> +++ b/unpack-trees.h
>> >> @@ -5,6 +5,7 @@
>> >>  #include "strvec.h"
>> >>  #include "string-list.h"
>> >>  #include "tree-walk.h"
>> >> +#include "dir.h"
>> >>
>> >>  #define MAX_UNPACK_TREES MAX_TRAVERSE_TREES
>> >>
>> >> @@ -95,7 +96,7 @@ struct unpack_trees_options {
>> >>         struct index_state result;
>> >>
>> >>         struct pattern_list *pl; /* for internal use */
>> >> -       struct dir_struct *dir; /* for internal use only */
>> >> +       struct dir_struct dir; /* for internal use only */
>> >>         struct checkout_metadata meta;
>> >>  };
>> >>
>> >
>> > Not only did you drop the important safety check that o->dir not be
>> > setup by the caller (which needs to be reinstated in some form), your
>> > solution also involves unconditionally calling dir_clear() and
>> > is_excluded().  It is not clear to me that those calls are safe...and
>> > that they will continue to be safe in the future.
>>
>> It is a common pattern we rely on, e.g. strbuf_release() and various
>> other custom free-like functions generally act as NOOP if they've got
>> nothing to do, just like free()...
>
> Yeah, "common pattern" is a useful way to want things to behave, and
> people assuming that in other parts of git's codebase is probably
> fine, but I don't trust dir.c to already behave according to common
> patterns.  My mistrust of it is deep enough that I think someone
> should verify the relevant parts of dir.c do behave that way before
> making changes like this, and then explicitly call it out in the
> commit message when the change is made.  dir.c had some weird dragons
> in it.  And after being repeatedly sucked back into dir.c problems for
> years because of weird side effects, and seeing how many piles of
> nearly-cancelling bugs it had, I don't trust it anymore.

*nod*

>> > Even if it is safe
>> > and will continue to be, I don't think this should be squashed into my
>> > patches.  I think it should be a separate patch with its own commit
>> > message that explicitly calls out this assumption.  Especially since
>> > this is dir.c, which is an area where attempting to fix one very
>> > simple little bug results in years of refactoring and fixing all kinds
>> > of historical messes, sometimes waiting a year and a half for
>> > responses to RFCs/review requests, and where we have to sometimes just
>> > give up on attempting to understand the purpose of various bits of
>> > code and instead rely on the regression tests and hope they are good
>> > enough.  I still think that dir.c deserves a little warning at the
>> > top, like the one I suggested in [1].
>> >
>> > [1] https://lore.kernel.org/git/CABPp-BFiwzzUgiTj_zu+vF5x20L0=1cf25cHwk7KZQj2YkVzXw@mail.gmail.com/
>>
>> *nod* I can always submit something like this afterwards.
>
> :-)
>
>> Just on this series: Perhaps this discussion is a sign that this memory
>> leak fixing should be its own cleanup series where we could hash out any
>> approaches to doing that? I.e. as noted before I realize I'm to blame
>> for suggesting it in the first place, but those parts of these changes
>> don't seem like they're needed by other parts of the series (I tried
>> compiling with the two relevant patches ejected out).
>>
>> Having a humongous set of memory leak fixes locally at this point, I
>> think it's generally not very worth the effort to fix a leak in b()
>> where a() calls b() and b() calls c(), and all of [abc]() are
>> leaking. I.e. often narrowly fixing leaks in b() will lead to different
>> solutions than if you're trying to resolve all of [abc](), as their
>> interaction comes into play.
>
> The patch we are commenting on isn't a leakfix, and is very much
> integral to the series for reasons other than actual leakfix (which
> occurred two patches before this one).  It's simplifying the API to
> not require a bunch of boilerplate setup of opts->dir and instead
> requiring callers to just set a simple boolean INSTEAD (and only if
> they want non-default behavior, rather than requiring setting
> opts->dir by all those who did want default behavior).  That way, when
> I need to make sure several additional callers who should have gotten
> the default behavior actually get it, they don't have to call a bunch
> of functions to setup and cleanup opts->dir.

I stand corrected. I have not given this series as a whole any careful
review, sorry. This whole side-thread came about because I noticed the
memory allocation part of this since it was something I was working in
on in that WIP...
Ævar Arnfjörð Bjarmason Oct. 3, 2021, 10:21 p.m. UTC | #11
On Sat, Oct 02 2021, Ævar Arnfjörð Bjarmason wrote:

> On Fri, Oct 01 2021, Elijah Newren wrote:
>
>> On Fri, Oct 1, 2021 at 1:47 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>>>
>>> On Thu, Sep 30 2021, Elijah Newren wrote:
>>>
>>> > On Thu, Sep 30, 2021 at 7:15 AM Ævar Arnfjörð Bjarmason
>>> > <avarab@gmail.com> wrote:
>>> >>
>>> >> On Wed, Sep 29 2021, Elijah Newren wrote:
[...]
>>> > I might be going on a tangent here, but looking at that patch, I'm
>>> > worried that dir_init() was buggy and that you perpetuated that bug
>>> > with DIR_INIT.  Note that dir_struct has a struct strbuf basebuf
>>> > member, which neither dir_init() or DIR_INIT initialize properly
>>> > (using either strbuf_init() or STRBUF_INIT).  As far as I can tell,
>>> > dir.c relies on either strbuf_add() calls to just happen to work with
>>> > this incorrectly initialized strbuf, or else use the strbuf_init()
>>> > call in prep_exclude() to do so, using the following snippet:
>>> >
>>> >     if (!dir->basebuf.buf)
>>> >         strbuf_init(&dir->basebuf, PATH_MAX);
>>> >
>>> > However, earlier in that same function we see
>>> >
>>> >     if (stk->baselen <= baselen &&
>>> >         !strncmp(dir->basebuf.buf, base, stk->baselen))
>>> >             break;
>>> >
>>> > So either that function can never have dir->basebuf.buf be NULL and
>>> > the strbuf_init() is dead code, or else it's possible for us to
>>> > trigger a segfault.  If it's the former, it may just be a ticking time
>>> > bomb that will transform into the latter with some other change,
>>> > because it's not at all obvious to me how dir->basebuf gets
>>> > initialized appropriately to avoid that strncmp call.  Perhaps there
>>> > is some invariant where exclude_stack is only set up by previous calls
>>> > to prep_exclude() and those won't set up exclude_stack until first
>>> > initializing basebuf.  But that really at least deserves a comment
>>> > about how we're abusing basebuf, and would probably be cleaner if we
>>> > initialized basebuf to STRBUF_INIT.
>>>
>>> ...because yes, I forgot about that when sending you the diff-on-top,
>>> sorry. Yes that's buggy with the diff-on-top I sent you.
>>
>> That bug didn't come from the diff-on-top you sent me, it came from
>> the commit already merged to master -- ce93a4c6127  (dir.[ch]: replace
>> dir_init() with DIR_INIT, 2021-07-01), merged as part of
>> ab/struct-init on Jul 16.
>
> Ah, I misunderstood you there. I'll look at that / fix it. Sorry.

Just to tie up this loose end: Yes this control flow suck, and I've got
some patches to unpack-trees.[ch] & dir.[ch] I'm about to submit to fix
it. But just to comment on the existing behavior of the code, i.e. your
(above):

    "So either that function can never have dir->basebuf.buf be NULL and
    the strbuf_init() is dead code, or else it's possible for us to
    trigger a segfault.".

I hadn't had time to look into it when I said I'd fix it, but now that I
have I found thath there's nothing to fix, and this code wasn't buggy
either before or after my ce93a4c6127 (dir.[ch]: replace dir_init() with
DIR_INIT, 2021-07-01). I.e. we do have the invariant you mentioned.

The dir.[ch] API has always relied on the "struct dir_struct" being
zero'd out. First with memset() before your eceba532141 (dir: fix
problematic API to avoid memory leaks, 2020-08-18), and after my
ce93a4c6127 with the DIR_INIT, which both amount to the same thing.

We both missed a caller that used neither dir_init() nor uses DIR_INIT
now, but it uses "{ 0 }", so it's always zero'd.

Now, of course it being zero'd *would* segfault if you feed
"dir->basebuf.buf" to strncmp() as you note above, but that code isn't
reachable. The structure of that function is (pseudocode):

void prep_exclude(...)
{
	struct exclude_stack *stk = NULL;
	[...]

	while ((stk = dir->exclude_stack) != NULL)
		/* the strncmp() against "dir->basebuf.buf" is here */

	/* maybe we'll early return here */

	if (!dir->basebuf.buf)
		strbuf_init(&dir->basebuf, PATH_MAX);

	/*
         * Code that sets dir->exclude_stack to non-NULL for the first
	 * time follows...
	 */
}

I.e. dir->exclude_stack is *only* referenced in this function and
dir_clear() (where we also check it for NULL first).

It's state management between calls to prep_exclude(). So that that
initial while-loop can only be entered the the >1th time prep_exclude()
is called.

We'll then either have reached that strbuf_init() already, or if we took
an early return before the strbuf_init() we couldn't have set
dir->exclude_stack either. So that "dir->basebuf.buf" dereference is
safe in either case.
Elijah Newren Oct. 4, 2021, 1:45 p.m. UTC | #12
On Sat, Oct 2, 2021 at 2:07 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
> On Fri, Oct 01 2021, Elijah Newren wrote:
>
...
> > So maybe I'll submit some patches on top that rip these direct members
> > out of of unpack_trees_options and push them inside some opaque
> > struct.
>
> Sure, that sounds good. I only had a mild objection to doing it in a way
> where you'll need that sort of code I removed in the linked commit in
> prep_exclude() because you were trying not to expose that at any cost,
> including via some *_INIT macro. I.e. if it's private we can just name
> it "priv_*" or have a :
>
>     struct dont_touch_this {
>         struct dir_struct dir;
>     };
>
> Which are both ways of /messaging/ that it's private, and since the
> target audience is just the rest of the git.git codebase I think that
> ultimately something that 1) sends the right message 2) makes accidents
> pretty much impossible suffices. I.e. you don't accidentally introduce a
> new API user accessing a field called "->priv_*" or
> "->private_*". Someone will review those patches...

An internal struct with all the members meant to be internal-only
provides nearly all the advantages that I was going for with the
opaque struct, while also being a smaller change than what I was
thinking of doing.  I like that idea; thanks for the suggestion.
Elijah Newren Oct. 4, 2021, 1:45 p.m. UTC | #13
On Sun, Oct 3, 2021 at 3:38 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
> On Sat, Oct 02 2021, Ævar Arnfjörð Bjarmason wrote:
>
> > On Fri, Oct 01 2021, Elijah Newren wrote:
> >
> >> On Fri, Oct 1, 2021 at 1:47 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
> >>>
> >>> On Thu, Sep 30 2021, Elijah Newren wrote:
> >>>
> >>> > On Thu, Sep 30, 2021 at 7:15 AM Ævar Arnfjörð Bjarmason
> >>> > <avarab@gmail.com> wrote:
> >>> >>
> >>> >> On Wed, Sep 29 2021, Elijah Newren wrote:
> [...]
> >>> > I might be going on a tangent here, but looking at that patch, I'm
> >>> > worried that dir_init() was buggy and that you perpetuated that bug
> >>> > with DIR_INIT.  Note that dir_struct has a struct strbuf basebuf
> >>> > member, which neither dir_init() or DIR_INIT initialize properly
> >>> > (using either strbuf_init() or STRBUF_INIT).  As far as I can tell,
> >>> > dir.c relies on either strbuf_add() calls to just happen to work with
> >>> > this incorrectly initialized strbuf, or else use the strbuf_init()
> >>> > call in prep_exclude() to do so, using the following snippet:
> >>> >
> >>> >     if (!dir->basebuf.buf)
> >>> >         strbuf_init(&dir->basebuf, PATH_MAX);
> >>> >
> >>> > However, earlier in that same function we see
> >>> >
> >>> >     if (stk->baselen <= baselen &&
> >>> >         !strncmp(dir->basebuf.buf, base, stk->baselen))
> >>> >             break;
> >>> >
> >>> > So either that function can never have dir->basebuf.buf be NULL and
> >>> > the strbuf_init() is dead code, or else it's possible for us to
> >>> > trigger a segfault.  If it's the former, it may just be a ticking time
> >>> > bomb that will transform into the latter with some other change,
> >>> > because it's not at all obvious to me how dir->basebuf gets
> >>> > initialized appropriately to avoid that strncmp call.  Perhaps there
> >>> > is some invariant where exclude_stack is only set up by previous calls
> >>> > to prep_exclude() and those won't set up exclude_stack until first
> >>> > initializing basebuf.  But that really at least deserves a comment
> >>> > about how we're abusing basebuf, and would probably be cleaner if we
> >>> > initialized basebuf to STRBUF_INIT.
> >>>
> >>> ...because yes, I forgot about that when sending you the diff-on-top,
> >>> sorry. Yes that's buggy with the diff-on-top I sent you.
> >>
> >> That bug didn't come from the diff-on-top you sent me, it came from
> >> the commit already merged to master -- ce93a4c6127  (dir.[ch]: replace
> >> dir_init() with DIR_INIT, 2021-07-01), merged as part of
> >> ab/struct-init on Jul 16.
> >
> > Ah, I misunderstood you there. I'll look at that / fix it. Sorry.
>
> Just to tie up this loose end: Yes this control flow suck, and I've got
> some patches to unpack-trees.[ch] & dir.[ch] I'm about to submit to fix
> it. But just to comment on the existing behavior of the code, i.e. your
> (above):
>
>     "So either that function can never have dir->basebuf.buf be NULL and
>     the strbuf_init() is dead code, or else it's possible for us to
>     trigger a segfault.".
>
> I hadn't had time to look into it when I said I'd fix it, but now that I
> have I found thath there's nothing to fix, and this code wasn't buggy
> either before or after my ce93a4c6127 (dir.[ch]: replace dir_init() with
> DIR_INIT, 2021-07-01). I.e. we do have the invariant you mentioned.
>
> The dir.[ch] API has always relied on the "struct dir_struct" being
> zero'd out. First with memset() before your eceba532141 (dir: fix
> problematic API to avoid memory leaks, 2020-08-18), and after my
> ce93a4c6127 with the DIR_INIT, which both amount to the same thing.
>
> We both missed a caller that used neither dir_init() nor uses DIR_INIT
> now, but it uses "{ 0 }", so it's always zero'd.
>
> Now, of course it being zero'd *would* segfault if you feed
> "dir->basebuf.buf" to strncmp() as you note above, but that code isn't
> reachable. The structure of that function is (pseudocode):
>
> void prep_exclude(...)
> {
>         struct exclude_stack *stk = NULL;
>         [...]
>
>         while ((stk = dir->exclude_stack) != NULL)
>                 /* the strncmp() against "dir->basebuf.buf" is here */
>
>         /* maybe we'll early return here */
>
>         if (!dir->basebuf.buf)
>                 strbuf_init(&dir->basebuf, PATH_MAX);
>
>         /*
>          * Code that sets dir->exclude_stack to non-NULL for the first
>          * time follows...
>          */
> }
>
> I.e. dir->exclude_stack is *only* referenced in this function and
> dir_clear() (where we also check it for NULL first).
>
> It's state management between calls to prep_exclude(). So that that
> initial while-loop can only be entered the the >1th time prep_exclude()
> is called.
>
> We'll then either have reached that strbuf_init() already, or if we took
> an early return before the strbuf_init() we couldn't have set
> dir->exclude_stack either. So that "dir->basebuf.buf" dereference is
> safe in either case.

Thanks for digging into this.  I wonder if dir_struct could use some
separation of putting things inside an embedded internal struct as
well, similar to our discussions with unpack_trees_options.
Ævar Arnfjörð Bjarmason Oct. 4, 2021, 2:07 p.m. UTC | #14
On Mon, Oct 04 2021, Elijah Newren wrote:

> On Sat, Oct 2, 2021 at 2:07 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>>
>> On Fri, Oct 01 2021, Elijah Newren wrote:
>>
> ...
>> > So maybe I'll submit some patches on top that rip these direct members
>> > out of of unpack_trees_options and push them inside some opaque
>> > struct.
>>
>> Sure, that sounds good. I only had a mild objection to doing it in a way
>> where you'll need that sort of code I removed in the linked commit in
>> prep_exclude() because you were trying not to expose that at any cost,
>> including via some *_INIT macro. I.e. if it's private we can just name
>> it "priv_*" or have a :
>>
>>     struct dont_touch_this {
>>         struct dir_struct dir;
>>     };
>>
>> Which are both ways of /messaging/ that it's private, and since the
>> target audience is just the rest of the git.git codebase I think that
>> ultimately something that 1) sends the right message 2) makes accidents
>> pretty much impossible suffices. I.e. you don't accidentally introduce a
>> new API user accessing a field called "->priv_*" or
>> "->private_*". Someone will review those patches...
>
> An internal struct with all the members meant to be internal-only
> provides nearly all the advantages that I was going for with the
> opaque struct, while also being a smaller change than what I was
> thinking of doing.  I like that idea; thanks for the suggestion.

Yeah, just to provide an explicit example something like the below. It
compiles to the same assembly (at least under -O3, didn't exhaustively
try other optimization levels).

I'm rather "meh" on it v.s. just prefixing the relevant member names
with "priv_" or "private_", but it results in the same semantics &
machine code, so it's effectively just a way of doing the labeling for
human consumption.

diff --git a/dir.c b/dir.c
index 39fce3bcba7..a714640e782 100644
--- a/dir.c
+++ b/dir.c
@@ -1533,12 +1533,12 @@ static void prep_exclude(struct dir_struct *dir,
 	 * which originate from directories not in the prefix of the
 	 * path being checked.
 	 */
-	while ((stk = dir->exclude_stack) != NULL) {
+	while ((stk = dir->private.exclude_stack) != NULL) {
 		if (stk->baselen <= baselen &&
 		    !strncmp(dir->basebuf.buf, base, stk->baselen))
 			break;
-		pl = &group->pl[dir->exclude_stack->exclude_ix];
-		dir->exclude_stack = stk->prev;
+		pl = &group->pl[dir->private.exclude_stack->exclude_ix];
+		dir->private.exclude_stack = stk->prev;
 		dir->pattern = NULL;
 		free((char *)pl->src); /* see strbuf_detach() below */
 		clear_pattern_list(pl);
@@ -1584,7 +1584,7 @@ static void prep_exclude(struct dir_struct *dir,
 						 base + current,
 						 cp - base - current);
 		}
-		stk->prev = dir->exclude_stack;
+		stk->prev = dir->private.exclude_stack;
 		stk->baselen = cp - base;
 		stk->exclude_ix = group->nr;
 		stk->ucd = untracked;
@@ -1605,7 +1605,7 @@ static void prep_exclude(struct dir_struct *dir,
 			    dir->pattern->flags & PATTERN_FLAG_NEGATIVE)
 				dir->pattern = NULL;
 			if (dir->pattern) {
-				dir->exclude_stack = stk;
+				dir->private.exclude_stack = stk;
 				return;
 			}
 		}
@@ -1662,7 +1662,7 @@ static void prep_exclude(struct dir_struct *dir,
 			invalidate_gitignore(dir->untracked, untracked);
 			oidcpy(&untracked->exclude_oid, &oid_stat.oid);
 		}
-		dir->exclude_stack = stk;
+		dir->private.exclude_stack = stk;
 		current = stk->baselen;
 	}
 	strbuf_setlen(&dir->basebuf, baselen);
@@ -3302,7 +3302,7 @@ void dir_clear(struct dir_struct *dir)
 	free(dir->ignored);
 	free(dir->entries);
 
-	stk = dir->exclude_stack;
+	stk = dir->private.exclude_stack;
 	while (stk) {
 		struct exclude_stack *prev = stk->prev;
 		free(stk);
diff --git a/dir.h b/dir.h
index 83f46c0fb4c..d30d294308d 100644
--- a/dir.h
+++ b/dir.h
@@ -209,6 +209,11 @@ struct untracked_cache {
  * record the paths discovered. A single `struct dir_struct` is used regardless
  * of whether or not the traversal recursively descends into subdirectories.
  */
+
+struct dir_struct_private {
+	struct exclude_stack *exclude_stack;
+};
+
 struct dir_struct {
 
 	/* The number of members in `entries[]` array. */
@@ -327,7 +332,7 @@ struct dir_struct {
 	 * (sub)directory in the traversal. Exclude points to the
 	 * matching exclude struct if the directory is excluded.
 	 */
-	struct exclude_stack *exclude_stack;
+	struct dir_struct_private private;
 	struct path_pattern *pattern;
 	struct strbuf basebuf;
Elijah Newren Oct. 4, 2021, 2:57 p.m. UTC | #15
On Mon, Oct 4, 2021 at 7:12 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Mon, Oct 04 2021, Elijah Newren wrote:
>
> > On Sat, Oct 2, 2021 at 2:07 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
> >>
> >> On Fri, Oct 01 2021, Elijah Newren wrote:
> >>
> > ...
> >> > So maybe I'll submit some patches on top that rip these direct members
> >> > out of of unpack_trees_options and push them inside some opaque
> >> > struct.
> >>
> >> Sure, that sounds good. I only had a mild objection to doing it in a way
> >> where you'll need that sort of code I removed in the linked commit in
> >> prep_exclude() because you were trying not to expose that at any cost,
> >> including via some *_INIT macro. I.e. if it's private we can just name
> >> it "priv_*" or have a :
> >>
> >>     struct dont_touch_this {
> >>         struct dir_struct dir;
> >>     };
> >>
> >> Which are both ways of /messaging/ that it's private, and since the
> >> target audience is just the rest of the git.git codebase I think that
> >> ultimately something that 1) sends the right message 2) makes accidents
> >> pretty much impossible suffices. I.e. you don't accidentally introduce a
> >> new API user accessing a field called "->priv_*" or
> >> "->private_*". Someone will review those patches...
> >
> > An internal struct with all the members meant to be internal-only
> > provides nearly all the advantages that I was going for with the
> > opaque struct, while also being a smaller change than what I was
> > thinking of doing.  I like that idea; thanks for the suggestion.
>
> Yeah, just to provide an explicit example something like the below. It
> compiles to the same assembly (at least under -O3, didn't exhaustively
> try other optimization levels).
>
> I'm rather "meh" on it v.s. just prefixing the relevant member names
> with "priv_" or "private_", but it results in the same semantics &
> machine code, so it's effectively just a way of doing the labeling for
> human consumption.
>
> diff --git a/dir.c b/dir.c
> index 39fce3bcba7..a714640e782 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -1533,12 +1533,12 @@ static void prep_exclude(struct dir_struct *dir,
>          * which originate from directories not in the prefix of the
>          * path being checked.
>          */
> -       while ((stk = dir->exclude_stack) != NULL) {
> +       while ((stk = dir->private.exclude_stack) != NULL) {
>                 if (stk->baselen <= baselen &&
>                     !strncmp(dir->basebuf.buf, base, stk->baselen))
>                         break;
> -               pl = &group->pl[dir->exclude_stack->exclude_ix];
> -               dir->exclude_stack = stk->prev;
> +               pl = &group->pl[dir->private.exclude_stack->exclude_ix];
> +               dir->private.exclude_stack = stk->prev;
>                 dir->pattern = NULL;
>                 free((char *)pl->src); /* see strbuf_detach() below */
>                 clear_pattern_list(pl);
> @@ -1584,7 +1584,7 @@ static void prep_exclude(struct dir_struct *dir,
>                                                  base + current,
>                                                  cp - base - current);
>                 }
> -               stk->prev = dir->exclude_stack;
> +               stk->prev = dir->private.exclude_stack;
>                 stk->baselen = cp - base;
>                 stk->exclude_ix = group->nr;
>                 stk->ucd = untracked;
> @@ -1605,7 +1605,7 @@ static void prep_exclude(struct dir_struct *dir,
>                             dir->pattern->flags & PATTERN_FLAG_NEGATIVE)
>                                 dir->pattern = NULL;
>                         if (dir->pattern) {
> -                               dir->exclude_stack = stk;
> +                               dir->private.exclude_stack = stk;
>                                 return;
>                         }
>                 }
> @@ -1662,7 +1662,7 @@ static void prep_exclude(struct dir_struct *dir,
>                         invalidate_gitignore(dir->untracked, untracked);
>                         oidcpy(&untracked->exclude_oid, &oid_stat.oid);
>                 }
> -               dir->exclude_stack = stk;
> +               dir->private.exclude_stack = stk;
>                 current = stk->baselen;
>         }
>         strbuf_setlen(&dir->basebuf, baselen);
> @@ -3302,7 +3302,7 @@ void dir_clear(struct dir_struct *dir)
>         free(dir->ignored);
>         free(dir->entries);
>
> -       stk = dir->exclude_stack;
> +       stk = dir->private.exclude_stack;
>         while (stk) {
>                 struct exclude_stack *prev = stk->prev;
>                 free(stk);
> diff --git a/dir.h b/dir.h
> index 83f46c0fb4c..d30d294308d 100644
> --- a/dir.h
> +++ b/dir.h
> @@ -209,6 +209,11 @@ struct untracked_cache {
>   * record the paths discovered. A single `struct dir_struct` is used regardless
>   * of whether or not the traversal recursively descends into subdirectories.
>   */
> +
> +struct dir_struct_private {
> +       struct exclude_stack *exclude_stack;
> +};
> +
>  struct dir_struct {
>
>         /* The number of members in `entries[]` array. */
> @@ -327,7 +332,7 @@ struct dir_struct {
>          * (sub)directory in the traversal. Exclude points to the
>          * matching exclude struct if the directory is excluded.
>          */
> -       struct exclude_stack *exclude_stack;
> +       struct dir_struct_private private;
>         struct path_pattern *pattern;
>         struct strbuf basebuf;

Yeah, that doesn't help much at all, and I'd argue even makes things
worse, because you're just looking at a single member.  This subtly
implies that all the other private variables are public API.  The
dir.h portion of the patch should look more like this:

$ git diff -w dir.h
diff --git a/dir.h b/dir.h
index 83f46c0fb4..93a9f02688 100644
--- a/dir.h
+++ b/dir.h
@@ -214,14 +214,9 @@ struct dir_struct {
        /* The number of members in `entries[]` array. */
        int nr;

-       /* Internal use; keeps track of allocation of `entries[]` array.*/
-       int alloc;
-
        /* The number of members in `ignored[]` array. */
        int ignored_nr;

-       int ignored_alloc;
-
        /* bit-field of options */
        enum {

@@ -301,11 +296,19 @@ struct dir_struct {
         */
        const char *exclude_per_dir;

+       struct dir_struct_internal {
+               /* Keeps track of allocation of `entries[]` array.*/
+               int alloc;
+
+               /* Keeps track of allocation of `ignored[]` array. */
+               int ignored_alloc;
+
                /*
                 * We maintain three groups of exclude pattern lists:
                 *
                 * EXC_CMDL lists patterns explicitly given on the command line.
-        * EXC_DIRS lists patterns obtained from per-directory ignore files.
+                * EXC_DIRS lists patterns obtained from per-directory ignore
+                *          files.
                 * EXC_FILE lists patterns from fallback ignore files, e.g.
                 *   - .git/info/exclude
                 *   - core.excludesfile
@@ -340,6 +343,7 @@ struct dir_struct {
                /* Stats about the traversal */
                unsigned visited_paths;
                unsigned visited_directories;
+       } internal;
 };

 #define DIR_INIT { 0 }


The above change would make it clear that there are 12 variables meant
for use only within dir.c that external callers should not be
initializing or reading for output after the fact -- and only 6 that
are part of the public API that they need worry about.  It also makes
it easier for folks messing with dir.c to know which parts are just
internal state management, which I think would have made it easier to
understand the weird basebuf/exclude_stack stuff in prep_exclude()
that you nicely tracked down.  But overall, I'm really most happy
about the part of this patch that lets external callers realize they
only need to worry about 6 out of 18 fields and that they can ignore
the rest.

unpack_trees_options should have something similar done with it, and
maybe some others.
diff mbox series

Patch

diff --git a/builtin/am.c b/builtin/am.c
index e4a0ff9cd7c..1ee70692bc3 100644
--- a/builtin/am.c
+++ b/builtin/am.c
@@ -1918,6 +1918,9 @@  static int fast_forward_to(struct tree *head, struct tree *remote, int reset)
 	opts.update = 1;
 	opts.merge = 1;
 	opts.reset = reset;
+	if (!reset)
+		/* FIXME: Default should be to remove ignored files */
+		opts.preserve_ignored = 1;
 	opts.fn = twoway_merge;
 	init_tree_desc(&t[0], head->buffer, head->size);
 	init_tree_desc(&t[1], remote->buffer, remote->size);
diff --git a/builtin/checkout.c b/builtin/checkout.c
index 5335435d616..5e7957dd068 100644
--- a/builtin/checkout.c
+++ b/builtin/checkout.c
@@ -648,6 +648,7 @@  static int reset_tree(struct tree *tree, const struct checkout_opts *o,
 	opts.skip_unmerged = !worktree;
 	opts.reset = 1;
 	opts.merge = 1;
+	opts.preserve_ignored = 0;
 	opts.fn = oneway_merge;
 	opts.verbose_update = o->show_progress;
 	opts.src_index = &the_index;
@@ -746,11 +747,7 @@  static int merge_working_tree(const struct checkout_opts *opts,
 				       new_branch_info->commit ?
 				       &new_branch_info->commit->object.oid :
 				       &new_branch_info->oid, NULL);
-		if (opts->overwrite_ignore) {
-			topts.dir = xcalloc(1, sizeof(*topts.dir));
-			topts.dir->flags |= DIR_SHOW_IGNORED;
-			setup_standard_excludes(topts.dir);
-		}
+		topts.preserve_ignored = !opts->overwrite_ignore;
 		tree = parse_tree_indirect(old_branch_info->commit ?
 					   &old_branch_info->commit->object.oid :
 					   the_hash_algo->empty_tree);
@@ -760,10 +757,6 @@  static int merge_working_tree(const struct checkout_opts *opts,
 		init_tree_desc(&trees[1], tree->buffer, tree->size);
 
 		ret = unpack_trees(2, trees, &topts);
-		if (topts.dir) {
-			dir_clear(topts.dir);
-			FREE_AND_NULL(topts.dir);
-		}
 		clear_unpack_trees_porcelain(&topts);
 		if (ret == -1) {
 			/*
diff --git a/builtin/clone.c b/builtin/clone.c
index ff1d3d447a3..be1c3840d62 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -687,6 +687,8 @@  static int checkout(int submodule_progress)
 	opts.update = 1;
 	opts.merge = 1;
 	opts.clone = 1;
+	/* FIXME: Default should be to remove ignored files */
+	opts.preserve_ignored = 1;
 	opts.fn = oneway_merge;
 	opts.verbose_update = (option_verbosity >= 0);
 	opts.src_index = &the_index;
diff --git a/builtin/merge.c b/builtin/merge.c
index 3fbdacc7db4..1e5fff095fc 100644
--- a/builtin/merge.c
+++ b/builtin/merge.c
@@ -680,6 +680,8 @@  static int read_tree_trivial(struct object_id *common, struct object_id *head,
 	opts.verbose_update = 1;
 	opts.trivial_merges_only = 1;
 	opts.merge = 1;
+	/* FIXME: Default should be to remove ignored files */
+	opts.preserve_ignored = 1;
 	trees[nr_trees] = parse_tree_indirect(common);
 	if (!trees[nr_trees++])
 		return -1;
diff --git a/builtin/read-tree.c b/builtin/read-tree.c
index 73cb957a69b..443d206eca6 100644
--- a/builtin/read-tree.c
+++ b/builtin/read-tree.c
@@ -201,11 +201,9 @@  int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
 	if ((opts.update || opts.index_only) && !opts.merge)
 		die("%s is meaningless without -m, --reset, or --prefix",
 		    opts.update ? "-u" : "-i");
-	if (opts.update && !opts.reset) {
-		CALLOC_ARRAY(opts.dir, 1);
-		opts.dir->flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(opts.dir);
-	}
+	if (opts.update && !opts.reset)
+		opts.preserve_ignored = 0;
+	/* otherwise, opts.preserve_ignored is irrelevant */
 	if (opts.merge && !opts.index_only)
 		setup_work_tree();
 
@@ -245,11 +243,6 @@  int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix)
 	if (unpack_trees(nr_trees, t, &opts))
 		return 128;
 
-	if (opts.dir) {
-		dir_clear(opts.dir);
-		FREE_AND_NULL(opts.dir);
-	}
-
 	if (opts.debug_unpack || opts.dry_run)
 		return 0; /* do not write the index out */
 
diff --git a/builtin/reset.c b/builtin/reset.c
index 51c9e2f43ff..7f38656f018 100644
--- a/builtin/reset.c
+++ b/builtin/reset.c
@@ -67,6 +67,8 @@  static int reset_index(const char *ref, const struct object_id *oid, int reset_t
 	case KEEP:
 	case MERGE:
 		opts.update = 1;
+		/* FIXME: Default should be to remove ignored files */
+		opts.preserve_ignored = 1;
 		break;
 	case HARD:
 		opts.update = 1;
diff --git a/builtin/stash.c b/builtin/stash.c
index 8f42360ca91..88287b890d5 100644
--- a/builtin/stash.c
+++ b/builtin/stash.c
@@ -258,6 +258,9 @@  static int reset_tree(struct object_id *i_tree, int update, int reset)
 	opts.merge = 1;
 	opts.reset = reset;
 	opts.update = update;
+	if (update && !reset)
+		/* FIXME: Default should be to remove ignored files */
+		opts.preserve_ignored = 1;
 	opts.fn = oneway_merge;
 
 	if (unpack_trees(nr_trees, t, &opts))
diff --git a/merge-ort.c b/merge-ort.c
index 35aa979c3a4..0d64ec716bd 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -4045,11 +4045,7 @@  static int checkout(struct merge_options *opt,
 	unpack_opts.quiet = 0; /* FIXME: sequencer might want quiet? */
 	unpack_opts.verbose_update = (opt->verbosity > 2);
 	unpack_opts.fn = twoway_merge;
-	if (1/* FIXME: opts->overwrite_ignore*/) {
-		CALLOC_ARRAY(unpack_opts.dir, 1);
-		unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(unpack_opts.dir);
-	}
+	unpack_opts.preserve_ignored = 0; /* FIXME: !opts->overwrite_ignore*/
 	parse_tree(prev);
 	init_tree_desc(&trees[0], prev->buffer, prev->size);
 	parse_tree(next);
@@ -4057,8 +4053,6 @@  static int checkout(struct merge_options *opt,
 
 	ret = unpack_trees(2, trees, &unpack_opts);
 	clear_unpack_trees_porcelain(&unpack_opts);
-	dir_clear(unpack_opts.dir);
-	FREE_AND_NULL(unpack_opts.dir);
 	return ret;
 }
 
diff --git a/merge-recursive.c b/merge-recursive.c
index 233d9f686ad..2be3f5d4044 100644
--- a/merge-recursive.c
+++ b/merge-recursive.c
@@ -411,9 +411,7 @@  static int unpack_trees_start(struct merge_options *opt,
 	else {
 		opt->priv->unpack_opts.update = 1;
 		/* FIXME: should only do this if !overwrite_ignore */
-		CALLOC_ARRAY(opt->priv->unpack_opts.dir, 1);
-		opt->priv->unpack_opts.dir->flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(opt->priv->unpack_opts.dir);
+		opt->priv->unpack_opts.preserve_ignored = 0;
 	}
 	opt->priv->unpack_opts.merge = 1;
 	opt->priv->unpack_opts.head_idx = 2;
@@ -428,10 +426,6 @@  static int unpack_trees_start(struct merge_options *opt,
 	init_tree_desc_from_tree(t+2, merge);
 
 	rc = unpack_trees(3, t, &opt->priv->unpack_opts);
-	if (opt->priv->unpack_opts.dir) {
-		dir_clear(opt->priv->unpack_opts.dir);
-		FREE_AND_NULL(opt->priv->unpack_opts.dir);
-	}
 	cache_tree_free(&opt->repo->index->cache_tree);
 
 	/*
diff --git a/merge.c b/merge.c
index 6e736881d90..2382ff66d35 100644
--- a/merge.c
+++ b/merge.c
@@ -53,7 +53,6 @@  int checkout_fast_forward(struct repository *r,
 	struct unpack_trees_options opts;
 	struct tree_desc t[MAX_UNPACK_TREES];
 	int i, nr_trees = 0;
-	struct dir_struct dir = DIR_INIT;
 	struct lock_file lock_file = LOCK_INIT;
 
 	refresh_index(r->index, REFRESH_QUIET, NULL, NULL, NULL);
@@ -80,11 +79,7 @@  int checkout_fast_forward(struct repository *r,
 	}
 
 	memset(&opts, 0, sizeof(opts));
-	if (overwrite_ignore) {
-		dir.flags |= DIR_SHOW_IGNORED;
-		setup_standard_excludes(&dir);
-		opts.dir = &dir;
-	}
+	opts.preserve_ignored = !overwrite_ignore;
 
 	opts.head_idx = 1;
 	opts.src_index = r->index;
@@ -101,7 +96,6 @@  int checkout_fast_forward(struct repository *r,
 		clear_unpack_trees_porcelain(&opts);
 		return -1;
 	}
-	dir_clear(&dir);
 	clear_unpack_trees_porcelain(&opts);
 
 	if (write_locked_index(r->index, &lock_file, COMMIT_LOCK))
diff --git a/reset.c b/reset.c
index 79310ae071b..41b3e2d88de 100644
--- a/reset.c
+++ b/reset.c
@@ -56,6 +56,8 @@  int reset_head(struct repository *r, struct object_id *oid, const char *action,
 	unpack_tree_opts.fn = reset_hard ? oneway_merge : twoway_merge;
 	unpack_tree_opts.update = 1;
 	unpack_tree_opts.merge = 1;
+	/* FIXME: Default should be to remove ignored files */
+	unpack_tree_opts.preserve_ignored = 1;
 	init_checkout_metadata(&unpack_tree_opts.meta, switch_to_branch, oid, NULL);
 	if (!detach_head)
 		unpack_tree_opts.reset = 1;
diff --git a/sequencer.c b/sequencer.c
index 614d56f5e21..098566c68d9 100644
--- a/sequencer.c
+++ b/sequencer.c
@@ -3699,6 +3699,8 @@  static int do_reset(struct repository *r,
 	unpack_tree_opts.fn = oneway_merge;
 	unpack_tree_opts.merge = 1;
 	unpack_tree_opts.update = 1;
+	/* FIXME: Default should be to remove ignored files */
+	unpack_tree_opts.preserve_ignored = 1;
 	init_checkout_metadata(&unpack_tree_opts.meta, name, &oid, NULL);
 
 	if (repo_read_index_unmerged(r)) {
diff --git a/unpack-trees.c b/unpack-trees.c
index 8ea0a542da8..1e4eae1dc7d 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1707,6 +1707,12 @@  int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		ensure_full_index(o->dst_index);
 	}
 
+	if (!o->preserve_ignored) {
+		CALLOC_ARRAY(o->dir, 1);
+		o->dir->flags |= DIR_SHOW_IGNORED;
+		setup_standard_excludes(o->dir);
+	}
+
 	if (!core_apply_sparse_checkout || !o->update)
 		o->skip_sparse_checkout = 1;
 	if (!o->skip_sparse_checkout && !o->pl) {
@@ -1868,6 +1874,10 @@  int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 done:
 	if (free_pattern_list)
 		clear_pattern_list(&pl);
+	if (o->dir) {
+		dir_clear(o->dir);
+		FREE_AND_NULL(o->dir);
+	}
 	trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
 	trace_performance_leave("unpack_trees");
 	return ret;
diff --git a/unpack-trees.h b/unpack-trees.h
index 2d88b19dca7..f98cfd49d7b 100644
--- a/unpack-trees.h
+++ b/unpack-trees.h
@@ -49,6 +49,7 @@  struct unpack_trees_options {
 	unsigned int reset,
 		     merge,
 		     update,
+		     preserve_ignored,
 		     clone,
 		     index_only,
 		     nontrivial_merge,