diff mbox series

[02/14] combine-diff: add combine_diff_path_new()

Message ID 20250109083236.GB2748836@coredump.intra.peff.net (mailing list archive)
State New
Headers show
Series combine-diff cleanups | expand

Commit Message

Jeff King Jan. 9, 2025, 8:32 a.m. UTC
The combine_diff_path struct has variable size, since it embeds both the
memory allocation for the path field as well as a variable-sized parent
array. This makes allocating one a bit tricky.

We have a helper to compute the required size, but it's up to individual
sites to actually initialize all of the fields. Let's provide a
constructor function to make that a little nicer. Besides being shorter,
it also hides away tricky bits like the computation of the "path"
pointer (which is right after the "parent" flex array).

As a bonus, using the same constructor everywhere means that we'll
consistently initialize all parts of the struct. A few code paths left
the parent array unitialized. This didn't cause any bugs, but we'll be
able to simplify some code in the next few patches knowing that the
parent fields have all been zero'd.

This also gets rid of some questionable uses of "int" to store buffer
lengths. Though we do use them to allocate, I don't think there are any
integer overflow vulnerabilities here (the allocation helper promotes
them to size_t and checks arithmetic for overflow, and the actual memcpy
of the bytes is done using the possibly-truncated "int" value).

Sadly we can't use the FLEX_* macros to simplify the allocation here,
because there are two variable-sized parts to the struct (and those
macros only handle one).

Nor can we get stop publicly declaring combine_diff_path_size(). This
patch does not touch the code in path_appendnew() at all, which is not
ready to be moved to our new constructor for a few reasons:

  - path_appendnew() has a memory-reuse optimization where it tries to
    reuse combine_diff_path structs rather than freeing and
    reallocating.

  - path_appendnew() does not create the struct from a single path
    string, but rather allocates and copies into the buffer from
    multiple sources.

These can be addressed by some refactoring, but let's leave it as-is for
now.

Signed-off-by: Jeff King <peff@peff.net>
---
 combine-diff.c | 40 ++++++++++++++++++++++++++--------------
 diff-lib.c     | 29 ++++++-----------------------
 diff.h         |  5 +++++
 3 files changed, 37 insertions(+), 37 deletions(-)

Comments

Junio C Hamano Jan. 9, 2025, 6:05 p.m. UTC | #1
Jeff King <peff@peff.net> writes:

> +struct combine_diff_path *combine_diff_path_new(const char *path,
> +						size_t path_len,
> +						unsigned int mode,
> +						const struct object_id *oid,
> +						size_t num_parents)
> +{
> +	struct combine_diff_path *p;
> +
> +	p = xmalloc(combine_diff_path_size(num_parents, path_len));
> +	p->path = (char *)&(p->parent[num_parents]);
> +	memcpy(p->path, path, path_len);
> +	p->path[path_len] = 0;
> +	p->next = NULL;
> +	p->mode = mode;
> +	oidcpy(&p->oid, oid);
> +
> +	memset(p->parent, 0, sizeof(p->parent[0]) * num_parents);
> +
> +	return p;
> +}

OK, I can see how the structure is laid out clearly in this code,
but I have to say it is one ugly hack X-<.  At least with the
refactoring, it becomes much easier to see what the caller is doing.
Patrick Steinhardt Jan. 13, 2025, 3:40 p.m. UTC | #2
On Thu, Jan 09, 2025 at 03:32:36AM -0500, Jeff King wrote:
> The combine_diff_path struct has variable size, since it embeds both the
> memory allocation for the path field as well as a variable-sized parent
> array. This makes allocating one a bit tricky.
> 
> We have a helper to compute the required size, but it's up to individual
> sites to actually initialize all of the fields. Let's provide a
> constructor function to make that a little nicer. Besides being shorter,
> it also hides away tricky bits like the computation of the "path"
> pointer (which is right after the "parent" flex array).
> 
> As a bonus, using the same constructor everywhere means that we'll
> consistently initialize all parts of the struct. A few code paths left
> the parent array unitialized. This didn't cause any bugs, but we'll be
> able to simplify some code in the next few patches knowing that the
> parent fields have all been zero'd.
> 
> This also gets rid of some questionable uses of "int" to store buffer
> lengths. Though we do use them to allocate, I don't think there are any
> integer overflow vulnerabilities here (the allocation helper promotes
> them to size_t and checks arithmetic for overflow, and the actual memcpy
> of the bytes is done using the possibly-truncated "int" value).
> 
> Sadly we can't use the FLEX_* macros to simplify the allocation here,
> because there are two variable-sized parts to the struct (and those
> macros only handle one).
> 
> Nor can we get stop publicly declaring combine_diff_path_size(). This

s/we get stop/we stop/

> diff --git a/combine-diff.c b/combine-diff.c
> index 641bc92dbd..45548fd438 100644
> --- a/combine-diff.c
> +++ b/combine-diff.c
> @@ -47,22 +47,13 @@ static struct combine_diff_path *intersect_paths(
>  
>  	if (!n) {
>  		for (i = 0; i < q->nr; i++) {
> -			int len;
> -			const char *path;
>  			if (diff_unmodified_pair(q->queue[i]))
>  				continue;
> -			path = q->queue[i]->two->path;
> -			len = strlen(path);
> -			p = xmalloc(combine_diff_path_size(num_parent, len));
> -			p->path = (char *) &(p->parent[num_parent]);
> -			memcpy(p->path, path, len);
> -			p->path[len] = 0;
> -			p->next = NULL;
> -			memset(p->parent, 0,
> -			       sizeof(p->parent[0]) * num_parent);
> -
> -			oidcpy(&p->oid, &q->queue[i]->two->oid);
> -			p->mode = q->queue[i]->two->mode;
> +			p = combine_diff_path_new(q->queue[i]->two->path,
> +						  strlen(q->queue[i]->two->path),
> +						  q->queue[i]->two->mode,
> +						  &q->queue[i]->two->oid,
> +						  num_parent);
>  			oidcpy(&p->parent[n].oid, &q->queue[i]->one->oid);
>  			p->parent[n].mode = q->queue[i]->one->mode;
>  			p->parent[n].status = q->queue[i]->status;
> @@ -1667,3 +1658,24 @@ void diff_tree_combined_merge(const struct commit *commit,
>  	diff_tree_combined(&commit->object.oid, &parents, rev);
>  	oid_array_clear(&parents);
>  }
> +
> +struct combine_diff_path *combine_diff_path_new(const char *path,
> +						size_t path_len,
> +						unsigned int mode,
> +						const struct object_id *oid,
> +						size_t num_parents)
> +{
> +	struct combine_diff_path *p;
> +
> +	p = xmalloc(combine_diff_path_size(num_parents, path_len));
> +	p->path = (char *)&(p->parent[num_parents]);
> +	memcpy(p->path, path, path_len);
> +	p->path[path_len] = 0;
> +	p->next = NULL;
> +	p->mode = mode;
> +	oidcpy(&p->oid, oid);
> +
> +	memset(p->parent, 0, sizeof(p->parent[0]) * num_parents);
> +
> +	return p;
> +}

If I were to write this anew I'd probably use `xcalloc()` instead of
manually `memset()`ing parts of it to zero. But it's a faithful
transplant of the code from `intersect_paths()`, so that's probably
okay.

Patrick
Jeff King Jan. 14, 2025, 9:29 a.m. UTC | #3
On Mon, Jan 13, 2025 at 04:40:19PM +0100, Patrick Steinhardt wrote:

> > +struct combine_diff_path *combine_diff_path_new(const char *path,
> > +						size_t path_len,
> > +						unsigned int mode,
> > +						const struct object_id *oid,
> > +						size_t num_parents)
> > +{
> > +	struct combine_diff_path *p;
> > +
> > +	p = xmalloc(combine_diff_path_size(num_parents, path_len));
> > +	p->path = (char *)&(p->parent[num_parents]);
> > +	memcpy(p->path, path, path_len);
> > +	p->path[path_len] = 0;
> > +	p->next = NULL;
> > +	p->mode = mode;
> > +	oidcpy(&p->oid, oid);
> > +
> > +	memset(p->parent, 0, sizeof(p->parent[0]) * num_parents);
> > +
> > +	return p;
> > +}
> 
> If I were to write this anew I'd probably use `xcalloc()` instead of
> manually `memset()`ing parts of it to zero. But it's a faithful
> transplant of the code from `intersect_paths()`, so that's probably
> okay.

Yeah, I actually wrote it that way originally (thinking the issue was
that we were leaving uninitialized fields all over), before realizing
that most callers were explicitly zero-ing the parents. So I went for
the minimal change.

From an efficiency standpoint, I don't know that it matters much between
the two (xcalloc would zero some fields which we're going to assign
anyway, but the zeroing may be more efficient on the backend). xcalloc
means you'd never forget to initialize any part of it, so maybe it's
more readable / less error prone?

We could do a patch on top, but I doubt it's a big deal either way.

-Peff
diff mbox series

Patch

diff --git a/combine-diff.c b/combine-diff.c
index 641bc92dbd..45548fd438 100644
--- a/combine-diff.c
+++ b/combine-diff.c
@@ -47,22 +47,13 @@  static struct combine_diff_path *intersect_paths(
 
 	if (!n) {
 		for (i = 0; i < q->nr; i++) {
-			int len;
-			const char *path;
 			if (diff_unmodified_pair(q->queue[i]))
 				continue;
-			path = q->queue[i]->two->path;
-			len = strlen(path);
-			p = xmalloc(combine_diff_path_size(num_parent, len));
-			p->path = (char *) &(p->parent[num_parent]);
-			memcpy(p->path, path, len);
-			p->path[len] = 0;
-			p->next = NULL;
-			memset(p->parent, 0,
-			       sizeof(p->parent[0]) * num_parent);
-
-			oidcpy(&p->oid, &q->queue[i]->two->oid);
-			p->mode = q->queue[i]->two->mode;
+			p = combine_diff_path_new(q->queue[i]->two->path,
+						  strlen(q->queue[i]->two->path),
+						  q->queue[i]->two->mode,
+						  &q->queue[i]->two->oid,
+						  num_parent);
 			oidcpy(&p->parent[n].oid, &q->queue[i]->one->oid);
 			p->parent[n].mode = q->queue[i]->one->mode;
 			p->parent[n].status = q->queue[i]->status;
@@ -1667,3 +1658,24 @@  void diff_tree_combined_merge(const struct commit *commit,
 	diff_tree_combined(&commit->object.oid, &parents, rev);
 	oid_array_clear(&parents);
 }
+
+struct combine_diff_path *combine_diff_path_new(const char *path,
+						size_t path_len,
+						unsigned int mode,
+						const struct object_id *oid,
+						size_t num_parents)
+{
+	struct combine_diff_path *p;
+
+	p = xmalloc(combine_diff_path_size(num_parents, path_len));
+	p->path = (char *)&(p->parent[num_parents]);
+	memcpy(p->path, path, path_len);
+	p->path[path_len] = 0;
+	p->next = NULL;
+	p->mode = mode;
+	oidcpy(&p->oid, oid);
+
+	memset(p->parent, 0, sizeof(p->parent[0]) * num_parents);
+
+	return p;
+}
diff --git a/diff-lib.c b/diff-lib.c
index 85b8f1fa59..471ef99614 100644
--- a/diff-lib.c
+++ b/diff-lib.c
@@ -153,7 +153,6 @@  void run_diff_files(struct rev_info *revs, unsigned int option)
 			struct diff_filepair *pair;
 			unsigned int wt_mode = 0;
 			int num_compare_stages = 0;
-			size_t path_len;
 			struct stat st;
 
 			changed = check_removed(ce, &st);
@@ -167,18 +166,8 @@  void run_diff_files(struct rev_info *revs, unsigned int option)
 				wt_mode = 0;
 			}
 
-			path_len = ce_namelen(ce);
-
-			dpath = xmalloc(combine_diff_path_size(5, path_len));
-			dpath->path = (char *) &(dpath->parent[5]);
-
-			dpath->next = NULL;
-			memcpy(dpath->path, ce->name, path_len);
-			dpath->path[path_len] = '\0';
-			oidclr(&dpath->oid, the_repository->hash_algo);
-			dpath->mode = wt_mode;
-			memset(&(dpath->parent[0]), 0,
-			       sizeof(struct combine_diff_parent)*5);
+			dpath = combine_diff_path_new(ce->name, ce_namelen(ce),
+						      wt_mode, null_oid(), 5);
 
 			while (i < entries) {
 				struct cache_entry *nce = istate->cache[i];
@@ -405,16 +394,10 @@  static int show_modified(struct rev_info *revs,
 	if (revs->combine_merges && !cached &&
 	    (!oideq(oid, &old_entry->oid) || !oideq(&old_entry->oid, &new_entry->oid))) {
 		struct combine_diff_path *p;
-		int pathlen = ce_namelen(new_entry);
-
-		p = xmalloc(combine_diff_path_size(2, pathlen));
-		p->path = (char *) &p->parent[2];
-		p->next = NULL;
-		memcpy(p->path, new_entry->name, pathlen);
-		p->path[pathlen] = 0;
-		p->mode = mode;
-		oidclr(&p->oid, the_repository->hash_algo);
-		memset(p->parent, 0, 2 * sizeof(struct combine_diff_parent));
+
+		p = combine_diff_path_new(new_entry->name,
+					  ce_namelen(new_entry),
+					  mode, null_oid(), 2);
 		p->parent[0].status = DIFF_STATUS_MODIFIED;
 		p->parent[0].mode = new_entry->ce_mode;
 		oidcpy(&p->parent[0].oid, &new_entry->oid);
diff --git a/diff.h b/diff.h
index 6e6007c17b..5cddd5a870 100644
--- a/diff.h
+++ b/diff.h
@@ -486,6 +486,11 @@  struct combine_diff_path {
 #define combine_diff_path_size(n, l) \
 	st_add4(sizeof(struct combine_diff_path), (l), 1, \
 		st_mult(sizeof(struct combine_diff_parent), (n)))
+struct combine_diff_path *combine_diff_path_new(const char *path,
+						size_t path_len,
+						unsigned int mode,
+						const struct object_id *oid,
+						size_t num_parents);
 
 void show_combined_diff(struct combine_diff_path *elem, int num_parent,
 			struct rev_info *);