[v2,20/24] fast-import: permit reading multiple marks files
diff mbox series

Message ID 20200222201749.937983-21-sandals@crustytoothpaste.net
State New
Headers show
Series
  • SHA-256 stage 4 implementation, part 1/3
Related show

Commit Message

brian m. carlson Feb. 22, 2020, 8:17 p.m. UTC
In the future, we'll want to read marks files for submodules as well.
Refactor the existing code to make it possible to read multiple marks
files, each into their own marks set.

Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
---
 fast-import.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

Comments

Junio C Hamano June 5, 2020, 4:14 p.m. UTC | #1
"brian m. carlson" <sandals@crustytoothpaste.net> writes:

> In the future, we'll want to read marks files for submodules as well.
> Refactor the existing code to make it possible to read multiple marks
> files, each into their own marks set.
>
> Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
> ---
>  fast-import.c | 32 ++++++++++++++++++--------------
>  1 file changed, 18 insertions(+), 14 deletions(-)

This conversion looks a bit strange.  We are working off of 'marks'
that is file scope global, and the topmost caller passes the pointer
to that single instance down the callchain to this function (among
others) in "s", and it all looked no-op to my eye, but this actually
updates the global 'marks'.

> diff --git a/fast-import.c b/fast-import.c
> index b8b65a801c..b9ecd89699 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -493,25 +493,24 @@ static char *pool_strdup(const char *s)
>  	return r;
>  }
>  
> -static void insert_mark(uintmax_t idnum, struct object_entry *oe)
> +static void insert_mark(struct mark_set *s, uintmax_t idnum, struct object_entry *oe)
>  {
> -	struct mark_set *s = marks;

This is no longer needed as the caller gave the 'marks' to us in
's'.  Up to this line, this step is a no-op patch. 

>  	while ((idnum >> s->shift) >= 1024) {
>  		s = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));

Here we overwrite 's', and we lost the pointer the caller gave us!!!

>  		s->shift = marks->shift + 10;

And we still work off of global marks here and

>  		s->data.sets[0] = marks;
>  		marks = s;

try to swap it here.  But if our caller is also using 's' that was
passed from its caller, this assignment would not be seen by our
caller.  We'd need to pass a pointer to caller's 's' to this
function and update it around here.

>  	}
>  	while (s->shift) {
>  		uintmax_t i = idnum >> s->shift;
>  		idnum -= i << s->shift;
>  		if (!s->data.sets[i]) {
>  			s->data.sets[i] = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
>  			s->data.sets[i]->shift = s->shift - 10;
>  		}
>  		s = s->data.sets[i];
>  	}
>  	if (!s->data.marked[idnum])
>  		marks_set_count++;

I am not sure if this global also needs a separate counter and I
didn't bother to check.

>  	s->data.marked[idnum] = oe;
>  }

With this patch t9300 still seems to pass.

I do not know if this is the fix for Billes Tibor's "fast-import
oom" issue <c53bb69b-682d-3b47-4ed0-5f4559e69e37@gmx.com> but the
change to this function stood out.

Thanks.


 fast-import.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fast-import.c b/fast-import.c
index 0dfa14dc8c..f39b7890ac 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -534,13 +534,15 @@ static char *pool_strdup(const char *s)
 	return r;
 }
 
-static void insert_mark(struct mark_set *s, uintmax_t idnum, struct object_entry *oe)
+static void insert_mark(struct mark_set **sp, uintmax_t idnum, struct object_entry *oe)
 {
+	struct mark_set *s = *sp;
+
 	while ((idnum >> s->shift) >= 1024) {
 		s = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
-		s->shift = marks->shift + 10;
-		s->data.sets[0] = marks;
-		marks = s;
+		s->shift = (*sp)->shift + 10;
+		s->data.sets[0] = (*sp);
+		(*sp) = s;
 	}
 	while (s->shift) {
 		uintmax_t i = idnum >> s->shift;
@@ -958,7 +960,7 @@ static int store_object(
 
 	e = insert_object(&oid);
 	if (mark)
-		insert_mark(marks, mark, e);
+		insert_mark(&marks, mark, e);
 	if (e->idx.offset) {
 		duplicate_count_by_type[type]++;
 		return 1;
@@ -1156,7 +1158,7 @@ static void stream_blob(uintmax_t len, struct object_id *oidout, uintmax_t mark)
 	e = insert_object(&oid);
 
 	if (mark)
-		insert_mark(marks, mark, e);
+		insert_mark(&marks, mark, e);
 
 	if (e->idx.offset) {
 		duplicate_count_by_type[OBJ_BLOB]++;
@@ -1745,12 +1747,12 @@ static void insert_object_entry(struct mark_set *s, struct object_id *oid, uintm
 		e->pack_id = MAX_PACK_ID;
 		e->idx.offset = 1; /* just not zero! */
 	}
-	insert_mark(s, mark, e);
+	insert_mark(&s, mark, e);
 }
 
 static void insert_oid_entry(struct mark_set *s, struct object_id *oid, uintmax_t mark)
 {
-	insert_mark(s, mark, xmemdupz(oid, sizeof(*oid)));
+	insert_mark(&s, mark, xmemdupz(oid, sizeof(*oid)));
 }
 
 static void read_mark_file(struct mark_set *s, FILE *f, mark_set_inserter_t inserter)
@@ -3242,7 +3244,7 @@ static void parse_alias(void)
 		die(_("Expected 'to' command, got %s"), command_buf.buf);
 	e = find_object(&b.oid);
 	assert(e);
-	insert_mark(marks, next_mark, e);
+	insert_mark(&marks, next_mark, e);
 }
 
 static char* make_fast_import_path(const char *path)
Junio C Hamano June 5, 2020, 4:26 p.m. UTC | #2
Junio C Hamano <gitster@pobox.com> writes:

> @@ -1745,12 +1747,12 @@ static void insert_object_entry(struct mark_set *s, struct object_id *oid, uintm
>  		e->pack_id = MAX_PACK_ID;
>  		e->idx.offset = 1; /* just not zero! */
>  	}
> -	insert_mark(s, mark, e);
> +	insert_mark(&s, mark, e);
>  }
>  
>  static void insert_oid_entry(struct mark_set *s, struct object_id *oid, uintmax_t mark)
>  {
> -	insert_mark(s, mark, xmemdupz(oid, sizeof(*oid)));
> +	insert_mark(&s, mark, xmemdupz(oid, sizeof(*oid)));
>  }
>  

Actually the above "fix" still has the same issue; we may have made
the on-stack copy of 's' that are parameters these two functions see
as callees, but the caller of these two functions would not see the
effect of the munging done deep in the callchain by insert_mark().

The read_marks() function calls read_mark_file() with "marks", but I
think you'd need to pass pointer to that global singleton and teach
mark_set_inserter_t function to take a pointer to a pointer to
struct mark_set throughout, so that the instance the top-level
caller (read_marks in this callchain) gets updated.

Here is another try.

 fast-import.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/fast-import.c b/fast-import.c
index 0dfa14dc8c..ed87d6e380 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -150,7 +150,7 @@ struct recent_command {
 	char *buf;
 };
 
-typedef void (*mark_set_inserter_t)(struct mark_set *s, struct object_id *oid, uintmax_t mark);
+typedef void (*mark_set_inserter_t)(struct mark_set **s, struct object_id *oid, uintmax_t mark);
 typedef void (*each_mark_fn_t)(uintmax_t mark, void *obj, void *cbp);
 
 /* Configured limits on output */
@@ -534,13 +534,15 @@ static char *pool_strdup(const char *s)
 	return r;
 }
 
-static void insert_mark(struct mark_set *s, uintmax_t idnum, struct object_entry *oe)
+static void insert_mark(struct mark_set **sp, uintmax_t idnum, struct object_entry *oe)
 {
+	struct mark_set *s = *sp;
+
 	while ((idnum >> s->shift) >= 1024) {
 		s = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
-		s->shift = marks->shift + 10;
-		s->data.sets[0] = marks;
-		marks = s;
+		s->shift = (*sp)->shift + 10;
+		s->data.sets[0] = (*sp);
+		(*sp) = s;
 	}
 	while (s->shift) {
 		uintmax_t i = idnum >> s->shift;
@@ -958,7 +960,7 @@ static int store_object(
 
 	e = insert_object(&oid);
 	if (mark)
-		insert_mark(marks, mark, e);
+		insert_mark(&marks, mark, e);
 	if (e->idx.offset) {
 		duplicate_count_by_type[type]++;
 		return 1;
@@ -1156,7 +1158,7 @@ static void stream_blob(uintmax_t len, struct object_id *oidout, uintmax_t mark)
 	e = insert_object(&oid);
 
 	if (mark)
-		insert_mark(marks, mark, e);
+		insert_mark(&marks, mark, e);
 
 	if (e->idx.offset) {
 		duplicate_count_by_type[OBJ_BLOB]++;
@@ -1731,7 +1733,7 @@ static void dump_marks(void)
 	}
 }
 
-static void insert_object_entry(struct mark_set *s, struct object_id *oid, uintmax_t mark)
+static void insert_object_entry(struct mark_set **s, struct object_id *oid, uintmax_t mark)
 {
 	struct object_entry *e;
 	e = find_object(oid);
@@ -1748,12 +1750,12 @@ static void insert_object_entry(struct mark_set *s, struct object_id *oid, uintm
 	insert_mark(s, mark, e);
 }
 
-static void insert_oid_entry(struct mark_set *s, struct object_id *oid, uintmax_t mark)
+static void insert_oid_entry(struct mark_set **s, struct object_id *oid, uintmax_t mark)
 {
 	insert_mark(s, mark, xmemdupz(oid, sizeof(*oid)));
 }
 
-static void read_mark_file(struct mark_set *s, FILE *f, mark_set_inserter_t inserter)
+static void read_mark_file(struct mark_set **s, FILE *f, mark_set_inserter_t inserter)
 {
 	char line[512];
 	while (fgets(line, sizeof(line), f)) {
@@ -1786,7 +1788,7 @@ static void read_marks(void)
 		goto done; /* Marks file does not exist */
 	else
 		die_errno("cannot read '%s'", import_marks_file);
-	read_mark_file(marks, f, insert_object_entry);
+	read_mark_file(&marks, f, insert_object_entry);
 	fclose(f);
 done:
 	import_marks_file_done = 1;
@@ -3242,7 +3244,7 @@ static void parse_alias(void)
 		die(_("Expected 'to' command, got %s"), command_buf.buf);
 	e = find_object(&b.oid);
 	assert(e);
-	insert_mark(marks, next_mark, e);
+	insert_mark(&marks, next_mark, e);
 }
 
 static char* make_fast_import_path(const char *path)
@@ -3340,7 +3342,7 @@ static void option_rewrite_submodules(const char *arg, struct string_list *list)
 	fp = fopen(f, "r");
 	if (!fp)
 		die_errno("cannot read '%s'", f);
-	read_mark_file(ms, fp, insert_oid_entry);
+	read_mark_file(&ms, fp, insert_oid_entry);
 	fclose(fp);
 }
brian m. carlson June 5, 2020, 10:39 p.m. UTC | #3
On 2020-06-05 at 16:26:02, Junio C Hamano wrote:
> diff --git a/fast-import.c b/fast-import.c
> index 0dfa14dc8c..ed87d6e380 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -150,7 +150,7 @@ struct recent_command {
>  	char *buf;
>  };
>  
> -typedef void (*mark_set_inserter_t)(struct mark_set *s, struct object_id *oid, uintmax_t mark);
> +typedef void (*mark_set_inserter_t)(struct mark_set **s, struct object_id *oid, uintmax_t mark);
>  typedef void (*each_mark_fn_t)(uintmax_t mark, void *obj, void *cbp);
>  
>  /* Configured limits on output */
> @@ -534,13 +534,15 @@ static char *pool_strdup(const char *s)
>  	return r;
>  }
>  
> -static void insert_mark(struct mark_set *s, uintmax_t idnum, struct object_entry *oe)
> +static void insert_mark(struct mark_set **sp, uintmax_t idnum, struct object_entry *oe)
>  {
> +	struct mark_set *s = *sp;
> +
>  	while ((idnum >> s->shift) >= 1024) {
>  		s = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
> -		s->shift = marks->shift + 10;
> -		s->data.sets[0] = marks;
> -		marks = s;
> +		s->shift = (*sp)->shift + 10;
> +		s->data.sets[0] = (*sp);
> +		(*sp) = s;
>  	}

Yeah, this is much better.  I'm not sure how I missed converting that
block, but it definitely leaks heavily, and obviously we don't want to
update the actual marks variable in every case, since that defeats the
purpose of the patch.

We don't actually hit this case in any of the tests because we don't
have any tests that have 1024 marks in them.  I'm having trouble coming
up with a test even with a large number of marks because I don't think
we produce different behavior here; we just leak a lot of memory.

This does fix the reported issue, as I suspected.

Do you want to write this up into a patch with a commit message, or
should I?  If the latter, may I have your sign-off for it?
Junio C Hamano June 5, 2020, 10:53 p.m. UTC | #4
"brian m. carlson" <sandals@crustytoothpaste.net> writes:

> We don't actually hit this case in any of the tests because we don't
> have any tests that have 1024 marks in them.  I'm having trouble coming
> up with a test even with a large number of marks because I don't think
> we produce different behavior here; we just leak a lot of memory.
>
> This does fix the reported issue, as I suspected.
>
> Do you want to write this up into a patch with a commit message, or
> should I?  If the latter, may I have your sign-off for it?

Sure, anything I write for this project you can assume to be signed
off by me.  As I was just "fixing" what I thought was a botched
conversion, without really figuring out where it actually leads to
leaks, I'd need further work if I were to wrap it up with a sensible
log message, so thanks for volunteering ;-)

Signed-off-by: Junio C Hamano <gitster@pobox.com>

Patch
diff mbox series

diff --git a/fast-import.c b/fast-import.c
index b8b65a801c..b9ecd89699 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -493,9 +493,8 @@  static char *pool_strdup(const char *s)
 	return r;
 }
 
-static void insert_mark(uintmax_t idnum, struct object_entry *oe)
+static void insert_mark(struct mark_set *s, uintmax_t idnum, struct object_entry *oe)
 {
-	struct mark_set *s = marks;
 	while ((idnum >> s->shift) >= 1024) {
 		s = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
 		s->shift = marks->shift + 10;
@@ -919,7 +918,7 @@  static int store_object(
 
 	e = insert_object(&oid);
 	if (mark)
-		insert_mark(mark, e);
+		insert_mark(marks, mark, e);
 	if (e->idx.offset) {
 		duplicate_count_by_type[type]++;
 		return 1;
@@ -1117,7 +1116,7 @@  static void stream_blob(uintmax_t len, struct object_id *oidout, uintmax_t mark)
 	e = insert_object(&oid);
 
 	if (mark)
-		insert_mark(mark, e);
+		insert_mark(marks, mark, e);
 
 	if (e->idx.offset) {
 		duplicate_count_by_type[OBJ_BLOB]++;
@@ -1712,16 +1711,9 @@  static void dump_marks(void)
 	}
 }
 
-static void read_marks(void)
+static void read_mark_file(struct mark_set *s, FILE *f)
 {
 	char line[512];
-	FILE *f = fopen(import_marks_file, "r");
-	if (f)
-		;
-	else if (import_marks_file_ignore_missing && errno == ENOENT)
-		goto done; /* Marks file does not exist */
-	else
-		die_errno("cannot read '%s'", import_marks_file);
 	while (fgets(line, sizeof(line), f)) {
 		uintmax_t mark;
 		char *end;
@@ -1747,8 +1739,20 @@  static void read_marks(void)
 			e->pack_id = MAX_PACK_ID;
 			e->idx.offset = 1; /* just not zero! */
 		}
-		insert_mark(mark, e);
+		insert_mark(s, mark, e);
 	}
+}
+
+static void read_marks(void)
+{
+	FILE *f = fopen(import_marks_file, "r");
+	if (f)
+		;
+	else if (import_marks_file_ignore_missing && errno == ENOENT)
+		goto done; /* Marks file does not exist */
+	else
+		die_errno("cannot read '%s'", import_marks_file);
+	read_mark_file(marks, f);
 	fclose(f);
 done:
 	import_marks_file_done = 1;
@@ -3130,7 +3134,7 @@  static void parse_alias(void)
 		die(_("Expected 'to' command, got %s"), command_buf.buf);
 	e = find_object(&b.oid);
 	assert(e);
-	insert_mark(next_mark, e);
+	insert_mark(marks, next_mark, e);
 }
 
 static char* make_fast_import_path(const char *path)