mbox series

[v9,0/6] submodule: parallelize diff

Message ID 20230302215237.1473444-1-calvinwan@google.com (mailing list archive)
Headers show
Series submodule: parallelize diff | expand

Message

Calvin Wan March 2, 2023, 9:52 p.m. UTC
Original cover letter for context:
https://lore.kernel.org/git/20221011232604.839941-1-calvinwan@google.com/

I appreciate all the reviewers that have stuck through this entire series!
Hoping this can be the final reroll as I believe I've addressed all feedback
and personally am happy with the state of the patches.

Changes from v8
 - renamed duplicate_output_fn to on_stderr_output_fn
 - renamed diff_change_helper() to record_file_diff() and added comments
 - reworded commit message for patch 5
 - removed the refactoring of match_stat_with_submodule()
 - inlined parse_status_porcelain_strbuf()
 - fixed stylistic nits and cleaned up unnecessary variables and logic

Calvin Wan (6):
  run-command: add on_stderr_output_fn to run_processes_parallel_opts
  submodule: rename strbuf variable
  submodule: move status parsing into function
  submodule: refactor is_submodule_modified()
  diff-lib: refactor out diff_change logic
  diff-lib: parallelize run_diff_files for submodules

 Documentation/config/submodule.txt |  12 ++
 diff-lib.c                         | 123 +++++++++++---
 run-command.c                      |  16 +-
 run-command.h                      |  25 +++
 submodule.c                        | 254 +++++++++++++++++++++++------
 submodule.h                        |   9 +
 t/helper/test-run-command.c        |  20 +++
 t/t0061-run-command.sh             |  39 +++++
 t/t4027-diff-submodule.sh          |  31 ++++
 t/t7506-status-submodule.sh        |  25 +++
 10 files changed, 478 insertions(+), 76 deletions(-)

Range-diff against v8:
1:  5d51250c67 ! 1:  49749ae3a5 run-command: add duplicate_output_fn to run_processes_parallel_opts
    @@ Metadata
     Author: Calvin Wan <calvinwan@google.com>
     
      ## Commit message ##
    -    run-command: add duplicate_output_fn to run_processes_parallel_opts
    +    run-command: add on_stderr_output_fn to run_processes_parallel_opts
     
      ## run-command.c ##
     @@ run-command.c: static void pp_init(struct parallel_processes *pp,
    @@ run-command.c: static void pp_init(struct parallel_processes *pp,
      		BUG("you need to specify a get_next_task function");
      
     +	if (opts->ungroup) {
    -+		if (opts->duplicate_output)
    -+			BUG("duplicate_output and ungroup are incompatible with each other");
    ++		if (opts->on_stderr_output)
    ++			BUG("on_stderr_output and ungroup are incompatible with each other");
     +	}
     +
      	CALLOC_ARRAY(pp->children, n);
    @@ run-command.c: static void pp_buffer_stderr(struct parallel_processes *pp,
     +			} else if (n < 0) {
      				if (errno != EAGAIN)
      					die_errno("read");
    -+			} else if (opts->duplicate_output) {
    -+				opts->duplicate_output(&pp->children[i].err,
    ++			} else if (opts->on_stderr_output) {
    ++				opts->on_stderr_output(&pp->children[i].err,
     +					pp->children[i].err.len - n,
     +					opts->data, pp->children[i].data);
     +			}
    @@ run-command.h: typedef int (*start_failure_fn)(struct strbuf *out,
      
     +/**
     + * This callback is called whenever output from a child process is buffered
    -+ * 
    ++ *
     + * See run_processes_parallel() below for a discussion of the "struct
     + * strbuf *out" parameter.
    -+ * 
    ++ *
     + * The offset refers to the number of bytes originally in "out" before
     + * the output from the child process was buffered. Therefore, the buffer
     + * range, "out + buf" to the end of "out", would contain the buffer of
    @@ run-command.h: typedef int (*start_failure_fn)(struct strbuf *out,
     + *
     + * This function is incompatible with "ungroup"
     + */
    -+typedef void (*duplicate_output_fn)(struct strbuf *out, size_t offset,
    ++typedef void (*on_stderr_output_fn)(struct strbuf *out, size_t offset,
     +				    void *pp_cb, void *pp_task_cb);
     +
      /**
    @@ run-command.h: struct run_process_parallel_opts
      	start_failure_fn start_failure;
      
     +	/**
    -+	 * duplicate_output: See duplicate_output_fn() above. Unless you need
    ++	 * on_stderr_output: See on_stderr_output_fn() above. Unless you need
     +	 * to capture output from child processes, leave this as NULL.
     +	 */
    -+	duplicate_output_fn duplicate_output;
    ++	on_stderr_output_fn on_stderr_output;
     +
      	/**
      	 * task_finished: See task_finished_fn() above. This can be
    @@ t/helper/test-run-command.c: static int no_job(struct child_process *cp,
      	return 0;
      }
      
    -+static void duplicate_output(struct strbuf *out,
    ++static void on_stderr_output(struct strbuf *out,
     +			size_t offset,
     +			void *pp_cb UNUSED,
     +			void *pp_task_cb UNUSED)
    @@ t/helper/test-run-command.c: static int no_job(struct child_process *cp,
     +
     +	string_list_split(&list, out->buf + offset, '\n', -1);
     +	for_each_string_list_item(item, &list)
    -+		fprintf(stderr, "duplicate_output: %s\n", item->string);
    ++		fprintf(stderr, "on_stderr_output: %s\n", item->string);
     +	string_list_clear(&list, 0);
     +}
     +
    @@ t/helper/test-run-command.c: int cmd__run_command(int argc, const char **argv)
      		opts.ungroup = 1;
      	}
      
    -+	if (!strcmp(argv[1], "--duplicate-output")) {
    ++	if (!strcmp(argv[1], "--on-stderr-output")) {
     +		argv += 1;
     +		argc -= 1;
    -+		opts.duplicate_output = duplicate_output;
    ++		opts.on_stderr_output = on_stderr_output;
     +	}
     +
      	jobs = atoi(argv[2]);
    @@ t/t0061-run-command.sh: test_expect_success 'run_command runs in parallel with m
      	test_cmp expect actual
      '
      
    -+test_expect_success 'run_command runs in parallel with more jobs available than tasks --duplicate-output' '
    -+	test-tool run-command --duplicate-output run-command-parallel 5 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
    ++test_expect_success 'run_command runs in parallel with more jobs available than tasks --on-stderr-output' '
    ++	test-tool run-command --on-stderr-output run-command-parallel 5 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
     +	test_must_be_empty out &&
    -+	test 4 = $(grep -c "duplicate_output: Hello" err) &&
    -+	test 4 = $(grep -c "duplicate_output: World" err) &&
    -+	sed "/duplicate_output/d" err >err1 &&
    ++	test 4 = $(grep -c "on_stderr_output: Hello" err) &&
    ++	test 4 = $(grep -c "on_stderr_output: World" err) &&
    ++	sed "/on_stderr_output/d" err >err1 &&
     +	test_cmp expect err1
     +'
     +
    @@ t/t0061-run-command.sh: test_expect_success 'run_command runs in parallel with a
      	test_cmp expect actual
      '
      
    -+test_expect_success 'run_command runs in parallel with as many jobs as tasks --duplicate-output' '
    -+	test-tool run-command --duplicate-output run-command-parallel 4 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
    ++test_expect_success 'run_command runs in parallel with as many jobs as tasks --on-stderr-output' '
    ++	test-tool run-command --on-stderr-output run-command-parallel 4 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
     +	test_must_be_empty out &&
    -+	test 4 = $(grep -c "duplicate_output: Hello" err) &&
    -+	test 4 = $(grep -c "duplicate_output: World" err) &&
    -+	sed "/duplicate_output/d" err >err1 &&
    ++	test 4 = $(grep -c "on_stderr_output: Hello" err) &&
    ++	test 4 = $(grep -c "on_stderr_output: World" err) &&
    ++	sed "/on_stderr_output/d" err >err1 &&
     +	test_cmp expect err1
     +'
     +
    @@ t/t0061-run-command.sh: test_expect_success 'run_command runs in parallel with m
      	test_cmp expect actual
      '
      
    -+test_expect_success 'run_command runs in parallel with more tasks than jobs available --duplicate-output' '
    -+	test-tool run-command --duplicate-output run-command-parallel 3 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
    ++test_expect_success 'run_command runs in parallel with more tasks than jobs available --on-stderr-output' '
    ++	test-tool run-command --on-stderr-output run-command-parallel 3 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
     +	test_must_be_empty out &&
    -+	test 4 = $(grep -c "duplicate_output: Hello" err) &&
    -+	test 4 = $(grep -c "duplicate_output: World" err) &&
    -+	sed "/duplicate_output/d" err >err1 &&
    ++	test 4 = $(grep -c "on_stderr_output: Hello" err) &&
    ++	test 4 = $(grep -c "on_stderr_output: World" err) &&
    ++	sed "/on_stderr_output/d" err >err1 &&
     +	test_cmp expect err1
     +'
     +
    @@ t/t0061-run-command.sh: test_expect_success 'run_command is asked to abort grace
      	test_cmp expect actual
      '
      
    -+test_expect_success 'run_command is asked to abort gracefully --duplicate-output' '
    -+	test-tool run-command --duplicate-output run-command-abort 3 false >out 2>err &&
    ++test_expect_success 'run_command is asked to abort gracefully --on-stderr-output' '
    ++	test-tool run-command --on-stderr-output run-command-abort 3 false >out 2>err &&
     +	test_must_be_empty out &&
     +	test_cmp expect err
     +'
    @@ t/t0061-run-command.sh: test_expect_success 'run_command outputs ' '
      	test_cmp expect actual
      '
      
    -+test_expect_success 'run_command outputs --duplicate-output' '
    -+	test-tool run-command --duplicate-output run-command-no-jobs 3 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
    ++test_expect_success 'run_command outputs --on-stderr-output' '
    ++	test-tool run-command --on-stderr-output run-command-no-jobs 3 sh -c "printf \"%s\n%s\n\" Hello World" >out 2>err &&
     +	test_must_be_empty out &&
     +	test_cmp expect err
     +'
2:  6ded5b6788 ! 2:  6c62e670f9 submodule: strbuf variable rename
    @@ Metadata
     Author: Calvin Wan <calvinwan@google.com>
     
      ## Commit message ##
    -    submodule: strbuf variable rename
    +    submodule: rename strbuf variable
     
      ## submodule.c ##
     @@ submodule.c: unsigned is_submodule_modified(const char *path, int ignore_untracked)
3:  0c71cea8cd = 3:  24e02f2a24 submodule: move status parsing into function
4:  5c8cc93f9f = 4:  86c1f734a0 submodule: refactor is_submodule_modified()
5:  6c2b62abc8 ! 5:  811a1fee55 diff-lib: refactor out diff_change logic
    @@ diff-lib.c: static int match_stat_with_submodule(struct diff_options *diffopt,
      	return changed;
      }
      
    -+static int diff_change_helper(struct diff_options *options,
    -+	      unsigned newmode, unsigned dirty_submodule,
    -+	      int changed, struct index_state *istate,
    -+	      struct cache_entry *ce)
    ++/**
    ++ * Records diff_change if there is a change in the entry from run_diff_files.
    ++ * If there is no change, then the cache entry is marked CE_UPTODATE and
    ++ * CE_FSMONITOR_VALID. If there is no change and the find_copies_harder flag
    ++ * is not set, then the function returns early.
    ++ */
    ++static void record_file_diff(struct diff_options *options, unsigned newmode,
    ++			     unsigned dirty_submodule, int changed,
    ++			     struct index_state *istate,
    ++			     struct cache_entry *ce)
     +{
     +	unsigned int oldmode;
     +	const struct object_id *old_oid, *new_oid;
    @@ diff-lib.c: static int match_stat_with_submodule(struct diff_options *diffopt,
     +		ce_mark_uptodate(ce);
     +		mark_fsmonitor_valid(istate, ce);
     +		if (!options->flags.find_copies_harder)
    -+			return 1;
    ++			return;
     +	}
     +	oldmode = ce->ce_mode;
     +	old_oid = &ce->oid;
     +	new_oid = changed ? null_oid() : &ce->oid;
    -+	diff_change(options, oldmode, newmode,
    -+			old_oid, new_oid,
    -+			!is_null_oid(old_oid),
    -+			!is_null_oid(new_oid),
    -+			ce->name, 0, dirty_submodule);
    -+	return 0;
    ++	diff_change(options, oldmode, newmode, old_oid, new_oid,
    ++		    !is_null_oid(old_oid), !is_null_oid(new_oid),
    ++		    ce->name, 0, dirty_submodule);
     +}
     +
      int run_diff_files(struct rev_info *revs, unsigned int option)
    @@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
     -			    !is_null_oid(new_oid),
     -			    ce->name, 0, dirty_submodule);
     -
    -+		if (diff_change_helper(&revs->diffopt, newmode, dirty_submodule,
    -+				       changed, istate, ce))
    -+			continue;
    ++		record_file_diff(&revs->diffopt, newmode, dirty_submodule,
    ++				 changed, istate, ce);
      	}
      	diffcore_std(&revs->diffopt);
      	diff_flush(&revs->diffopt);
6:  bb25dadbe5 ! 6:  17010fc179 diff-lib: parallelize run_diff_files for submodules
    @@ diff-lib.c: static int check_removed(const struct index_state *istate, const str
     +				     unsigned *ignore_untracked)
      {
      	int changed = ie_match_stat(diffopt->repo->index, ce, st, ce_option);
    --	if (S_ISGITLINK(ce->ce_mode)) {
    --		struct diff_flags orig_flags = diffopt->flags;
    --		if (!diffopt->flags.override_submodule_config)
    --			set_diffopt_flags_from_submodule_config(diffopt, ce->name);
    ++	int defer = 0;
    ++
    + 	if (S_ISGITLINK(ce->ce_mode)) {
    + 		struct diff_flags orig_flags = diffopt->flags;
    + 		if (!diffopt->flags.override_submodule_config)
    + 			set_diffopt_flags_from_submodule_config(diffopt, ce->name);
     -		if (diffopt->flags.ignore_submodules)
    --			changed = 0;
    ++		if (diffopt->flags.ignore_submodules) {
    + 			changed = 0;
     -		else if (!diffopt->flags.ignore_dirty_submodules &&
     -			 (!changed || diffopt->flags.dirty_submodules))
    -+	struct diff_flags orig_flags;
    -+	int defer = 0;
    -+
    -+	if (!S_ISGITLINK(ce->ce_mode))
    -+		goto ret;
    -+
    -+	orig_flags = diffopt->flags;
    -+	if (!diffopt->flags.override_submodule_config)
    -+		set_diffopt_flags_from_submodule_config(diffopt, ce->name);
    -+	if (diffopt->flags.ignore_submodules) {
    -+		changed = 0;
    -+		goto cleanup;
    -+	}
    -+	if (!diffopt->flags.ignore_dirty_submodules &&
    -+	    (!changed || diffopt->flags.dirty_submodules)) {
    -+		if (defer_submodule_status && *defer_submodule_status) {
    -+			defer = 1;
    -+			*ignore_untracked = diffopt->flags.ignore_untracked_in_submodules;
    -+		} else {
    - 			*dirty_submodule = is_submodule_modified(ce->name,
    +-			*dirty_submodule = is_submodule_modified(ce->name,
     -								 diffopt->flags.ignore_untracked_in_submodules);
    --		diffopt->flags = orig_flags;
    ++		} else if (!diffopt->flags.ignore_dirty_submodules &&
    ++			   (!changed || diffopt->flags.dirty_submodules)) {
    ++			if (defer_submodule_status && *defer_submodule_status) {
    ++				defer = 1;
    ++				*ignore_untracked = diffopt->flags.ignore_untracked_in_submodules;
    ++			} else {
    ++				*dirty_submodule = is_submodule_modified(ce->name,
     +					 diffopt->flags.ignore_untracked_in_submodules);
    ++			}
     +		}
    + 		diffopt->flags = orig_flags;
      	}
    -+cleanup:
    -+	diffopt->flags = orig_flags;
    -+ret:
    ++
     +	if (defer_submodule_status)
     +		*defer_submodule_status = defer;
      	return changed;
    @@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
      
      	diff_set_mnemonic_prefix(&revs->diffopt, "i/", "w/");
      
    +@@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
    + 		unsigned int newmode;
    + 		struct cache_entry *ce = istate->cache[i];
    + 		int changed;
    +-		unsigned dirty_submodule = 0;
    ++		int defer_submodule_status = 1;
    + 
    + 		if (diff_can_quit_early(&revs->diffopt))
    + 			break;
     @@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
      			newmode = ce->ce_mode;
      		} else {
      			struct stat st;
     +			unsigned ignore_untracked = 0;
    -+			int defer_submodule_status = 1;
      
      			changed = check_removed(istate, ce, &st);
      			if (changed) {
    @@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
      
      			changed = match_stat_with_submodule(&revs->diffopt, ce, &st,
     -							    ce_option, &dirty_submodule);
    -+							    ce_option, &dirty_submodule,
    ++							    ce_option, NULL,
     +							    &defer_submodule_status,
     +							    &ignore_untracked);
      			newmode = ce_mode_from_stat(ce, st.st_mode);
    @@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
     +			}
      		}
      
    - 		if (diff_change_helper(&revs->diffopt, newmode, dirty_submodule,
    - 				       changed, istate, ce))
    - 			continue;
    - 	}
    +-		record_file_diff(&revs->diffopt, newmode, dirty_submodule,
    +-				 changed, istate, ce);
    ++		if (!defer_submodule_status)
    ++			record_file_diff(&revs->diffopt, newmode, 0,
    ++					   changed,istate, ce);
    ++	}
     +	if (submodules.nr) {
     +		unsigned long parallel_jobs;
     +		struct string_list_item *item;
    @@ diff-lib.c: int run_diff_files(struct rev_info *revs, unsigned int option)
     +		for_each_string_list_item(item, &submodules) {
     +			struct submodule_status_util *util = item->util;
     +
    -+			if (diff_change_helper(&revs->diffopt, util->newmode,
    -+				       util->dirty_submodule, util->changed,
    -+				       istate, util->ce))
    -+				continue;
    ++			record_file_diff(&revs->diffopt, util->newmode,
    ++					 util->dirty_submodule, util->changed,
    ++					 istate, util->ce);
     +		}
    -+	}
    + 	}
     +	string_list_clear(&submodules, 1);
      	diffcore_std(&revs->diffopt);
      	diff_flush(&revs->diffopt);
    @@ submodule.c: struct fetch_task {
      /**
       * When a submodule is not defined in .gitmodules, we cannot access it
       * via the regular submodule-config. Create a fake submodule, which we can
    -@@ submodule.c: static int parse_status_porcelain(char *str, size_t len,
    - 	return 0;
    - }
    - 
    -+static void parse_status_porcelain_strbuf(struct strbuf *buf,
    -+				   unsigned *dirty_submodule,
    -+				   int ignore_untracked)
    -+{
    -+	struct string_list list = STRING_LIST_INIT_DUP;
    -+	struct string_list_item *item;
    -+
    -+	string_list_split(&list, buf->buf, '\n', -1);
    -+
    -+	for_each_string_list_item(item, &list) {
    -+		if (parse_status_porcelain(item->string,
    -+					   strlen(item->string),
    -+					   dirty_submodule,
    -+					   ignore_untracked))
    -+			break;
    -+	}
    -+	string_list_clear(&list, 0);
    -+}
    -+
    - unsigned is_submodule_modified(const char *path, int ignore_untracked)
    - {
    - 	struct child_process cp = CHILD_PROCESS_INIT;
     @@ submodule.c: unsigned is_submodule_modified(const char *path, int ignore_untracked)
      	return dirty_submodule;
      }
    @@ submodule.c: unsigned is_submodule_modified(const char *path, int ignore_untrack
     +	return 0;
     +}
     +
    -+static void status_duplicate_output(struct strbuf *out,
    ++static void status_on_stderr_output(struct strbuf *out,
     +				    size_t offset,
     +				    void *cb, void *task_cb)
     +{
    @@ submodule.c: unsigned is_submodule_modified(const char *path, int ignore_untrack
     +	struct string_list_item *it =
     +		string_list_lookup(sps->submodule_names, task->path);
     +	struct submodule_status_util *util = it->util;
    ++	struct string_list list = STRING_LIST_INIT_DUP;
    ++	struct string_list_item *item;
     +
     +	if (retvalue) {
     +		sps->result = 1;
     +		strbuf_addf(err, _(STATUS_PORCELAIN_FAIL_ERROR), task->path);
     +	}
     +
    -+	parse_status_porcelain_strbuf(&task->out,
    -+			      &util->dirty_submodule,
    -+			      util->ignore_untracked);
    -+
    ++	string_list_split(&list, task->out.buf, '\n', -1);
    ++	for_each_string_list_item(item, &list) {
    ++		if (parse_status_porcelain(item->string,
    ++					   strlen(item->string),
    ++					   &util->dirty_submodule,
    ++					   util->ignore_untracked))
    ++			break;
    ++	}
    ++	string_list_clear(&list, 0);
     +	strbuf_release(&task->out);
     +	free(task);
     +
    @@ submodule.c: unsigned is_submodule_modified(const char *path, int ignore_untrack
     +
     +		.get_next_task = get_next_submodule_status,
     +		.start_failure = status_start_failure,
    -+		.duplicate_output = status_duplicate_output,
    ++		.on_stderr_output = status_on_stderr_output,
     +		.task_finished = status_finish,
     +		.data = &sps,
     +	};