diff mbox series

[RFC,17/21] parallel-checkout: avoid stat() calls in workers

Message ID 79de52b6952441e77d7276243b4b2ebe7ca16a1f.1597093021.git.matheus.bernardino@usp.br (mailing list archive)
State New, archived
Headers show
Series Parallel checkout | expand

Commit Message

Matheus Tavares Aug. 10, 2020, 9:33 p.m. UTC
The current parallel checkout implementation requires the workers to
stat() the path components of each entry before writing, to make sure
they are all real directories and not symlinks or something else. The
stat() info is cached, so this procedure should not be so bad
performance-wise. But the exact same check is already done by the main
process, before enqueueing the entries for parallel checkout, to remove
files that were in the way and create the leading dirs. The reason we
still need the second check is that, in case of path collisions, a
symlink X could be created after an entry x/f was enqueued, leading the
parallel worker to wrongly create the file at X/f. If we postpone the
symlinks' checkouts, though, we can avoid the need for these stat() calls
in the workers. Other types of path collisions are still possible, such
as a regular file X being written before the worker tries to write x/f.
But that's OK, since the parallel checkout machinery will check the
return value of open() to detect such collisions (which would not be
possible for the symlink case, as open() would succeed).

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
 entry.c             | 10 +++++++
 parallel-checkout.c | 71 ++++++++++++++++++++++++++++-----------------
 parallel-checkout.h |  8 +++++
 unpack-trees.c      |  4 ++-
 4 files changed, 65 insertions(+), 28 deletions(-)
diff mbox series

Patch

diff --git a/entry.c b/entry.c
index b6c808dffa..6208df23df 100644
--- a/entry.c
+++ b/entry.c
@@ -477,6 +477,16 @@  int checkout_entry_ca(struct cache_entry *ce, struct conv_attrs *ca,
 		return write_entry(ce, topath, ca, state, 1);
 	}
 
+	/*
+	 * If a regular file x/f is queued for parallel checkout and a symlink
+	 * X is created now, the worker could wrongly create the file at X/f
+	 * due to path collision. Thus, symlinks are only created after
+	 * parallel-eligible entries.
+	 */
+	if (parallel_checkout_status() == PC_ACCEPTING_ENTRIES &&
+	    S_ISLNK(ce->ce_mode))
+		enqueue_symlink_checkout(ce, nr_checkouts);
+
 	strbuf_reset(&path);
 	strbuf_add(&path, state->base_dir, state->base_dir_len);
 	strbuf_add(&path, ce->name, ce_namelen(ce));
diff --git a/parallel-checkout.c b/parallel-checkout.c
index 78bf2de5ea..fee93460c1 100644
--- a/parallel-checkout.c
+++ b/parallel-checkout.c
@@ -140,6 +140,44 @@  static void advance_progress_meter(void)
 	}
 }
 
+struct symlink_checkout_item {
+	struct cache_entry *ce;
+	int *nr_checkouts;
+};
+
+static struct symlink_checkout_item *symlink_queue = NULL;
+static size_t symlink_queue_nr = 0, symlink_queue_alloc = 0;
+
+void enqueue_symlink_checkout(struct cache_entry *ce, int *nr_checkouts)
+{
+	assert(S_ISLNK(ce->ce_mode));
+	ALLOC_GROW(symlink_queue, symlink_queue_nr + 1, symlink_queue_alloc);
+	symlink_queue[symlink_queue_nr].ce = ce;
+	symlink_queue[symlink_queue_nr].nr_checkouts = nr_checkouts;
+	symlink_queue_nr++;
+}
+
+size_t symlink_queue_size(void)
+{
+	return symlink_queue_nr;
+}
+
+static int checkout_symlink_queue(struct checkout *state)
+{
+	size_t i;
+	int ret = 0;
+
+	for (i = 0; i < symlink_queue_nr; ++i) {
+		struct symlink_checkout_item *sci = &symlink_queue[i];
+		ret |= checkout_entry(sci->ce, state, NULL, sci->nr_checkouts);
+		advance_progress_meter();
+	}
+
+	FREE_AND_NULL(symlink_queue);
+	symlink_queue_nr = symlink_queue_alloc = 0;
+	return ret;
+}
+
 static int handle_results(struct checkout *state)
 {
 	int ret = 0;
@@ -257,16 +295,6 @@  static int close_and_clear(int *fd)
 	return ret;
 }
 
-static int check_leading_dirs(const char *path, int len, int prefix_len)
-{
-	const char *slash = path + len;
-
-	while (slash > path && *slash != '/')
-		slash--;
-
-	return has_dirs_only_path(path, slash - path, prefix_len);
-}
-
 void write_checkout_item(struct checkout *state, struct checkout_item *ci)
 {
 	unsigned int mode = (ci->ce->ce_mode & 0100) ? 0777 : 0666;
@@ -276,27 +304,15 @@  void write_checkout_item(struct checkout *state, struct checkout_item *ci)
 	strbuf_add(&path, state->base_dir, state->base_dir_len);
 	strbuf_add(&path, ci->ce->name, ci->ce->ce_namelen);
 
-	/*
-	 * At this point, leading dirs should have already been created. But if
-	 * a symlink being checked out has collided with one of the dirs, due to
-	 * file system folding rules, it's possible that the dirs are no longer
-	 * present. So we have to check again, and report any path collisions.
-	 */
-	if (!check_leading_dirs(path.buf, path.len, state->base_dir_len)) {
-		ci->status = CI_RETRY;
-		goto out;
-	}
-
 	fd = open(path.buf, O_WRONLY | O_CREAT | O_EXCL, mode);
 
 	if (fd < 0) {
-		if (errno == EEXIST || errno == EISDIR) {
+		if (errno == EEXIST || errno == EISDIR || errno == ENOENT ||
+		    errno == ENOTDIR) {
 			/*
 			 * Errors which probably represent a path collision.
 			 * Suppress the error message and mark the ci to be
-			 * retried later, sequentially. ENOTDIR and ENOENT are
-			 * also interesting, but check_leading_dirs() should
-			 * have already caught these cases.
+			 * retried later, sequentially.
 			 */
 			ci->status = CI_RETRY;
 		} else {
@@ -523,7 +539,7 @@  static int run_checkout_sequentially(struct checkout *state)
 		if (ci->status != CI_RETRY)
 			advance_progress_meter();
 	}
-	return handle_results(state);
+	return handle_results(state) | checkout_symlink_queue(state);
 }
 
 int run_parallel_checkout(struct checkout *state, int num_workers, int threshold,
@@ -553,7 +569,8 @@  int run_parallel_checkout(struct checkout *state, int num_workers, int threshold
 	workers = setup_workers(state, num_workers);
 	gather_results_from_workers(workers, num_workers);
 	finish_workers(workers, num_workers);
-	ret = handle_results(state);
+	ret |= handle_results(state);
+	ret |= checkout_symlink_queue(state);
 
 done:
 	finish_parallel_checkout();
diff --git a/parallel-checkout.h b/parallel-checkout.h
index 2b81a5db6c..a4f7e5b7bd 100644
--- a/parallel-checkout.h
+++ b/parallel-checkout.h
@@ -29,6 +29,14 @@  void get_parallel_checkout_configs(int *num_workers, int *threshold);
 int enqueue_checkout(struct cache_entry *ce, struct conv_attrs *ca);
 size_t pc_queue_size(void);
 
+/*
+ * Enqueues a symlink to be checked out *sequentially* after the parallel
+ * checkout finishes. This is done to avoid path collisions with leading dirs,
+ * which could make parallel workers write a file to the wrong place.
+ */
+void enqueue_symlink_checkout(struct cache_entry *ce, int *nr_checkouts);
+size_t symlink_queue_size(void);
+
 /*
  * Write all the queued entries, returning 0 on success. If the number of
  * entries is below the specified threshold, the operation is performed
diff --git a/unpack-trees.c b/unpack-trees.c
index dcb40dc8fa..01928d3d65 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -472,6 +472,7 @@  static int check_updates(struct unpack_trees_options *o,
 
 		if (ce->ce_flags & CE_UPDATE) {
 			size_t last_pc_queue_size = pc_queue_size();
+			size_t last_symlink_queue_size = symlink_queue_size();
 
 			if (ce->ce_flags & CE_WT_REMOVE)
 				BUG("both update and delete flags are set on %s",
@@ -479,7 +480,8 @@  static int check_updates(struct unpack_trees_options *o,
 			ce->ce_flags &= ~CE_UPDATE;
 			errs |= checkout_entry(ce, &state, NULL, NULL);
 
-			if (last_pc_queue_size == pc_queue_size())
+			if (last_pc_queue_size == pc_queue_size() &&
+			    last_symlink_queue_size == symlink_queue_size())
 				display_progress(progress, ++cnt);
 		}
 	}