diff mbox series

[RFC,1/4] symlinks: add and export threaded rmdir variants

Message ID 20231203133911.41594-2-hanyoung@protonmail.com (mailing list archive)
State New, archived
Headers show
Series add parallel unlink | expand

Commit Message

Han Young Dec. 3, 2023, 1:39 p.m. UTC
From: Han Young <hanyang.tony@bytedance.com>

Add and export threaded variants of the directory-removal functions; these will be used by parallel unlink.
---
Most of the code in threaded_schedule_dir_for_removal and threaded_do_remove_scheduled_dirs is duplicated.
We can remove the duplication either by breaking the functions into smaller ones or by passing the cache as a parameter.
If we choose to pass the cache explicitly, the default caches in both entry.c and symlinks.c would probably need to be moved to
unpack-trees.c. I'm not satisfied with using a mutex-guarded hash set to ensure every directory is removed, but I can't come
up with a better way.

 symlinks.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 symlinks.h |   6 +++
 2 files changed, 123 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/symlinks.c b/symlinks.c
index b29e340c2d..c8cb0a7eb7 100644
--- a/symlinks.c
+++ b/symlinks.c
@@ -2,9 +2,9 @@ 
 #include "gettext.h"
 #include "setup.h"
 #include "symlinks.h"
+#include "hashmap.h"
+#include "pthread.h"
 
-static int threaded_check_leading_path(struct cache_def *cache, const char *name,
-				       int len, int warn_on_lstat_err);
 static int threaded_has_dirs_only_path(struct cache_def *cache, const char *name, int len, int prefix_len);
 
 /*
@@ -229,7 +229,7 @@  int check_leading_path(const char *name, int len, int warn_on_lstat_err)
  * directory, or if we were unable to lstat() it. If warn_on_lstat_err is true,
  * also emit a warning for this error.
  */
-static int threaded_check_leading_path(struct cache_def *cache, const char *name,
+int threaded_check_leading_path(struct cache_def *cache, const char *name,
 				       int len, int warn_on_lstat_err)
 {
 	int flags;
@@ -277,6 +277,51 @@  static int threaded_has_dirs_only_path(struct cache_def *cache, const char *name
 }
 
 static struct strbuf removal = STRBUF_INIT;
+/* One member of dir_set: a directory path and its length (NUL not counted). */
+struct rmdir_hash_entry {
+	struct hashmap_entry hash;
+	char *dir;
+	size_t dirlen;
+};
+
+/*
+ * Directories for which rmdir() failed in
+ * threaded_do_remove_scheduled_dirs(); they are walked again by
+ * threaded_remove_scheduled_dirs_clean_up().
+ */
+static struct hashmap dir_set;
+
+/*
+ * Held by add_dir_to_rmdir_hash() around its lookup/insert on dir_set.
+ * NOTE(review): this has external linkage but is not declared in
+ * symlinks.h -- confirm whether it should be static.
+ */
+pthread_mutex_t dir_set_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * rmdir_hashmap comparison function.
+ *
+ * Returns 0 when the directory stored in 'eptr' equals the key, non-zero
+ * otherwise. The key is 'keydata' when the caller supplies one (as
+ * add_dir_to_rmdir_hash() does through hashmap_get_from_hash()); when
+ * 'keydata' is NULL the key is read from 'entry_or_key' instead, so a
+ * comparison made without keydata never hands NULL to strcmp().
+ */
+static int rmdir_hash_entry_cmp(const void *cmp_data UNUSED,
+			       const struct hashmap_entry *eptr,
+			       const struct hashmap_entry *entry_or_key,
+			       const void *keydata)
+{
+	const struct rmdir_hash_entry *a, *b;
+
+	a = container_of(eptr, const struct rmdir_hash_entry, hash);
+	b = container_of(entry_or_key, const struct rmdir_hash_entry, hash);
+	/* 'b' is only dereferenced when no keydata was given */
+	return strcmp(a->dir, keydata ? keydata : b->dir);
+}
+
+/*
+ * Initialize dir_set with rmdir_hash_entry_cmp() as its comparison
+ * function and an initial size of 0.
+ *
+ * The comparison function ignores its cmp_data argument, so NULL is
+ * passed; handing over the address of a local variable would leave a
+ * dangling pointer in the map once this function returns.
+ */
+void threaded_init_remove_scheduled_dirs(void)
+{
+	hashmap_init(&dir_set, rmdir_hash_entry_cmp, NULL, 0);
+}
+
+/*
+ * Record 'dir' in dir_set unless an entry with the same string is already
+ * present.
+ *
+ * 'dir' must be NUL-terminated and 'dirlen' is its length without the NUL.
+ * The set stores its own copy of the string. dir_set_mutex is held for the
+ * lookup and the insertion.
+ */
+static void add_dir_to_rmdir_hash(const char *dir, size_t dirlen)
+{
+	struct rmdir_hash_entry *e;
+	unsigned int hash = strhash(dir);
+
+	pthread_mutex_lock(&dir_set_mutex);
+	if (!hashmap_get_from_hash(&dir_set, hash, dir)) {
+		e = xmalloc(sizeof(*e));
+		hashmap_entry_init(&e->hash, hash);
+		e->dir = xmemdupz(dir, dirlen);
+		e->dirlen = dirlen;
+		hashmap_put_entry(&dir_set, e, hash /* member name */);
+	}
+	pthread_mutex_unlock(&dir_set_mutex);
+}
 
 static void do_remove_scheduled_dirs(int new_len)
 {
@@ -294,6 +339,26 @@  static void do_remove_scheduled_dirs(int new_len)
 	removal.len = new_len;
 }
 
+
+/*
+ * Walk 'removal' upwards one path component at a time, calling rmdir() on
+ * each prefix, until its length is no longer greater than 'new_len'.
+ *
+ * The walk stops early when the prefix equals startup_info->original_cwd
+ * or when rmdir() fails; in the latter case the prefix is recorded with
+ * add_dir_to_rmdir_hash(). In every case removal->len ends up as 'new_len'.
+ *
+ * Note that the 'removal' parameter shadows the file-scope strbuf of the
+ * same name; only the parameter is touched here.
+ */
+static void threaded_do_remove_scheduled_dirs(int new_len, struct strbuf *removal)
+{
+	for (;;) {
+		if (removal->len <= new_len)
+			break;
+
+		removal->buf[removal->len] = '\0';
+
+		/* never remove the original working directory */
+		if (startup_info->original_cwd &&
+		    !strcmp(removal->buf, startup_info->original_cwd))
+			break;
+
+		if (rmdir(removal->buf) != 0) {
+			add_dir_to_rmdir_hash(removal->buf, removal->len);
+			break;
+		}
+
+		/* drop the last path component, but never go below new_len */
+		removal->len--;
+		while (removal->len > new_len &&
+		       removal->buf[removal->len] != '/')
+			removal->len--;
+	}
+	removal->len = new_len;
+}
+
 void schedule_dir_for_removal(const char *name, int len)
 {
 	int match_len, last_slash, i, previous_slash;
@@ -327,11 +392,60 @@  void schedule_dir_for_removal(const char *name, int len)
 		strbuf_add(&removal, &name[match_len], last_slash - match_len);
 }
 
+/*
+ * Variant of schedule_dir_for_removal() that keeps the pending directory
+ * path in the caller-supplied 'removal_cache' instead of the file-scope
+ * 'removal' buffer.
+ *
+ * 'name' / 'len' give the path being scheduled. Only the part of 'name'
+ * up to its last '/' is ever stored in 'removal_cache'.
+ */
+void threaded_schedule_dir_for_removal(const char *name, int len, struct strbuf *removal_cache)
+{
+	int match_len, last_slash, pos, previous_slash;
+
+	/* Do not remove the current working directory */
+	if (startup_info->original_cwd &&
+	    !strcmp(name, startup_info->original_cwd))
+		return;
+
+	match_len = longest_path_match(name, len, removal_cache->buf,
+				       removal_cache->len, &previous_slash);
+
+	/* Find last slash inside 'name', scanning from the matched prefix */
+	last_slash = match_len;
+	for (pos = match_len; pos < len; pos++)
+		if (name[pos] == '/')
+			last_slash = pos;
+
+	/* No new path component beyond the matched prefix: nothing to do */
+	if (match_len >= last_slash)
+		return;
+
+	/*
+	 * We are about to go down the directory tree. If the cache holds
+	 * more than the matched prefix, first go upwards so that possibly
+	 * empty directories are removed on the way.
+	 */
+	if (match_len < removal_cache->len)
+		threaded_do_remove_scheduled_dirs(match_len, removal_cache);
+
+	/* Going deeper: only the new path components need to be saved */
+	strbuf_add(removal_cache, &name[match_len], last_slash - match_len);
+}
+
 void remove_scheduled_dirs(void)
 {
 	do_remove_scheduled_dirs(0);
 }
 
+/*
+ * Retry the removal of every directory recorded in dir_set, then release
+ * the recorded entries.
+ *
+ * schedule_dir_for_removal() only keeps the part of its 'name' argument
+ * that precedes the last '/', so each recorded directory is passed with a
+ * trailing '/' appended; otherwise only its parent would be scheduled and
+ * the directory itself would never be retried.
+ *
+ * This function does not take dir_set_mutex.
+ * NOTE(review): presumably it is called once no other thread is adding
+ * entries -- confirm against the caller.
+ */
+void threaded_remove_scheduled_dirs_clean_up(void)
+{
+	struct hashmap_iter iter;
+	const struct rmdir_hash_entry *entry;
+	struct strbuf path = STRBUF_INIT;
+
+	hashmap_for_each_entry(&dir_set, &iter, entry, hash /* member name */) {
+		strbuf_reset(&path);
+		strbuf_add(&path, entry->dir, entry->dirlen);
+		strbuf_addch(&path, '/');
+		schedule_dir_for_removal(path.buf, path.len);
+		/* the copy made by add_dir_to_rmdir_hash() is no longer needed */
+		free(entry->dir);
+	}
+	remove_scheduled_dirs();
+	strbuf_release(&path);
+
+	/* free the entries but keep the map initialized for later use */
+	hashmap_partial_clear_and_free(&dir_set, struct rmdir_hash_entry, hash);
+}
+
+/*
+ * Process everything still pending in 'removal_cache' by calling
+ * threaded_do_remove_scheduled_dirs() with a new length of 0.
+ */
+void threaded_remove_scheduled_dirs(struct strbuf *removal_cache)
+{
+	threaded_do_remove_scheduled_dirs(0, removal_cache);
+}
+
 void invalidate_lstat_cache(void)
 {
 	reset_lstat_cache(&default_cache);
diff --git a/symlinks.h b/symlinks.h
index 7ae3d5b856..7898eae941 100644
--- a/symlinks.h
+++ b/symlinks.h
@@ -20,9 +20,15 @@  static inline void cache_def_clear(struct cache_def *cache)
 int has_symlink_leading_path(const char *name, int len);
 int threaded_has_symlink_leading_path(struct cache_def *, const char *, int);
 int check_leading_path(const char *name, int len, int warn_on_lstat_err);
+/* Like check_leading_path(), but with the cache passed in explicitly. */
+int threaded_check_leading_path(struct cache_def *cache, const char *name,
+				       int len, int warn_on_lstat_err);
 int has_dirs_only_path(const char *name, int len, int prefix_len);
 void invalidate_lstat_cache(void);
 void schedule_dir_for_removal(const char *name, int len);
+/* Like schedule_dir_for_removal(), but the pending path is kept in removal_cache. */
+void threaded_schedule_dir_for_removal(const char *name, int len, struct strbuf *removal_cache);
 void remove_scheduled_dirs(void);
+/* Like remove_scheduled_dirs(), but operates on the caller-supplied removal_cache. */
+void threaded_remove_scheduled_dirs(struct strbuf *removal_cache);
+/* Initialize the set that records directories whose rmdir() failed. */
+void threaded_init_remove_scheduled_dirs(void);
+/*
+ * Retry removal of the recorded directories through
+ * schedule_dir_for_removal() and remove_scheduled_dirs().
+ */
+void threaded_remove_scheduled_dirs_clean_up(void);
 
 #endif /* SYMLINKS_H */