diff mbox series

[v3,11/25] tools/xenstore: drop use of tdb

Message ID 20230724110247.10520-12-jgross@suse.com (mailing list archive)
State Superseded
Headers show
Series tools/xenstore: drop TDB | expand

Commit Message

Jürgen Groß July 24, 2023, 11:02 a.m. UTC
Today all Xenstore nodes are stored in a TDB data base. This data base
has several disadvantages:

- It is using a fixed sized hash table, resulting in high memory
  overhead for small installations with only very few VMs, and a rather
  large performance hit for systems with lots of VMs due to many
  collisions.
  The hash table size today is 7919 entries. This means that e.g. in
  case of a simple desktop use case with 2 or 3 VMs probably far less
  than 10% of the entries will be used (assuming roughly 100 nodes per
  VM). OTOH a setup on a large server with 500 VMs would result in
  heavy conflicts in the hash list with 5-10 nodes per hash table entry.

- TDB is using a single large memory area for storing the nodes. It
  only ever increases this area and will never shrink it afterwards.
  This will result in more memory usage than necessary after a peak of
  Xenstore usage.

- Xenstore is only single-threaded, while TDB is designed to be fit
  for multi-threaded use cases, resulting in much higher code
  complexity than needed.

- Special use cases of Xenstore are not possible to implement with TDB
  in an effective way, while an implementation of a data base tailored
  for Xenstore could simplify some handling (e.g. transactions) a lot.

So drop using TDB and store the nodes directly in memory making them
easily accessible. Use a hash-based lookup mechanism for fast lookup
of nodes by their full path.

For now only replace TDB keeping the current access functions.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- add const (Julien Grall)
- use specific pointer type instead of void * (Julien Grall)
- add comment to db_fetch() (Julien Grall)
V3:
- use talloc_memdup() (Julien Grall)
---
 tools/xenstore/xenstored_core.c        | 156 ++++++++++---------------
 tools/xenstore/xenstored_core.h        |   5 +-
 tools/xenstore/xenstored_transaction.c |   1 -
 3 files changed, 62 insertions(+), 100 deletions(-)

Comments

Julien Grall July 27, 2023, 9:07 p.m. UTC | #1
Hi Juergen,

On 24/07/2023 12:02, Juergen Gross wrote:
> Today all Xenstore nodes are stored in a TDB data base. This data base
> has several disadvantages:
> 
> - It is using a fixed sized hash table, resulting in high memory
>    overhead for small installations with only very few VMs, and a rather
>    large performance hit for systems with lots of VMs due to many
>    collisions.
>    The hash table size today is 7919 entries. This means that e.g. in
>    case of a simple desktop use case with 2 or 3 VMs probably far less
>    than 10% of the entries will be used (assuming roughly 100 nodes per
>    VM). OTOH a setup on a large server with 500 VMs would result in
>    heavy conflicts in the hash list with 5-10 nodes per hash table entry.
> 
> - TDB is using a single large memory area for storing the nodes. It
>    only ever increases this area and will never shrink it afterwards.
>    This will result in more memory usage than necessary after a peak of
>    Xenstore usage.
> 
> - Xenstore is only single-threaded, while TDB is designed to be fit
>    for multi-threaded use cases, resulting in much higher code
>    complexity than needed.
> 
> - Special use cases of Xenstore are not possible to implement with TDB
>    in an effective way, while an implementation of a data base tailored
>    for Xenstore could simplify some handling (e.g. transactions) a lot.
> 
> So drop using TDB and store the nodes directly in memory making them
> easily accessible. Use a hash-based lookup mechanism for fast lookup
> of nodes by their full path.
> 
> For now only replace TDB keeping the current access functions.
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

Reviewed-by: Julien Grall <jgrall@amazon.com>

Cheers,
diff mbox series

Patch

diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index a12ede147c..2b94392fd4 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -53,7 +53,6 @@ 
 #include "xenstored_domain.h"
 #include "xenstored_control.h"
 #include "xenstored_lu.h"
-#include "tdb.h"
 
 #ifndef NO_SOCKETS
 #if defined(HAVE_SYSTEMD)
@@ -85,7 +84,7 @@  bool keep_orphans = false;
 static int reopen_log_pipe[2];
 static int reopen_log_pipe0_pollfd_idx = -1;
 char *tracefile = NULL;
-static TDB_CONTEXT *tdb_ctx = NULL;
+static struct hashtable *nodes;
 unsigned int trace_flags = TRACE_OBJ | TRACE_IO;
 
 static const char *sockmsg_string(enum xsd_sockmsg_type type);
@@ -556,32 +555,30 @@  static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout)
 	}
 }
 
-static void set_tdb_key(const char *name, TDB_DATA *key)
-{
-	/*
-	 * Dropping const is fine here, as the key will never be modified
-	 * by TDB.
-	 */
-	key->dptr = (char *)name;
-	key->dsize = strlen(name);
-}
-
 struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
 {
-	TDB_DATA key, data;
+	const struct xs_tdb_record_hdr *hdr;
+	struct xs_tdb_record_hdr *p;
 
-	set_tdb_key(db_name, &key);
-	data = tdb_fetch(tdb_ctx, key);
-	if (!data.dptr) {
-		errno = (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) ? ENOENT : EIO;
-		*size = 0;
-	} else {
-		*size = data.dsize;
-		trace_tdb("read %s size %zu\n", db_name,
-			  *size + strlen(db_name));
+	hdr = hashtable_search(nodes, db_name);
+	if (!hdr) {
+		errno = ENOENT;
+		return NULL;
 	}
 
-	return (struct xs_tdb_record_hdr *)data.dptr;
+	*size = sizeof(*hdr) + hdr->num_perms * sizeof(hdr->perms[0]) +
+		hdr->datalen + hdr->childlen;
+
+	/* Return a copy, avoiding a potential modification in the DB. */
+	p = talloc_memdup(NULL, hdr, *size);
+	if (!p) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	trace_tdb("read %s size %zu\n", db_name, *size + strlen(db_name));
+
+	return p;
 }
 
 static void get_acc_data(const char *name, struct node_account_data *acc)
@@ -621,12 +618,10 @@  int db_write(struct connection *conn, const char *db_name, void *data,
 	struct xs_tdb_record_hdr *hdr = data;
 	struct node_account_data old_acc = {};
 	unsigned int old_domid, new_domid;
+	size_t name_len = strlen(db_name);
+	const char *name;
 	int ret;
-	TDB_DATA key, dat;
 
-	set_tdb_key(db_name, &key);
-	dat.dptr = data;
-	dat.dsize = size;
 	if (!acc)
 		old_acc.memory = -1;
 	else
@@ -642,29 +637,36 @@  int db_write(struct connection *conn, const char *db_name, void *data,
 	 */
 	if (old_acc.memory)
 		domain_memory_add_nochk(conn, old_domid,
-					-old_acc.memory - key.dsize);
-	ret = domain_memory_add(conn, new_domid, size + key.dsize,
+					-old_acc.memory - name_len);
+	ret = domain_memory_add(conn, new_domid, size + name_len,
 				no_quota_check);
 	if (ret) {
 		/* Error path, so no quota check. */
 		if (old_acc.memory)
 			domain_memory_add_nochk(conn, old_domid,
-						old_acc.memory + key.dsize);
+						old_acc.memory + name_len);
 		return ret;
 	}
 
-	/* TDB should set errno, but doesn't even set ecode AFAICT. */
-	if (tdb_store(tdb_ctx, key, dat,
-		      (mode == NODE_CREATE) ? TDB_INSERT : TDB_MODIFY) != 0) {
-		domain_memory_add_nochk(conn, new_domid, -size - key.dsize);
+	if (mode == NODE_CREATE) {
+		/* db_name could be modified later, so allocate a copy. */
+		name = talloc_strdup(data, db_name);
+		ret = name ? hashtable_add(nodes, name, data) : ENOMEM;
+	} else
+		ret = hashtable_replace(nodes, db_name, data);
+
+	if (ret) {
+		/* Free data, as it isn't owned by hashtable now. */
+		talloc_free(data);
+		domain_memory_add_nochk(conn, new_domid, -size - name_len);
 		/* Error path, so no quota check. */
 		if (old_acc.memory)
 			domain_memory_add_nochk(conn, old_domid,
-						old_acc.memory + key.dsize);
-		errno = EIO;
+						old_acc.memory + name_len);
+		errno = ret;
 		return errno;
 	}
-	trace_tdb("store %s size %zu\n", db_name, size + key.dsize);
+	trace_tdb("store %s size %zu\n", db_name, size + name_len);
 
 	if (acc) {
 		/* Don't use new_domid, as it might be a transaction node. */
@@ -680,9 +682,6 @@  int db_delete(struct connection *conn, const char *name,
 {
 	struct node_account_data tmp_acc;
 	unsigned int domid;
-	TDB_DATA key;
-
-	set_tdb_key(name, &key);
 
 	if (!acc) {
 		acc = &tmp_acc;
@@ -691,15 +690,13 @@  int db_delete(struct connection *conn, const char *name,
 
 	get_acc_data(name, acc);
 
-	if (tdb_delete(tdb_ctx, key)) {
-		errno = EIO;
-		return errno;
-	}
+	hashtable_remove(nodes, name);
 	trace_tdb("delete %s\n", name);
 
 	if (acc->memory) {
 		domid = get_acc_domid(conn, name, acc->domid);
-		domain_memory_add_nochk(conn, domid, -acc->memory - key.dsize);
+		domain_memory_add_nochk(conn, domid,
+					-acc->memory - strlen(name));
 	}
 
 	return 0;
@@ -2354,43 +2351,29 @@  static void manual_node(const char *name, const char *child)
 	talloc_free(node);
 }
 
-static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...)
+static unsigned int hash_from_key_fn(const void *k)
 {
-	va_list ap;
-	char *s;
-	int saved_errno = errno;
+	const char *str = k;
+	unsigned int hash = 5381;
+	char c;
 
-	va_start(ap, fmt);
-	s = talloc_vasprintf(NULL, fmt, ap);
-	va_end(ap);
+	while ((c = *str++))
+		hash = ((hash << 5) + hash) + (unsigned int)c;
 
-	if (s) {
-		trace("TDB: %s\n", s);
-		syslog(LOG_ERR, "TDB: %s",  s);
-		if (verbose)
-			xprintf("TDB: %s", s);
-		talloc_free(s);
-	} else {
-		trace("talloc failure during logging\n");
-		syslog(LOG_ERR, "talloc failure during logging\n");
-	}
+	return hash;
+}
 
-	errno = saved_errno;
+static int keys_equal_fn(const void *key1, const void *key2)
+{
+	return 0 == strcmp(key1, key2);
 }
 
 void setup_structure(bool live_update)
 {
-	char *tdbname;
-
-	tdbname = talloc_strdup(talloc_autofree_context(), "/dev/mem");
-	if (!tdbname)
-		barf_perror("Could not create tdbname");
-
-	tdb_ctx = tdb_open_ex(tdbname, 7919, TDB_INTERNAL | TDB_NOLOCK,
-			      O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC,
-			      0640, &tdb_logger, NULL);
-	if (!tdb_ctx)
-		barf_perror("Could not create tdb file %s", tdbname);
+	nodes = create_hashtable(NULL, "nodes", hash_from_key_fn, keys_equal_fn,
+				 HASHTABLE_FREE_KEY | HASHTABLE_FREE_VALUE);
+	if (!nodes)
+		barf_perror("Could not create nodes hashtable");
 
 	if (live_update)
 		manual_node("/", NULL);
@@ -2404,24 +2387,6 @@  void setup_structure(bool live_update)
 	}
 }
 
-static unsigned int hash_from_key_fn(const void *k)
-{
-	const char *str = k;
-	unsigned int hash = 5381;
-	char c;
-
-	while ((c = *str++))
-		hash = ((hash << 5) + hash) + (unsigned int)c;
-
-	return hash;
-}
-
-
-static int keys_equal_fn(const void *key1, const void *key2)
-{
-	return 0 == strcmp(key1, key2);
-}
-
 int remember_string(struct hashtable *hash, const char *str)
 {
 	char *k = talloc_strdup(NULL, str);
@@ -2481,12 +2446,11 @@  static int check_store_enoent(const void *ctx, struct connection *conn,
 /**
  * Helper to clean_store below.
  */
-static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val,
-			void *private)
+static int clean_store_(const void *key, void *val, void *private)
 {
 	struct hashtable *reachable = private;
 	char *slash;
-	char * name = talloc_strndup(NULL, key.dptr, key.dsize);
+	char *name = talloc_strdup(NULL, key);
 
 	if (!name) {
 		log("clean_store: ENOMEM");
@@ -2516,7 +2480,7 @@  static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val,
  */
 static void clean_store(struct check_store_data *data)
 {
-	tdb_traverse(tdb_ctx, &clean_store_, data->reachable);
+	hashtable_iterate(nodes, clean_store_, data->reachable);
 	domain_check_acc(data->domains);
 }
 
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
index f5aa8d51a0..ce40c61f44 100644
--- a/tools/xenstore/xenstored_core.h
+++ b/tools/xenstore/xenstored_core.h
@@ -33,7 +33,6 @@ 
 #include "xenstore_lib.h"
 #include "xenstore_state.h"
 #include "list.h"
-#include "tdb.h"
 #include "hashtable.h"
 
 #ifndef O_CLOEXEC
@@ -237,7 +236,7 @@  static inline unsigned int get_node_owner(const struct node *node)
 	return node->perms.p[0].id;
 }
 
-/* Write a node to the tdb data base. */
+/* Write a node to the data base. */
 enum write_node_mode {
 	NODE_CREATE,
 	NODE_MODIFY
@@ -247,7 +246,7 @@  int write_node_raw(struct connection *conn, const char *db_name,
 		   struct node *node, enum write_node_mode mode,
 		   bool no_quota_check);
 
-/* Get a node from the tdb data base. */
+/* Get a node from the data base. */
 struct node *read_node(struct connection *conn, const void *ctx,
 		       const char *name);
 
diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c
index 1981d1d55d..378fe79763 100644
--- a/tools/xenstore/xenstored_transaction.c
+++ b/tools/xenstore/xenstored_transaction.c
@@ -397,7 +397,6 @@  static int finalize_transaction(struct connection *conn,
 				       ? NODE_CREATE : NODE_MODIFY;
 				*is_corrupt |= db_write(conn, i->node, hdr,
 							size, NULL, mode, true);
-				talloc_free(hdr);
 				if (db_delete(conn, i->trans_name, NULL))
 					*is_corrupt = true;
 			} else {