diff mbox series

[v3,17/25] tools/xenstore: rework struct xs_tdb_record_hdr

Message ID 20230724110247.10520-18-jgross@suse.com (mailing list archive)
State Superseded
Headers show
Series tools/xenstore: drop TDB | expand

Commit Message

Jürgen Groß July 24, 2023, 11:02 a.m. UTC
Struct xs_tdb_record_hdr is used for nodes stored in the data base.
When working on a node, struct node is being used, which is including
the same information as struct xs_tdb_record_hdr, but in a different
format. Rework struct xs_tdb_record_hdr in order to prepare including
it in struct node.

Do the following modifications:

- move its definition to xenstored_core.h, as the reason to put it into
  utils.h are no longer existing

- rename it to struct node_hdr, as the "tdb" in its name has only
  historical reasons

- replace the empty permission array at the end with a comment about
  the layout of data in the data base (concatenation of header,
  permissions, node contents, and children list)

- use narrower types for num_perms and datalen, as those are naturally
  limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
  in theory basically unlimited)

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- new patch
---
 tools/xenstore/utils.h                 |  9 -------
 tools/xenstore/xenstored_core.c        | 35 +++++++++++++++-----------
 tools/xenstore/xenstored_core.h        | 20 ++++++++++++++-
 tools/xenstore/xenstored_transaction.c |  6 ++---
 4 files changed, 42 insertions(+), 28 deletions(-)

Comments

Julien Grall July 27, 2023, 9:53 p.m. UTC | #1
Hi Juergen,

On 24/07/2023 12:02, Juergen Gross wrote:
> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
> When working on a node, struct node is being used, which is including
> the same information as struct xs_tdb_record_hdr, but in a different
> format. Rework struct xs_tdb_record_hdr in order to prepare including
> it in struct node.
> 
> Do the following modifications:
> 
> - move its definition to xenstored_core.h, as the reason to put it into
>    utils.h are no longer existing
> 
> - rename it to struct node_hdr, as the "tdb" in its name has only
>    historical reasons
> 
> - replace the empty permission array at the end with a comment about
>    the layout of data in the data base (concatenation of header,
>    permissions, node contents, and children list)
> 
> - use narrower types for num_perms and datalen, as those are naturally
>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
>    in theory basically unlimited)

The assumption is XENSTORE_PAYLOAD_MAX will never change and/or always 
apply for all the connection. I am aware of at least one downstream use 
where this is not the case.

I am happy to use narrower types, but I would at least like some checks 
in Xenstore to ensure the limits are not reached.

> 
> Signed-off-by: Juergen Gross <jgross@suse.com>
> ---
> V2:
> - new patch
> ---
>   tools/xenstore/utils.h                 |  9 -------
>   tools/xenstore/xenstored_core.c        | 35 +++++++++++++++-----------
>   tools/xenstore/xenstored_core.h        | 20 ++++++++++++++-
>   tools/xenstore/xenstored_transaction.c |  6 ++---
>   4 files changed, 42 insertions(+), 28 deletions(-)
> 
> diff --git a/tools/xenstore/utils.h b/tools/xenstore/utils.h
> index 028ecb9d7a..405d662ea2 100644
> --- a/tools/xenstore/utils.h
> +++ b/tools/xenstore/utils.h
> @@ -9,15 +9,6 @@
>   
>   #include "xenstore_lib.h"
>   
> -/* Header of the node record in tdb. */
> -struct xs_tdb_record_hdr {
> -	uint64_t generation;
> -	uint32_t num_perms;
> -	uint32_t datalen;
> -	uint32_t childlen;
> -	struct xs_permissions perms[0];
> -};
> -
>   /* Is A == B ? */
>   #define streq(a,b) (strcmp((a),(b)) == 0)
>   
> diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
> index 1f5f118f1c..86b7c9bf36 100644
> --- a/tools/xenstore/xenstored_core.c
> +++ b/tools/xenstore/xenstored_core.c
> @@ -555,9 +555,9 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout)
>   	}
>   }
>   
> -const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
> +const struct node_hdr *db_fetch(const char *db_name, size_t *size)
>   {
> -	struct xs_tdb_record_hdr *hdr;
> +	struct node_hdr *hdr;
>   
>   	hdr = hashtable_search(nodes, db_name);
>   	if (!hdr) {
> @@ -565,7 +565,7 @@ const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
>   		return NULL;
>   	}
>   
> -	*size = sizeof(*hdr) + hdr->num_perms * sizeof(hdr->perms[0]) +
> +	*size = sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) +
>   		hdr->datalen + hdr->childlen;
>   
>   	trace_tdb("read %s size %zu\n", db_name, *size + strlen(db_name));
> @@ -573,10 +573,15 @@ const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
>   	return hdr;
>   }
>   
> +static struct xs_permissions *perms_from_node_hdr(const struct node_hdr *hdr)

I don't much like the casting-away the const here. Looking at the code, 
most of the users seems to not modify the value. So how about return 
const here and open-coding or casting-away the const in the caller (with 
a proper explanation)?

> +{
> +	return (struct xs_permissions *)(hdr + 1);
> +}
> +
>   static void get_acc_data(const char *name, struct node_account_data *acc)
>   {
>   	size_t size;
> -	const struct xs_tdb_record_hdr *hdr;
> +	const struct node_hdr *hdr;
>   
>   	if (acc->memory < 0) {
>   		hdr = db_fetch(name, &size);
> @@ -585,7 +590,7 @@ static void get_acc_data(const char *name, struct node_account_data *acc)
>   			acc->memory = 0;
>   		} else {
>   			acc->memory = size;
> -			acc->domid = hdr->perms[0].id;
> +			acc->domid = perms_from_node_hdr(hdr)->id;
>   		}
>   	}
>   }
> @@ -606,7 +611,7 @@ int db_write(struct connection *conn, const char *db_name, const void *data,
>   	     size_t size, struct node_account_data *acc,
>   	     enum write_node_mode mode, bool no_quota_check)
>   {
> -	const struct xs_tdb_record_hdr *hdr = data;
> +	const struct node_hdr *hdr = data;
>   	struct node_account_data old_acc = {};
>   	unsigned int old_domid, new_domid;
>   	size_t name_len = strlen(db_name);
> @@ -620,7 +625,7 @@ int db_write(struct connection *conn, const char *db_name, const void *data,
>   
>   	get_acc_data(db_name, &old_acc);
>   	old_domid = get_acc_domid(conn, db_name, old_acc.domid);
> -	new_domid = get_acc_domid(conn, db_name, hdr->perms[0].id);
> +	new_domid = get_acc_domid(conn, db_name, perms_from_node_hdr(hdr)->id);
>   
>   	/*
>   	 * Don't check for ENOENT, as we want to be able to switch orphaned
> @@ -661,7 +666,7 @@ int db_write(struct connection *conn, const char *db_name, const void *data,
>   
>   	if (acc) {
>   		/* Don't use new_domid, as it might be a transaction node. */
> -		acc->domid = hdr->perms[0].id;
> +		acc->domid = perms_from_node_hdr(hdr)->id;
>   		acc->memory = size;
>   	}
>   
> @@ -699,7 +704,7 @@ struct node *read_node(struct connection *conn, const void *ctx,
>   		       const char *name)
>   {
>   	size_t size;
> -	const struct xs_tdb_record_hdr *hdr;
> +	const struct node_hdr *hdr;
>   	struct node *node;
>   	const char *db_name;
>   	int err;
> @@ -733,12 +738,12 @@ struct node *read_node(struct connection *conn, const void *ctx,
>   	node->perms.num = hdr->num_perms;
>   	node->datalen = hdr->datalen;
>   	node->childlen = hdr->childlen;
> -	node->acc.domid = hdr->perms[0].id;
> +	node->acc.domid = perms_from_node_hdr(hdr)->id;
>   	node->acc.memory = size;
>   
>   	/* Copy node data to new memory area, starting with permissions. */
>   	size -= sizeof(*hdr);
> -	node->perms.p = talloc_memdup(node, hdr->perms, size);
> +	node->perms.p = talloc_memdup(node, perms_from_node_hdr(hdr), size);
>   	if (node->perms.p == NULL) {
>   		errno = ENOMEM;
>   		goto error;
> @@ -785,7 +790,7 @@ int write_node_raw(struct connection *conn, const char *db_name,
>   	void *data;
>   	size_t size;
>   	void *p;
> -	struct xs_tdb_record_hdr *hdr;
> +	struct node_hdr *hdr;
>   
>   	if (domain_adjust_node_perms(node))
>   		return errno;
> @@ -812,9 +817,9 @@ int write_node_raw(struct connection *conn, const char *db_name,
>   	hdr->datalen = node->datalen;
>   	hdr->childlen = node->childlen;
>   
> -	memcpy(hdr->perms, node->perms.p,
> -	       node->perms.num * sizeof(*node->perms.p));
> -	p = hdr->perms + node->perms.num;
> +	p = perms_from_node_hdr(hdr);
> +	memcpy(p, node->perms.p, node->perms.num * sizeof(*node->perms.p));
> +	p += node->perms.num * sizeof(*node->perms.p);
>   	memcpy(p, node->data, node->datalen);
>   	p += node->datalen;
>   	memcpy(p, node->children, node->childlen);
> diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
> index 6d1578ce97..c965709090 100644
> --- a/tools/xenstore/xenstored_core.h
> +++ b/tools/xenstore/xenstored_core.h
> @@ -168,6 +168,24 @@ struct connection
>   };
>   extern struct list_head connections;
>   
> +/*
> + * Header of the node record in the data base.
> + * In the data base the memory of the node is a single memory chunk with the
> + * following format:
> + * struct {
> + *     node_hdr hdr;
> + *     struct xs_permissions perms[hdr.num_perms];
> + *     char data[hdr.datalen];
> + *     char children[hdr.childlen];
> + * };
> + */
> +struct node_hdr {
> +	uint64_t generation;
> +	uint16_t num_perms;

OOI, do you have a use case where a node would be shared with more than 
255 domains?

> +	uint16_t datalen;
> +	uint32_t childlen;
> +};
> +
>   struct node_perms {
>   	unsigned int num;
>   	struct xs_permissions *p;
> @@ -362,7 +380,7 @@ extern xengnttab_handle **xgt_handle;
>   int remember_string(struct hashtable *hash, const char *str);
>   
>   /* Data base access functions. */
> -const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size);
> +const struct node_hdr *db_fetch(const char *db_name, size_t *size);
>   int db_write(struct connection *conn, const char *db_name, const void *data,
>   	     size_t size, struct node_account_data *acc,
>   	     enum write_node_mode mode, bool no_quota_check);
> diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c
> index a90283dcc5..9ca73b9874 100644
> --- a/tools/xenstore/xenstored_transaction.c
> +++ b/tools/xenstore/xenstored_transaction.c
> @@ -357,7 +357,7 @@ static int finalize_transaction(struct connection *conn,
>   {
>   	struct accessed_node *i, *n;
>   	size_t size;
> -	const struct xs_tdb_record_hdr *hdr;
> +	const struct node_hdr *hdr;
>   	uint64_t gen;
>   
>   	list_for_each_entry_safe(i, n, &trans->accessed, list) {
> @@ -394,12 +394,12 @@ static int finalize_transaction(struct connection *conn,
>   				 * generation count.
>   				 */
>   				enum write_node_mode mode;
> -				struct xs_tdb_record_hdr *own;
> +				struct node_hdr *own;
>   
>   				talloc_increase_ref_count(hdr);
>   				db_delete(conn, i->trans_name, NULL);
>   
> -				own = (struct xs_tdb_record_hdr *)hdr;
> +				own = (struct node_hdr *)hdr;
>   				own->generation = ++generation;
>   				mode = (i->generation == NO_GENERATION)
>   				       ? NODE_CREATE : NODE_MODIFY;

Cheers,
Jürgen Groß July 28, 2023, 6:23 a.m. UTC | #2
On 27.07.23 23:53, Julien Grall wrote:
> Hi Juergen,
> 
> On 24/07/2023 12:02, Juergen Gross wrote:
>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>> When working on a node, struct node is being used, which is including
>> the same information as struct xs_tdb_record_hdr, but in a different
>> format. Rework struct xs_tdb_record_hdr in order to prepare including
>> it in struct node.
>>
>> Do the following modifications:
>>
>> - move its definition to xenstored_core.h, as the reason to put it into
>>    utils.h are no longer existing
>>
>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>    historical reasons
>>
>> - replace the empty permission array at the end with a comment about
>>    the layout of data in the data base (concatenation of header,
>>    permissions, node contents, and children list)
>>
>> - use narrower types for num_perms and datalen, as those are naturally
>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
>>    in theory basically unlimited)
> 
> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or always apply for 
> all the connection. I am aware of at least one downstream use where this is not 
> the case.
> 
> I am happy to use narrower types, but I would at least like some checks in 
> Xenstore to ensure the limits are not reached.

I will add a BUILD_BUG_ON().

> 
>>
>> Signed-off-by: Juergen Gross <jgross@suse.com>
>> ---
>> V2:
>> - new patch
>> ---
>>   tools/xenstore/utils.h                 |  9 -------
>>   tools/xenstore/xenstored_core.c        | 35 +++++++++++++++-----------
>>   tools/xenstore/xenstored_core.h        | 20 ++++++++++++++-
>>   tools/xenstore/xenstored_transaction.c |  6 ++---
>>   4 files changed, 42 insertions(+), 28 deletions(-)
>>
>> diff --git a/tools/xenstore/utils.h b/tools/xenstore/utils.h
>> index 028ecb9d7a..405d662ea2 100644
>> --- a/tools/xenstore/utils.h
>> +++ b/tools/xenstore/utils.h
>> @@ -9,15 +9,6 @@
>>   #include "xenstore_lib.h"
>> -/* Header of the node record in tdb. */
>> -struct xs_tdb_record_hdr {
>> -    uint64_t generation;
>> -    uint32_t num_perms;
>> -    uint32_t datalen;
>> -    uint32_t childlen;
>> -    struct xs_permissions perms[0];
>> -};
>> -
>>   /* Is A == B ? */
>>   #define streq(a,b) (strcmp((a),(b)) == 0)
>> diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
>> index 1f5f118f1c..86b7c9bf36 100644
>> --- a/tools/xenstore/xenstored_core.c
>> +++ b/tools/xenstore/xenstored_core.c
>> @@ -555,9 +555,9 @@ static void initialize_fds(int *p_sock_pollfd_idx, int 
>> *ptimeout)
>>       }
>>   }
>> -const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
>> +const struct node_hdr *db_fetch(const char *db_name, size_t *size)
>>   {
>> -    struct xs_tdb_record_hdr *hdr;
>> +    struct node_hdr *hdr;
>>       hdr = hashtable_search(nodes, db_name);
>>       if (!hdr) {
>> @@ -565,7 +565,7 @@ const struct xs_tdb_record_hdr *db_fetch(const char 
>> *db_name, size_t *size)
>>           return NULL;
>>       }
>> -    *size = sizeof(*hdr) + hdr->num_perms * sizeof(hdr->perms[0]) +
>> +    *size = sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) +
>>           hdr->datalen + hdr->childlen;
>>       trace_tdb("read %s size %zu\n", db_name, *size + strlen(db_name));
>> @@ -573,10 +573,15 @@ const struct xs_tdb_record_hdr *db_fetch(const char 
>> *db_name, size_t *size)
>>       return hdr;
>>   }
>> +static struct xs_permissions *perms_from_node_hdr(const struct node_hdr *hdr)
> 
> I don't much like the casting-away the const here. Looking at the code, most of 
> the users seems to not modify the value. So how about return const here and 
> open-coding or casting-away the const in the caller (with a proper explanation)?

I'll look into that.

> 
>> +{
>> +    return (struct xs_permissions *)(hdr + 1);
>> +}
>> +
>>   static void get_acc_data(const char *name, struct node_account_data *acc)
>>   {
>>       size_t size;
>> -    const struct xs_tdb_record_hdr *hdr;
>> +    const struct node_hdr *hdr;
>>       if (acc->memory < 0) {
>>           hdr = db_fetch(name, &size);
>> @@ -585,7 +590,7 @@ static void get_acc_data(const char *name, struct 
>> node_account_data *acc)
>>               acc->memory = 0;
>>           } else {
>>               acc->memory = size;
>> -            acc->domid = hdr->perms[0].id;
>> +            acc->domid = perms_from_node_hdr(hdr)->id;
>>           }
>>       }
>>   }
>> @@ -606,7 +611,7 @@ int db_write(struct connection *conn, const char *db_name, 
>> const void *data,
>>            size_t size, struct node_account_data *acc,
>>            enum write_node_mode mode, bool no_quota_check)
>>   {
>> -    const struct xs_tdb_record_hdr *hdr = data;
>> +    const struct node_hdr *hdr = data;
>>       struct node_account_data old_acc = {};
>>       unsigned int old_domid, new_domid;
>>       size_t name_len = strlen(db_name);
>> @@ -620,7 +625,7 @@ int db_write(struct connection *conn, const char *db_name, 
>> const void *data,
>>       get_acc_data(db_name, &old_acc);
>>       old_domid = get_acc_domid(conn, db_name, old_acc.domid);
>> -    new_domid = get_acc_domid(conn, db_name, hdr->perms[0].id);
>> +    new_domid = get_acc_domid(conn, db_name, perms_from_node_hdr(hdr)->id);
>>       /*
>>        * Don't check for ENOENT, as we want to be able to switch orphaned
>> @@ -661,7 +666,7 @@ int db_write(struct connection *conn, const char *db_name, 
>> const void *data,
>>       if (acc) {
>>           /* Don't use new_domid, as it might be a transaction node. */
>> -        acc->domid = hdr->perms[0].id;
>> +        acc->domid = perms_from_node_hdr(hdr)->id;
>>           acc->memory = size;
>>       }
>> @@ -699,7 +704,7 @@ struct node *read_node(struct connection *conn, const void 
>> *ctx,
>>                  const char *name)
>>   {
>>       size_t size;
>> -    const struct xs_tdb_record_hdr *hdr;
>> +    const struct node_hdr *hdr;
>>       struct node *node;
>>       const char *db_name;
>>       int err;
>> @@ -733,12 +738,12 @@ struct node *read_node(struct connection *conn, const 
>> void *ctx,
>>       node->perms.num = hdr->num_perms;
>>       node->datalen = hdr->datalen;
>>       node->childlen = hdr->childlen;
>> -    node->acc.domid = hdr->perms[0].id;
>> +    node->acc.domid = perms_from_node_hdr(hdr)->id;
>>       node->acc.memory = size;
>>       /* Copy node data to new memory area, starting with permissions. */
>>       size -= sizeof(*hdr);
>> -    node->perms.p = talloc_memdup(node, hdr->perms, size);
>> +    node->perms.p = talloc_memdup(node, perms_from_node_hdr(hdr), size);
>>       if (node->perms.p == NULL) {
>>           errno = ENOMEM;
>>           goto error;
>> @@ -785,7 +790,7 @@ int write_node_raw(struct connection *conn, const char 
>> *db_name,
>>       void *data;
>>       size_t size;
>>       void *p;
>> -    struct xs_tdb_record_hdr *hdr;
>> +    struct node_hdr *hdr;
>>       if (domain_adjust_node_perms(node))
>>           return errno;
>> @@ -812,9 +817,9 @@ int write_node_raw(struct connection *conn, const char 
>> *db_name,
>>       hdr->datalen = node->datalen;
>>       hdr->childlen = node->childlen;
>> -    memcpy(hdr->perms, node->perms.p,
>> -           node->perms.num * sizeof(*node->perms.p));
>> -    p = hdr->perms + node->perms.num;
>> +    p = perms_from_node_hdr(hdr);
>> +    memcpy(p, node->perms.p, node->perms.num * sizeof(*node->perms.p));
>> +    p += node->perms.num * sizeof(*node->perms.p);
>>       memcpy(p, node->data, node->datalen);
>>       p += node->datalen;
>>       memcpy(p, node->children, node->childlen);
>> diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
>> index 6d1578ce97..c965709090 100644
>> --- a/tools/xenstore/xenstored_core.h
>> +++ b/tools/xenstore/xenstored_core.h
>> @@ -168,6 +168,24 @@ struct connection
>>   };
>>   extern struct list_head connections;
>> +/*
>> + * Header of the node record in the data base.
>> + * In the data base the memory of the node is a single memory chunk with the
>> + * following format:
>> + * struct {
>> + *     node_hdr hdr;
>> + *     struct xs_permissions perms[hdr.num_perms];
>> + *     char data[hdr.datalen];
>> + *     char children[hdr.childlen];
>> + * };
>> + */
>> +struct node_hdr {
>> +    uint64_t generation;
>> +    uint16_t num_perms;
> 
> OOI, do you have a use case where a node would be shared with more than 255 
> domains?

No, but limiting it wouldn't give any real advantage.

> 
>> +    uint16_t datalen;
>> +    uint32_t childlen;
>> +};
>> +
>>   struct node_perms {
>>       unsigned int num;
>>       struct xs_permissions *p;
>> @@ -362,7 +380,7 @@ extern xengnttab_handle **xgt_handle;
>>   int remember_string(struct hashtable *hash, const char *str);
>>   /* Data base access functions. */
>> -const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size);
>> +const struct node_hdr *db_fetch(const char *db_name, size_t *size);
>>   int db_write(struct connection *conn, const char *db_name, const void *data,
>>            size_t size, struct node_account_data *acc,
>>            enum write_node_mode mode, bool no_quota_check);
>> diff --git a/tools/xenstore/xenstored_transaction.c 
>> b/tools/xenstore/xenstored_transaction.c
>> index a90283dcc5..9ca73b9874 100644
>> --- a/tools/xenstore/xenstored_transaction.c
>> +++ b/tools/xenstore/xenstored_transaction.c
>> @@ -357,7 +357,7 @@ static int finalize_transaction(struct connection *conn,
>>   {
>>       struct accessed_node *i, *n;
>>       size_t size;
>> -    const struct xs_tdb_record_hdr *hdr;
>> +    const struct node_hdr *hdr;
>>       uint64_t gen;
>>       list_for_each_entry_safe(i, n, &trans->accessed, list) {
>> @@ -394,12 +394,12 @@ static int finalize_transaction(struct connection *conn,
>>                    * generation count.
>>                    */
>>                   enum write_node_mode mode;
>> -                struct xs_tdb_record_hdr *own;
>> +                struct node_hdr *own;
>>                   talloc_increase_ref_count(hdr);
>>                   db_delete(conn, i->trans_name, NULL);
>> -                own = (struct xs_tdb_record_hdr *)hdr;
>> +                own = (struct node_hdr *)hdr;
>>                   own->generation = ++generation;
>>                   mode = (i->generation == NO_GENERATION)
>>                          ? NODE_CREATE : NODE_MODIFY;

Juergen
Julien Grall July 28, 2023, 8:59 a.m. UTC | #3
Hi Juergen,

On 28/07/2023 07:23, Juergen Gross wrote:
> On 27.07.23 23:53, Julien Grall wrote:
>> Hi Juergen,
>>
>> On 24/07/2023 12:02, Juergen Gross wrote:
>>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>>> When working on a node, struct node is being used, which is including
>>> the same information as struct xs_tdb_record_hdr, but in a different
>>> format. Rework struct xs_tdb_record_hdr in order to prepare including
>>> it in struct node.
>>>
>>> Do the following modifications:
>>>
>>> - move its definition to xenstored_core.h, as the reason to put it into
>>>    utils.h are no longer existing
>>>
>>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>>    historical reasons
>>>
>>> - replace the empty permission array at the end with a comment about
>>>    the layout of data in the data base (concatenation of header,
>>>    permissions, node contents, and children list)
>>>
>>> - use narrower types for num_perms and datalen, as those are naturally
>>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
>>>    in theory basically unlimited)
>>
>> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or always 
>> apply for all the connection. I am aware of at least one downstream 
>> use where this is not the case.
>>
>> I am happy to use narrower types, but I would at least like some 
>> checks in Xenstore to ensure the limits are not reached.
> 
> I will add a BUILD_BUG_ON().

Initially I was thinking about a runtime check, but I am also fine with 
a BUILD_BUG_ON() if it is right next to length check in handle_input(). 
So if someone decided to add different payload size depending on the 
domain (such as privileged domain could do more), it would be easier to 
spot what else needs to be changed.

>> OOI, do you have a use case where a node would be shared with more 
>> than 255 domains?
> 
> No, but limiting it wouldn't give any real advantage.

I guess by advantage you mean that the size of the structure would still 
be the same. I thought this was the rationale but I asked just in case 
you had something else in mind. For instance, Xen technically supports 
up to ~32 000 domains. But I think it would be crazy to decide to have 
more than a few tens permissions on a node :).

Cheers,
Jürgen Groß July 28, 2023, 9:14 a.m. UTC | #4
On 28.07.23 10:59, Julien Grall wrote:
> Hi Juergen,
> 
> On 28/07/2023 07:23, Juergen Gross wrote:
>> On 27.07.23 23:53, Julien Grall wrote:
>>> Hi Juergen,
>>>
>>> On 24/07/2023 12:02, Juergen Gross wrote:
>>>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>>>> When working on a node, struct node is being used, which is including
>>>> the same information as struct xs_tdb_record_hdr, but in a different
>>>> format. Rework struct xs_tdb_record_hdr in order to prepare including
>>>> it in struct node.
>>>>
>>>> Do the following modifications:
>>>>
>>>> - move its definition to xenstored_core.h, as the reason to put it into
>>>>    utils.h are no longer existing
>>>>
>>>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>>>    historical reasons
>>>>
>>>> - replace the empty permission array at the end with a comment about
>>>>    the layout of data in the data base (concatenation of header,
>>>>    permissions, node contents, and children list)
>>>>
>>>> - use narrower types for num_perms and datalen, as those are naturally
>>>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
>>>>    in theory basically unlimited)
>>>
>>> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or always apply 
>>> for all the connection. I am aware of at least one downstream use where this 
>>> is not the case.
>>>
>>> I am happy to use narrower types, but I would at least like some checks in 
>>> Xenstore to ensure the limits are not reached.
>>
>> I will add a BUILD_BUG_ON().
> 
> Initially I was thinking about a runtime check, but I am also fine with a 
> BUILD_BUG_ON() if it is right next to length check in handle_input(). So if 
> someone decided to add different payload size depending on the domain (such as 
> privileged domain could do more), it would be easier to spot what else needs to 
> be changed.

Is this really the correct placement?

I've put it into write_node_raw(), as this is where the related datalen member
is being filled. This placement has the further advantage that I already have
a related pointer at hand, so I can use:

	BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >= (typeof(hdr->datalen))(-1));

which is exactly what should be tested.

> 
>>> OOI, do you have a use case where a node would be shared with more than 255 
>>> domains?
>>
>> No, but limiting it wouldn't give any real advantage.
> 
> I guess by advantage you mean that the size of the structure would still be the 
> same. I thought this was the rationale but I asked just in case you had 
> something else in mind. For instance, Xen technically supports up to ~32 000 
> domains. But I think it would be crazy to decide to have more than a few tens 
> permissions on a node :).

Yes to both. OTOH why should we deny to allow that without any urgent need to do
so?


Juergen
Julien Grall July 28, 2023, 9:38 a.m. UTC | #5
Hi Juergen,

On 28/07/2023 10:14, Juergen Gross wrote:
> On 28.07.23 10:59, Julien Grall wrote:
>> Hi Juergen,
>>
>> On 28/07/2023 07:23, Juergen Gross wrote:
>>> On 27.07.23 23:53, Julien Grall wrote:
>>>> Hi Juergen,
>>>>
>>>> On 24/07/2023 12:02, Juergen Gross wrote:
>>>>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>>>>> When working on a node, struct node is being used, which is including
>>>>> the same information as struct xs_tdb_record_hdr, but in a different
>>>>> format. Rework struct xs_tdb_record_hdr in order to prepare including
>>>>> it in struct node.
>>>>>
>>>>> Do the following modifications:
>>>>>
>>>>> - move its definition to xenstored_core.h, as the reason to put it 
>>>>> into
>>>>>    utils.h are no longer existing
>>>>>
>>>>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>>>>    historical reasons
>>>>>
>>>>> - replace the empty permission array at the end with a comment about
>>>>>    the layout of data in the data base (concatenation of header,
>>>>>    permissions, node contents, and children list)
>>>>>
>>>>> - use narrower types for num_perms and datalen, as those are naturally
>>>>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as 
>>>>> it is
>>>>>    in theory basically unlimited)
>>>>
>>>> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or 
>>>> always apply for all the connection. I am aware of at least one 
>>>> downstream use where this is not the case.
>>>>
>>>> I am happy to use narrower types, but I would at least like some 
>>>> checks in Xenstore to ensure the limits are not reached.
>>>
>>> I will add a BUILD_BUG_ON().
>>
>> Initially I was thinking about a runtime check, but I am also fine 
>> with a BUILD_BUG_ON() if it is right next to length check in 
>> handle_input(). So if someone decided to add different payload size 
>> depending on the domain (such as privileged domain could do more), it 
>> would be easier to spot what else needs to be changed.
> 
> Is this really the correct placement?

I think so. By adding the BUILD_ON_BUG() right above the length check, 
it would be easier for everyone to know that the datastructure may also 
need to change. This ...

> 
> I've put it into write_node_raw(), as this is where the related datalen 
> member
> is being filled. 

... would be less obvious here and I think it might be miss.

Cheers,
Jürgen Groß July 28, 2023, 9:45 a.m. UTC | #6
On 28.07.23 11:38, Julien Grall wrote:
> Hi Juergen,
> 
> On 28/07/2023 10:14, Juergen Gross wrote:
>> On 28.07.23 10:59, Julien Grall wrote:
>>> Hi Juergen,
>>>
>>> On 28/07/2023 07:23, Juergen Gross wrote:
>>>> On 27.07.23 23:53, Julien Grall wrote:
>>>>> Hi Juergen,
>>>>>
>>>>> On 24/07/2023 12:02, Juergen Gross wrote:
>>>>>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>>>>>> When working on a node, struct node is being used, which is including
>>>>>> the same information as struct xs_tdb_record_hdr, but in a different
>>>>>> format. Rework struct xs_tdb_record_hdr in order to prepare including
>>>>>> it in struct node.
>>>>>>
>>>>>> Do the following modifications:
>>>>>>
>>>>>> - move its definition to xenstored_core.h, as the reason to put it into
>>>>>>    utils.h are no longer existing
>>>>>>
>>>>>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>>>>>    historical reasons
>>>>>>
>>>>>> - replace the empty permission array at the end with a comment about
>>>>>>    the layout of data in the data base (concatenation of header,
>>>>>>    permissions, node contents, and children list)
>>>>>>
>>>>>> - use narrower types for num_perms and datalen, as those are naturally
>>>>>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
>>>>>>    in theory basically unlimited)
>>>>>
>>>>> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or always 
>>>>> apply for all the connection. I am aware of at least one downstream use 
>>>>> where this is not the case.
>>>>>
>>>>> I am happy to use narrower types, but I would at least like some checks in 
>>>>> Xenstore to ensure the limits are not reached.
>>>>
>>>> I will add a BUILD_BUG_ON().
>>>
>>> Initially I was thinking about a runtime check, but I am also fine with a 
>>> BUILD_BUG_ON() if it is right next to length check in handle_input(). So if 
>>> someone decided to add different payload size depending on the domain (such 
>>> as privileged domain could do more), it would be easier to spot what else 
>>> needs to be changed.
>>
>> Is this really the correct placement?
> 
> I think so. By adding the BUILD_ON_BUG() right above the length check, it would 
> be easier for everyone to know that the datastructure may also need to change. 
> This ...
> 
>>
>> I've put it into write_node_raw(), as this is where the related datalen member
>> is being filled. 
> 
> ... would be less obvious here and I think it might be miss.

Assuming that someone changing XENSTORE_PAYLOAD_MAX would do a build afterwards,
I don't see how such a failure could be missed. In case of a runtime check I
agree that a more central place would be preferred.

In the end I don't mind that much, but

	BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
		     (typeof((struct node_hdr *)NULL->datalen))(-1));

is a little bit clumsy IMHO.


Juergen
Julien Grall July 28, 2023, 10:34 a.m. UTC | #7
Hi,

On 28/07/2023 10:45, Juergen Gross wrote:
> On 28.07.23 11:38, Julien Grall wrote:
>> Hi Juergen,
>>
>> On 28/07/2023 10:14, Juergen Gross wrote:
>>> On 28.07.23 10:59, Julien Grall wrote:
>>>> Hi Juergen,
>>>>
>>>> On 28/07/2023 07:23, Juergen Gross wrote:
>>>>> On 27.07.23 23:53, Julien Grall wrote:
>>>>>> Hi Juergen,
>>>>>>
>>>>>> On 24/07/2023 12:02, Juergen Gross wrote:
>>>>>>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>>>>>>> When working on a node, struct node is being used, which is 
>>>>>>> including
>>>>>>> the same information as struct xs_tdb_record_hdr, but in a different
>>>>>>> format. Rework struct xs_tdb_record_hdr in order to prepare 
>>>>>>> including
>>>>>>> it in struct node.
>>>>>>>
>>>>>>> Do the following modifications:
>>>>>>>
>>>>>>> - move its definition to xenstored_core.h, as the reason to put 
>>>>>>> it into
>>>>>>>    utils.h are no longer existing
>>>>>>>
>>>>>>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>>>>>>    historical reasons
>>>>>>>
>>>>>>> - replace the empty permission array at the end with a comment about
>>>>>>>    the layout of data in the data base (concatenation of header,
>>>>>>>    permissions, node contents, and children list)
>>>>>>>
>>>>>>> - use narrower types for num_perms and datalen, as those are 
>>>>>>> naturally
>>>>>>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, 
>>>>>>> as it is
>>>>>>>    in theory basically unlimited)
>>>>>>
>>>>>> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or 
>>>>>> always apply for all the connection. I am aware of at least one 
>>>>>> downstream use where this is not the case.
>>>>>>
>>>>>> I am happy to use narrower types, but I would at least like some 
>>>>>> checks in Xenstore to ensure the limits are not reached.
>>>>>
>>>>> I will add a BUILD_BUG_ON().
>>>>
>>>> Initially I was thinking about a runtime check, but I am also fine 
>>>> with a BUILD_BUG_ON() if it is right next to length check in 
>>>> handle_input(). So if someone decided to add different payload size 
>>>> depending on the domain (such as privileged domain could do more), 
>>>> it would be easier to spot what else needs to be changed.
>>>
>>> Is this really the correct placement?
>>
>> I think so. By adding the BUILD_ON_BUG() right above the length check, 
>> it would be easier for everyone to know that the datastructure may 
>> also need to change. This ...
>>
>>>
>>> I've put it into write_node_raw(), as this is where the related 
>>> datalen member
>>> is being filled. 
>>
>> ... would be less obvious here and I think it might be miss.
> 
> Assuming that someone changing XENSTORE_PAYLOAD_MAX would do a build 
> afterwards,
> I don't see how such a failure could be missed.

Because one may want dom0 to send payload bigger than 
XENSTORE_PAYLOAD_MAX. Something like:

if ( conn->id != 0 && len < XENSTORE_PAYLOAD_MAX )

Such change would not be caught during compilation. AWS has a similar 
check in the downstream tree because the implementation of 
non-cooperative migration is using Xenstore command and we want to be 
able to transfer the state in a single command.

> In case of a runtime 
> check I
> agree that a more central place would be preferred.
> 
> In the end I don't mind that much, but
> 
>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>               (typeof((struct node_hdr *)NULL->datalen))(-1));
> 
> is a little bit clumsy IMHO.

Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the 
complexity. The code would then look like:

 >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))

Cheers,
Jürgen Groß July 28, 2023, 10:47 a.m. UTC | #8
On 28.07.23 12:34, Julien Grall wrote:
> Hi,
> 
> On 28/07/2023 10:45, Juergen Gross wrote:
>> On 28.07.23 11:38, Julien Grall wrote:
>>> Hi Juergen,
>>>
>>> On 28/07/2023 10:14, Juergen Gross wrote:
>>>> On 28.07.23 10:59, Julien Grall wrote:
>>>>> Hi Juergen,
>>>>>
>>>>> On 28/07/2023 07:23, Juergen Gross wrote:
>>>>>> On 27.07.23 23:53, Julien Grall wrote:
>>>>>>> Hi Juergen,
>>>>>>>
>>>>>>> On 24/07/2023 12:02, Juergen Gross wrote:
>>>>>>>> Struct xs_tdb_record_hdr is used for nodes stored in the data base.
>>>>>>>> When working on a node, struct node is being used, which is including
>>>>>>>> the same information as struct xs_tdb_record_hdr, but in a different
>>>>>>>> format. Rework struct xs_tdb_record_hdr in order to prepare including
>>>>>>>> it in struct node.
>>>>>>>>
>>>>>>>> Do the following modifications:
>>>>>>>>
>>>>>>>> - move its definition to xenstored_core.h, as the reason to put it into
>>>>>>>>    utils.h are no longer existing
>>>>>>>>
>>>>>>>> - rename it to struct node_hdr, as the "tdb" in its name has only
>>>>>>>>    historical reasons
>>>>>>>>
>>>>>>>> - replace the empty permission array at the end with a comment about
>>>>>>>>    the layout of data in the data base (concatenation of header,
>>>>>>>>    permissions, node contents, and children list)
>>>>>>>>
>>>>>>>> - use narrower types for num_perms and datalen, as those are naturally
>>>>>>>>    limited to XENSTORE_PAYLOAD_MAX (childlen is different here, as it is
>>>>>>>>    in theory basically unlimited)
>>>>>>>
>>>>>>> The assumption is XENSTORE_PAYLOAD_MAX will never change and/or always 
>>>>>>> apply for all the connection. I am aware of at least one downstream use 
>>>>>>> where this is not the case.
>>>>>>>
>>>>>>> I am happy to use narrower types, but I would at least like some checks 
>>>>>>> in Xenstore to ensure the limits are not reached.
>>>>>>
>>>>>> I will add a BUILD_BUG_ON().
>>>>>
>>>>> Initially I was thinking about a runtime check, but I am also fine with a 
>>>>> BUILD_BUG_ON() if it is right next to length check in handle_input(). So if 
>>>>> someone decided to add different payload size depending on the domain (such 
>>>>> as privileged domain could do more), it would be easier to spot what else 
>>>>> needs to be changed.
>>>>
>>>> Is this really the correct placement?
>>>
>>> I think so. By adding the BUILD_ON_BUG() right above the length check, it 
>>> would be easier for everyone to know that the datastructure may also need to 
>>> change. This ...
>>>
>>>>
>>>> I've put it into write_node_raw(), as this is where the related datalen member
>>>> is being filled. 
>>>
>>> ... would be less obvious here and I think it might be miss.
>>
>> Assuming that someone changing XENSTORE_PAYLOAD_MAX would do a build afterwards,
>> I don't see how such a failure could be missed.
> 
> Because one may want dom0 to send payload bigger than XENSTORE_PAYLOAD_MAX. 
> Something like:
> 
> if ( conn->id != 0 && len < XENSTORE_PAYLOAD_MAX )
> 
> Such change would not be caught during compilation. AWS has a similar check in 
> the downstream tree because the implementation of non-cooperative migration is 
> using Xenstore command and we want to be able to transfer the state in a single 
> command.

And how directly is this related to the max data size of node contents?

As soon as you are allowing dom0 to write larger nodes, you are risking to
kill client connections trying to read such a node. So the node size should
still be limited to XENSTORE_PAYLOAD_MAX.

IMO another reason to use the placement I've suggested. AWS should even add
a size check when writing nodes to make sure dom0 doesn't do evil things.

> 
>> In case of a runtime check I
>> agree that a more central place would be preferred.
>>
>> In the end I don't mind that much, but
>>
>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>
>> is a little bit clumsy IMHO.
> 
> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the complexity. 
> The code would then look like:
> 
>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))

Oh, I guess you mean sizeof_field().

And even with that it would look quite clumsy:

	BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
		     (1UL << (8 * sizeof_field(struct node_hdr, datalen))));


Juergen
Julien Grall July 28, 2023, 11:19 a.m. UTC | #9
On 28/07/2023 11:47, Juergen Gross wrote:
> On 28.07.23 12:34, Julien Grall wrote:
>> Because one may want dom0 to send payload bigger than 
>> XENSTORE_PAYLOAD_MAX. Something like:
>>
>> if ( conn->id != 0 && len < XENSTORE_PAYLOAD_MAX )
>>
>> Such change would not be caught during compilation. AWS has a similar 
>> check in the downstream tree because the implementation of 
>> non-cooperative migration is using Xenstore command and we want to be 
>> able to transfer the state in a single command.
> 
> And how directly is this related to the max data size of node contents?

I think you missed my point. Until this patch, the existing field would 
be able to accomodate very large payload. This doesn't hold anymore.

What I was trying to convey is that anyone looking at relaxing the check 
in handle_input() needs to be able to find "easily" that other part of 
Xenstored are making some assumption based on the maximum length.

> 
> As soon as you are allowing dom0 to write larger nodes, you are risking to
> kill client connections trying to read such a node. So the node size should
> still be limited to XENSTORE_PAYLOAD_MAX.
> 
> IMO another reason to use the placement I've suggested.

I agree that BUILD_BUG_ON() makes sense where you suggest if you think 
about where the runtime check would happen.

It seems like we have two different aims here. Mine is to make sure we 
make it more difficult to introduce a security hole if the lenght check 
is relaxed.

I have made a proposal below that may suit both our aim.

> AWS should even add
> a size check when writing nodes to make sure dom0 doesn't do evil things.

What make you think we don't already have such checked? ;)

Also, I noticed you mention about datalen. What about the number of 
permissions?

> 
>>
>>> In case of a runtime check I
>>> agree that a more central place would be preferred.
>>>
>>> In the end I don't mind that much, but
>>>
>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>>
>>> is a little bit clumsy IMHO.
>>
>> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the 
>> complexity. The code would then look like:
>>
>>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))
> 
> Oh, I guess you mean sizeof_field().
> 
> And even with that it would look quite clumsy:
> 
>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>               (1UL << (8 * sizeof_field(struct node_hdr, datalen))));

How about keeping the BUILD_BUG_ON() in write_node_raw() and add the 
following comment on top of handle_input():

Some fields in Xenstored are sized based on the max payload (see various 
BUILD_BUG_ON()). This would need extra runtime check if we ever decide 
to have a dynamic payload size.

Cheers,
Jürgen Groß July 28, 2023, 12:06 p.m. UTC | #10
On 28.07.23 13:19, Julien Grall wrote:
> 
> 
> On 28/07/2023 11:47, Juergen Gross wrote:
>> On 28.07.23 12:34, Julien Grall wrote:
>>> Because one may want dom0 to send payload bigger than XENSTORE_PAYLOAD_MAX. 
>>> Something like:
>>>
>>> if ( conn->id != 0 && len < XENSTORE_PAYLOAD_MAX )
>>>
>>> Such change would not be caught during compilation. AWS has a similar check 
>>> in the downstream tree because the implementation of non-cooperative 
>>> migration is using Xenstore command and we want to be able to transfer the 
>>> state in a single command.
>>
>> And how directly is this related to the max data size of node contents?
> 
> I think you missed my point. Until this patch, the existing field would be able 
> to accomodate very large payload. This doesn't hold anymore.

Yes. And the field is related to node data only, so it should be checked to
be large enough where it is written, as this is the critical operation where
truncation would happen.

> What I was trying to convey is that anyone looking at relaxing the check in 
> handle_input() needs to be able to find "easily" that other part of Xenstored 
> are making some assumption based on the maximum length.

As the AWS example is showing, the limitation isn't in handle_input().

And TBH: allowing sometimes a payload larger than XENSTORE_PAYLOAD_MAX isn't
something we should encourage. This is rather a very bad design decision.

> 
>>
>> As soon as you are allowing dom0 to write larger nodes, you are risking to
>> kill client connections trying to read such a node. So the node size should
>> still be limited to XENSTORE_PAYLOAD_MAX.
>>
>> IMO another reason to use the placement I've suggested.
> 
> I agree that BUILD_BUG_ON() makes sense where you suggest if you think about 
> where the runtime check would happen.

Correct.

> It seems like we have two different aims here. Mine is to make sure we make it 
> more difficult to introduce a security hole if the lenght check is relaxed.

Which (as written above) would set a very bad precedence.

XENSTORE_PAYLOAD_MAX should be a hard limit.

> I have made a proposal below that may suit both our aim.
> 
>> AWS should even add
>> a size check when writing nodes to make sure dom0 doesn't do evil things.
> 
> What make you think we don't already have such checked? ;)

Just a wild guess. :-)

> Also, I noticed you mention about datalen. What about the number of permissions?

More than max number of domains? Now this would really be crazy. And it would
need operations using more data than XENSTORE_PAYLOAD_MAX allows. :-)

>>>> In case of a runtime check I
>>>> agree that a more central place would be preferred.
>>>>
>>>> In the end I don't mind that much, but
>>>>
>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>>>
>>>> is a little bit clumsy IMHO.
>>>
>>> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the 
>>> complexity. The code would then look like:
>>>
>>>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))
>>
>> Oh, I guess you mean sizeof_field().
>>
>> And even with that it would look quite clumsy:
>>
>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>               (1UL << (8 * sizeof_field(struct node_hdr, datalen))));
> 
> How about keeping the BUILD_BUG_ON() in write_node_raw() and add the following 
> comment on top of handle_input():
> 
> Some fields in Xenstored are sized based on the max payload (see various 
> BUILD_BUG_ON()). This would need extra runtime check if we ever decide to have a 
> dynamic payload size.

I _could_ do that, but where to stop adding such comments?

TBH, I really don't see the point doing that.

In case a patch came up upstream trying to violate XENSTORE_PAYLOAD_MAX I would
surely NACK it.

In case we need payloads larger than XENSTORE_PAYLOAD_MAX we should split the
related operation in multiple parts (see e.g. XS_DIRECTORY_PART or XS_CONTROL
for uploading a new kernel to Xenstore-stubdom for live update). Which is, BTW,
the way AWS should have handled the migration problem (transactions come to my
mind in this context).


Juergen
Julien Grall July 28, 2023, 12:48 p.m. UTC | #11
Hi,

On 28/07/2023 13:06, Juergen Gross wrote:
> On 28.07.23 13:19, Julien Grall wrote: 
>>>>> In case of a runtime check I
>>>>> agree that a more central place would be preferred.
>>>>>
>>>>> In the end I don't mind that much, but
>>>>>
>>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>>>>
>>>>> is a little bit clumsy IMHO.
>>>>
>>>> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the 
>>>> complexity. The code would then look like:
>>>>
>>>>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))
>>>
>>> Oh, I guess you mean sizeof_field().
>>>
>>> And even with that it would look quite clumsy:
>>>
>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>               (1UL << (8 * sizeof_field(struct node_hdr, datalen))));
>>
>> How about keeping the BUILD_BUG_ON() in write_node_raw() and add the 
>> following comment on top of handle_input():
>>
>> Some fields in Xenstored are sized based on the max payload (see 
>> various BUILD_BUG_ON()). This would need extra runtime check if we 
>> ever decide to have a dynamic payload size.
> 
> I _could_ do that, but where to stop adding such comments?

When someone other than the author is able to understand the code 
without too much effort. More comments never hurts, less will in the 
longer run (see below).

> 
> TBH, I really don't see the point doing that.
> 
> In case a patch came up upstream trying to violate XENSTORE_PAYLOAD_MAX 
> I would
> surely NACK it.
That's assuming you will still be around when this happens :). I am not 
wishing anything bad but the code will likely outlast any of us.

So we need to make easy for a future maintainers/reviewers to know the 
assumptions and implications of changing some of the limits.

> In case we need payloads larger than XENSTORE_PAYLOAD_MAX we should 
> split the
> related operation in multiple parts (see e.g. XS_DIRECTORY_PART or 
> XS_CONTROL
> for uploading a new kernel to Xenstore-stubdom for live update). Which 
> is, BTW,
> the way AWS should have handled the migration problem (transactions come 
> to my
> mind in this context).

I wasn't part of the original design, but I can see why it was done like 
that.

Using multiple commands has also its downside. The first that comes to 
my mind if that you need to keep around the data. But, with your 
proposal, you we wouldn't be able to store it in the database (like for 
transaction update) as datalen can only be 65KB.

So one command as the advantage to simply a lot the logic in Xenstored.

Anyway, this is getting a bit off topic. My only request is to write 
down assumption more explicitly rather than hiding them. A comment on 
top of the check is a nice way to help the developper to avoid making a 
"bad" decision.

I am happy to rewrite the comment so it doesn't lead to think that you 
(as the maintainer) are open to have a more relax length check.

Cheers,
Jürgen Groß July 28, 2023, 1:24 p.m. UTC | #12
On 28.07.23 14:48, Julien Grall wrote:
> Hi,
> 
> On 28/07/2023 13:06, Juergen Gross wrote:
>> On 28.07.23 13:19, Julien Grall wrote:
>>>>>> In case of a runtime check I
>>>>>> agree that a more central place would be preferred.
>>>>>>
>>>>>> In the end I don't mind that much, but
>>>>>>
>>>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>>>>>
>>>>>> is a little bit clumsy IMHO.
>>>>>
>>>>> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the 
>>>>> complexity. The code would then look like:
>>>>>
>>>>>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))
>>>>
>>>> Oh, I guess you mean sizeof_field().
>>>>
>>>> And even with that it would look quite clumsy:
>>>>
>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>               (1UL << (8 * sizeof_field(struct node_hdr, datalen))));
>>>
>>> How about keeping the BUILD_BUG_ON() in write_node_raw() and add the 
>>> following comment on top of handle_input():
>>>
>>> Some fields in Xenstored are sized based on the max payload (see various 
>>> BUILD_BUG_ON()). This would need extra runtime check if we ever decide to 
>>> have a dynamic payload size.
>>
>> I _could_ do that, but where to stop adding such comments?
> 
> When someone other than the author is able to understand the code without too 
> much effort. More comments never hurts, less will in the longer run (see below).

I agree with that statement in general, but requesting a comment to aid a
future potential change violating the Xenstore wire protocol is a little bit
weird.

> 
>>
>> TBH, I really don't see the point doing that.
>>
>> In case a patch came up upstream trying to violate XENSTORE_PAYLOAD_MAX I would
>> surely NACK it.
> That's assuming you will still be around when this happens :). I am not wishing 
> anything bad but the code will likely outlast any of us.

Maybe. But would you really Ack patches adding comments like that in other
areas? E.g. cases where the domid is stored in an unsigned int and me adding
a comment to be careful in case domids ever grow to 64 bits?

> So we need to make easy for a future maintainers/reviewers to know the 
> assumptions and implications of changing some of the limits.

Yes, that's what the BUILD_BUG_ON() is meant for.

>> In case we need payloads larger than XENSTORE_PAYLOAD_MAX we should split the
>> related operation in multiple parts (see e.g. XS_DIRECTORY_PART or XS_CONTROL
>> for uploading a new kernel to Xenstore-stubdom for live update). Which is, BTW,
>> the way AWS should have handled the migration problem (transactions come to my
>> mind in this context).
> 
> I wasn't part of the original design, but I can see why it was done like that.

I can see why it was done that way, but this doesn't mean I can understand
why such a design should be supported by adding comments helping to repeat such
a bad decision.

> Using multiple commands has also its downside. The first that comes to my mind 
> if that you need to keep around the data. But, with your proposal, you we 
> wouldn't be able to store it in the database (like for transaction update) as 
> datalen can only be 65KB.

I wasn't aware that a complete transaction needs to be kept in a single data
base record. :-)

It would work perfectly fine to allocate the needed memory via talloc() and to
reference it from a special node being part of the transaction, or to not use
a node at all (see again the XS_CONTROL example).

> 
> So one command as the advantage to simply a lot the logic in Xenstored.

Simplifying the logic while violating the basic wire protocol is a bad design
decision (I'm repeating myself, I know).

> Anyway, this is getting a bit off topic. My only request is to write down 
> assumption more explicitly rather than hiding them. A comment on top of the 
> check is a nice way to help the developper to avoid making a "bad" decision.

I have added a brief comment why the check is existing at all. I even managed
to avoid any strong language. :-)

	/* Some downstreams modify XENSTORE_PAYLOAD_MAX. */

But maybe that comment was based on wrong assumptions, like the mentioned
change not violating the protocol.

> I am happy to rewrite the comment so it doesn't lead to think that you (as the 
> maintainer) are open to have a more relax length check.

Yes, please make a suggestion for a proper comment not suggesting we are fine
to violate the wire protocol.


Juergen
Julien Grall July 28, 2023, 2:08 p.m. UTC | #13
Hi Juergen,

On 28/07/2023 14:24, Juergen Gross wrote:
> On 28.07.23 14:48, Julien Grall wrote:
>> Hi,
>>
>> On 28/07/2023 13:06, Juergen Gross wrote:
>>> On 28.07.23 13:19, Julien Grall wrote:
>>>>>>> In case of a runtime check I
>>>>>>> agree that a more central place would be preferred.
>>>>>>>
>>>>>>> In the end I don't mind that much, but
>>>>>>>
>>>>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>>>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>>>>>>
>>>>>>> is a little bit clumsy IMHO.
>>>>>>
>>>>>> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide 
>>>>>> the complexity. The code would then look like:
>>>>>>
>>>>>>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))
>>>>>
>>>>> Oh, I guess you mean sizeof_field().
>>>>>
>>>>> And even with that it would look quite clumsy:
>>>>>
>>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>>               (1UL << (8 * sizeof_field(struct node_hdr, datalen))));
>>>>
>>>> How about keeping the BUILD_BUG_ON() in write_node_raw() and add the 
>>>> following comment on top of handle_input():
>>>>
>>>> Some fields in Xenstored are sized based on the max payload (see 
>>>> various BUILD_BUG_ON()). This would need extra runtime check if we 
>>>> ever decide to have a dynamic payload size.
>>>
>>> I _could_ do that, but where to stop adding such comments?
>>
>> When someone other than the author is able to understand the code 
>> without too much effort. More comments never hurts, less will in the 
>> longer run (see below).
> 
> I agree with that statement in general, but requesting a comment to aid a
> future potential change violating the Xenstore wire protocol is a little 
> bit
> weird.

Well... This is violating the existing protocol, but it is not set in 
stone and I think this is acceptable to update it when there is no 
change for the VMs and for new features (e.g. Live-Update/Live-Migration).

>>>
>>> TBH, I really don't see the point doing that.
>>>
>>> In case a patch came up upstream trying to violate 
>>> XENSTORE_PAYLOAD_MAX I would
>>> surely NACK it.
>> That's assuming you will still be around when this happens :). I am 
>> not wishing anything bad but the code will likely outlast any of us.
> 
> Maybe. But would you really Ack patches adding comments like that in other
> areas?

Potentially yes. We had a similar discussion on Arm when allowing 
paddr_t to be 32-bit.

[...]

>>> In case we need payloads larger than XENSTORE_PAYLOAD_MAX we should 
>>> split the
>>> related operation in multiple parts (see e.g. XS_DIRECTORY_PART or 
>>> XS_CONTROL
>>> for uploading a new kernel to Xenstore-stubdom for live update). 
>>> Which is, BTW,
>>> the way AWS should have handled the migration problem (transactions 
>>> come to my
>>> mind in this context).
>>
>> I wasn't part of the original design, but I can see why it was done 
>> like that.
> 
> I can see why it was done that way, but this doesn't mean I can understand
> why such a design should be supported by adding comments helping to 
> repeat such
> a bad decision.
> 
>> Using multiple commands has also its downside. The first that comes to 
>> my mind if that you need to keep around the data. But, with your 
>> proposal, you we wouldn't be able to store it in the database (like 
>> for transaction update) as datalen can only be 65KB.
> 
> I wasn't aware that a complete transaction needs to be kept in a single 
> data
> base record. :-)

IIUC, you are thinking that the client will restore all the state bits 
by bits. But if you look at the design in 
docs/designs/xenstore-migration.md, this is a blob.

> 
> It would work perfectly fine to allocate the needed memory via talloc() 
> and to
> reference it from a special node being part of the transaction, or to 
> not use
> a node at all (see again the XS_CONTROL example).

I am not convinced the complexity is worth it here. To be honest, I 
think the payload limit should have been relaxed for Live-Update as well 
as you don't gain much to split. That said, this is less a concern 
because you are not time constrained.

[...]

> But maybe that comment was based on wrong assumptions, like the mentioned
> change not violating the protocol. >
>> I am happy to rewrite the comment so it doesn't lead to think that you 
>> (as the maintainer) are open to have a more relax length check.
> 
> Yes, please make a suggestion for a proper comment not suggesting we are 
> fine
> to violate the wire protocol.

Here we go:

"The payload size is not only currently restricted by the protocol but 
also the internal implementation (see various BUILD_BUG_ON())."
Jürgen Groß July 28, 2023, 2:32 p.m. UTC | #14
On 28.07.23 16:08, Julien Grall wrote:
> Hi Juergen,
> 
> On 28/07/2023 14:24, Juergen Gross wrote:
>> On 28.07.23 14:48, Julien Grall wrote:
>>> Hi,
>>>
>>> On 28/07/2023 13:06, Juergen Gross wrote:
>>>> On 28.07.23 13:19, Julien Grall wrote:
>>>>>>>> In case of a runtime check I
>>>>>>>> agree that a more central place would be preferred.
>>>>>>>>
>>>>>>>> In the end I don't mind that much, but
>>>>>>>>
>>>>>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>>>>>               (typeof((struct node_hdr *)NULL->datalen))(-1));
>>>>>>>>
>>>>>>>> is a little bit clumsy IMHO.
>>>>>>>
>>>>>>> Agree. We could introduce FIELD_SIZEOF() (as Linux did) to hide the 
>>>>>>> complexity. The code would then look like:
>>>>>>>
>>>>>>>  >= (8 * FIELD_SIZEOF(struct node_hdr, datalen))
>>>>>>
>>>>>> Oh, I guess you mean sizeof_field().
>>>>>>
>>>>>> And even with that it would look quite clumsy:
>>>>>>
>>>>>>      BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >=
>>>>>>               (1UL << (8 * sizeof_field(struct node_hdr, datalen))));
>>>>>
>>>>> How about keeping the BUILD_BUG_ON() in write_node_raw() and add the 
>>>>> following comment on top of handle_input():
>>>>>
>>>>> Some fields in Xenstored are sized based on the max payload (see various 
>>>>> BUILD_BUG_ON()). This would need extra runtime check if we ever decide to 
>>>>> have a dynamic payload size.
>>>>
>>>> I _could_ do that, but where to stop adding such comments?
>>>
>>> When someone other than the author is able to understand the code without too 
>>> much effort. More comments never hurts, less will in the longer run (see below).
>>
>> I agree with that statement in general, but requesting a comment to aid a
>> future potential change violating the Xenstore wire protocol is a little bit
>> weird.
> 
> Well... This is violating the existing protocol, but it is not set in stone and 
> I think this is acceptable to update it when there is no change for the VMs and 
> for new features (e.g. Live-Update/Live-Migration).

No, I don't think so.

Think of Xenstore living in a stubdom. This means that even dom0 requests have
to go via a ring page, and you are just assuming that the kernel side driver
forwarding the requests from user mode is fine with XENSTORE_PAYLOAD_MAX _not_
being the absolute maximum of data transferred via a single command. I could
perfectly understand that the kernel might have a XENSTORE_PAYLOAD_MAX sized
buffer for the payload, which would be not large enough for your use case.

So violating XENSTORE_PAYLOAD_MAX might be a bad idea with some implementations,
at least in theory.

This means, BTW, that changing XENSTORE_PAYLOAD_MAX is a bad idea, too, as this
would require to sync between all components knowing its value.

>>>> TBH, I really don't see the point doing that.
>>>>
>>>> In case a patch came up upstream trying to violate XENSTORE_PAYLOAD_MAX I would
>>>> surely NACK it.
>>> That's assuming you will still be around when this happens :). I am not 
>>> wishing anything bad but the code will likely outlast any of us.
>>
>> Maybe. But would you really Ack patches adding comments like that in other
>> areas?
> 
> Potentially yes. We had a similar discussion on Arm when allowing paddr_t to be 
> 32-bit.
> 
> [...]
> 
>>>> In case we need payloads larger than XENSTORE_PAYLOAD_MAX we should split the
>>>> related operation in multiple parts (see e.g. XS_DIRECTORY_PART or XS_CONTROL
>>>> for uploading a new kernel to Xenstore-stubdom for live update). Which is, BTW,
>>>> the way AWS should have handled the migration problem (transactions come to my
>>>> mind in this context).
>>>
>>> I wasn't part of the original design, but I can see why it was done like that.
>>
>> I can see why it was done that way, but this doesn't mean I can understand
>> why such a design should be supported by adding comments helping to repeat such
>> a bad decision.
>>
>>> Using multiple commands has also its downside. The first that comes to my 
>>> mind if that you need to keep around the data. But, with your proposal, you 
>>> we wouldn't be able to store it in the database (like for transaction update) 
>>> as datalen can only be 65KB.
>>
>> I wasn't aware that a complete transaction needs to be kept in a single data
>> base record. :-)
> 
> IIUC, you are thinking that the client will restore all the state bits by bits. 
> But if you look at the design in docs/designs/xenstore-migration.md, this is a 
> blob.

Of course it is.

I was never assuming that the state would be applied piecemeal, this has to
happen atomically.

>> It would work perfectly fine to allocate the needed memory via talloc() and to
>> reference it from a special node being part of the transaction, or to not use
>> a node at all (see again the XS_CONTROL example).
> 
> I am not convinced the complexity is worth it here. To be honest, I think the 
> payload limit should have been relaxed for Live-Update as well as you don't gain 
> much to split. That said, this is less a concern because you are not time 
> constrained.
> 
> [...]
> 
>> But maybe that comment was based on wrong assumptions, like the mentioned
>> change not violating the protocol. >
>>> I am happy to rewrite the comment so it doesn't lead to think that you (as 
>>> the maintainer) are open to have a more relax length check.
>>
>> Yes, please make a suggestion for a proper comment not suggesting we are fine
>> to violate the wire protocol.
> 
> Here we go:
> 
> "The payload size is not only currently restricted by the protocol but also the 
> internal implementation (see various BUILD_BUG_ON())."

Hmm, I'm still feeling uneasy to imply that the payload size might be changed.
See above reasoning.

The only way I could imagine this being possible would be a per-ring-page
attribute with both sides agreeing to the max allowed size (the minimum being
today's value).

With that in mind I can hesitantly add the comment, maybe with the addition:
"Any potential change of the maximum payload size needs to be negotiated between
the involved parties."


Juergen
Julien Grall July 28, 2023, 2:59 p.m. UTC | #15
Hi Juergen,

On 28/07/2023 15:32, Juergen Gross wrote:
> On 28.07.23 16:08, Julien Grall wrote: >>>> Using multiple commands has also its downside. The first that comes
>>>> to my mind if that you need to keep around the data. But, with your 
>>>> proposal, you we wouldn't be able to store it in the database (like 
>>>> for transaction update) as datalen can only be 65KB.
>>>
>>> I wasn't aware that a complete transaction needs to be kept in a 
>>> single data
>>> base record. :-)
>>
>> IIUC, you are thinking that the client will restore all the state bits 
>> by bits. But if you look at the design in 
>> docs/designs/xenstore-migration.md, this is a blob.
> 
> Of course it is.
> 
> I was never assuming that the state would be applied piecemeal, this has to
> happen atomically.

I am confused because I don't see how this related to the discussion. 
Above, you mention a transaction, which I interpreted as the client 
would open a transaction and do a bunch of "write note", "set 
permissions"... And then commit the transaction.

I thought this is what you talked about and this would still be 
atomically. My point with the blob is that the parsing of the state is 
done by Xenstored, not the client.

> 
>>> It would work perfectly fine to allocate the needed memory via 
>>> talloc() and to
>>> reference it from a special node being part of the transaction, or to 
>>> not use
>>> a node at all (see again the XS_CONTROL example).
>>
>> I am not convinced the complexity is worth it here. To be honest, I 
>> think the payload limit should have been relaxed for Live-Update as 
>> well as you don't gain much to split. That said, this is less a 
>> concern because you are not time constrained.
>>
>> [...]
>>
>>> But maybe that comment was based on wrong assumptions, like the 
>>> mentioned
>>> change not violating the protocol. >
>>>> I am happy to rewrite the comment so it doesn't lead to think that 
>>>> you (as the maintainer) are open to have a more relax length check.
>>>
>>> Yes, please make a suggestion for a proper comment not suggesting we 
>>> are fine
>>> to violate the wire protocol.
>>
>> Here we go:
>>
>> "The payload size is not only currently restricted by the protocol but 
>> also the internal implementation (see various BUILD_BUG_ON())."
> 
> Hmm, I'm still feeling uneasy to imply that the payload size might be 
> changed.
> See above reasoning.
> 
> The only way I could imagine this being possible would be a per-ring-page
> attribute with both sides agreeing to the max allowed size (the minimum 
> being
> today's value).
> 
> With that in mind I can hesitantly add the comment, maybe with the 
> addition:
> "Any potential change of the maximum payload size needs to be negotiated 
> between
> the involved parties."

I am ok with that.

Cheers,
Jürgen Groß July 28, 2023, 3:08 p.m. UTC | #16
On 28.07.23 16:59, Julien Grall wrote:
> Hi Juergen,
> 
> On 28/07/2023 15:32, Juergen Gross wrote:
>> On 28.07.23 16:08, Julien Grall wrote: >>>> Using multiple commands has also 
>> its downside. The first that comes
>>>>> to my mind if that you need to keep around the data. But, with your 
>>>>> proposal, you we wouldn't be able to store it in the database (like for 
>>>>> transaction update) as datalen can only be 65KB.
>>>>
>>>> I wasn't aware that a complete transaction needs to be kept in a single data
>>>> base record. :-)
>>>
>>> IIUC, you are thinking that the client will restore all the state bits by 
>>> bits. But if you look at the design in docs/designs/xenstore-migration.md, 
>>> this is a blob.
>>
>> Of course it is.
>>
>> I was never assuming that the state would be applied piecemeal, this has to
>> happen atomically.
> 
> I am confused because I don't see how this related to the discussion. Above, you 
> mention a transaction, which I interpreted as the client would open a 
> transaction and do a bunch of "write note", "set permissions"... And then commit 
> the transaction.
> 
> I thought this is what you talked about and this would still be atomically. My 
> point with the blob is that the parsing of the state is done by Xenstored, not 
> the client.

My idea was that the transaction would be used to mark the related sub-commands
to belong to a single migration. This would make cleaning up in case of client
death much simpler.

Not using transactions would be possible, too, of course.

> 
>>
>>>> It would work perfectly fine to allocate the needed memory via talloc() and to
>>>> reference it from a special node being part of the transaction, or to not use
>>>> a node at all (see again the XS_CONTROL example).
>>>
>>> I am not convinced the complexity is worth it here. To be honest, I think the 
>>> payload limit should have been relaxed for Live-Update as well as you don't 
>>> gain much to split. That said, this is less a concern because you are not 
>>> time constrained.
>>>
>>> [...]
>>>
>>>> But maybe that comment was based on wrong assumptions, like the mentioned
>>>> change not violating the protocol. >
>>>>> I am happy to rewrite the comment so it doesn't lead to think that you (as 
>>>>> the maintainer) are open to have a more relax length check.
>>>>
>>>> Yes, please make a suggestion for a proper comment not suggesting we are fine
>>>> to violate the wire protocol.
>>>
>>> Here we go:
>>>
>>> "The payload size is not only currently restricted by the protocol but also 
>>> the internal implementation (see various BUILD_BUG_ON())."
>>
>> Hmm, I'm still feeling uneasy to imply that the payload size might be changed.
>> See above reasoning.
>>
>> The only way I could imagine this being possible would be a per-ring-page
>> attribute with both sides agreeing to the max allowed size (the minimum being
>> today's value).
>>
>> With that in mind I can hesitantly add the comment, maybe with the addition:
>> "Any potential change of the maximum payload size needs to be negotiated between
>> the involved parties."
> 
> I am ok with that.

Okay.


Juergen
diff mbox series

Patch

diff --git a/tools/xenstore/utils.h b/tools/xenstore/utils.h
index 028ecb9d7a..405d662ea2 100644
--- a/tools/xenstore/utils.h
+++ b/tools/xenstore/utils.h
@@ -9,15 +9,6 @@ 
 
 #include "xenstore_lib.h"
 
-/* Header of the node record in tdb. */
-struct xs_tdb_record_hdr {
-	uint64_t generation;
-	uint32_t num_perms;
-	uint32_t datalen;
-	uint32_t childlen;
-	struct xs_permissions perms[0];
-};
-
 /* Is A == B ? */
 #define streq(a,b) (strcmp((a),(b)) == 0)
 
diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index 1f5f118f1c..86b7c9bf36 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -555,9 +555,9 @@  static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout)
 	}
 }
 
-const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
+const struct node_hdr *db_fetch(const char *db_name, size_t *size)
 {
-	struct xs_tdb_record_hdr *hdr;
+	struct node_hdr *hdr;
 
 	hdr = hashtable_search(nodes, db_name);
 	if (!hdr) {
@@ -565,7 +565,7 @@  const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
 		return NULL;
 	}
 
-	*size = sizeof(*hdr) + hdr->num_perms * sizeof(hdr->perms[0]) +
+	*size = sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) +
 		hdr->datalen + hdr->childlen;
 
 	trace_tdb("read %s size %zu\n", db_name, *size + strlen(db_name));
@@ -573,10 +573,15 @@  const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size)
 	return hdr;
 }
 
+static struct xs_permissions *perms_from_node_hdr(const struct node_hdr *hdr)
+{
+	return (struct xs_permissions *)(hdr + 1);
+}
+
 static void get_acc_data(const char *name, struct node_account_data *acc)
 {
 	size_t size;
-	const struct xs_tdb_record_hdr *hdr;
+	const struct node_hdr *hdr;
 
 	if (acc->memory < 0) {
 		hdr = db_fetch(name, &size);
@@ -585,7 +590,7 @@  static void get_acc_data(const char *name, struct node_account_data *acc)
 			acc->memory = 0;
 		} else {
 			acc->memory = size;
-			acc->domid = hdr->perms[0].id;
+			acc->domid = perms_from_node_hdr(hdr)->id;
 		}
 	}
 }
@@ -606,7 +611,7 @@  int db_write(struct connection *conn, const char *db_name, const void *data,
 	     size_t size, struct node_account_data *acc,
 	     enum write_node_mode mode, bool no_quota_check)
 {
-	const struct xs_tdb_record_hdr *hdr = data;
+	const struct node_hdr *hdr = data;
 	struct node_account_data old_acc = {};
 	unsigned int old_domid, new_domid;
 	size_t name_len = strlen(db_name);
@@ -620,7 +625,7 @@  int db_write(struct connection *conn, const char *db_name, const void *data,
 
 	get_acc_data(db_name, &old_acc);
 	old_domid = get_acc_domid(conn, db_name, old_acc.domid);
-	new_domid = get_acc_domid(conn, db_name, hdr->perms[0].id);
+	new_domid = get_acc_domid(conn, db_name, perms_from_node_hdr(hdr)->id);
 
 	/*
 	 * Don't check for ENOENT, as we want to be able to switch orphaned
@@ -661,7 +666,7 @@  int db_write(struct connection *conn, const char *db_name, const void *data,
 
 	if (acc) {
 		/* Don't use new_domid, as it might be a transaction node. */
-		acc->domid = hdr->perms[0].id;
+		acc->domid = perms_from_node_hdr(hdr)->id;
 		acc->memory = size;
 	}
 
@@ -699,7 +704,7 @@  struct node *read_node(struct connection *conn, const void *ctx,
 		       const char *name)
 {
 	size_t size;
-	const struct xs_tdb_record_hdr *hdr;
+	const struct node_hdr *hdr;
 	struct node *node;
 	const char *db_name;
 	int err;
@@ -733,12 +738,12 @@  struct node *read_node(struct connection *conn, const void *ctx,
 	node->perms.num = hdr->num_perms;
 	node->datalen = hdr->datalen;
 	node->childlen = hdr->childlen;
-	node->acc.domid = hdr->perms[0].id;
+	node->acc.domid = perms_from_node_hdr(hdr)->id;
 	node->acc.memory = size;
 
 	/* Copy node data to new memory area, starting with permissions. */
 	size -= sizeof(*hdr);
-	node->perms.p = talloc_memdup(node, hdr->perms, size);
+	node->perms.p = talloc_memdup(node, perms_from_node_hdr(hdr), size);
 	if (node->perms.p == NULL) {
 		errno = ENOMEM;
 		goto error;
@@ -785,7 +790,7 @@  int write_node_raw(struct connection *conn, const char *db_name,
 	void *data;
 	size_t size;
 	void *p;
-	struct xs_tdb_record_hdr *hdr;
+	struct node_hdr *hdr;
 
 	if (domain_adjust_node_perms(node))
 		return errno;
@@ -812,9 +817,9 @@  int write_node_raw(struct connection *conn, const char *db_name,
 	hdr->datalen = node->datalen;
 	hdr->childlen = node->childlen;
 
-	memcpy(hdr->perms, node->perms.p,
-	       node->perms.num * sizeof(*node->perms.p));
-	p = hdr->perms + node->perms.num;
+	p = perms_from_node_hdr(hdr);
+	memcpy(p, node->perms.p, node->perms.num * sizeof(*node->perms.p));
+	p += node->perms.num * sizeof(*node->perms.p);
 	memcpy(p, node->data, node->datalen);
 	p += node->datalen;
 	memcpy(p, node->children, node->childlen);
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
index 6d1578ce97..c965709090 100644
--- a/tools/xenstore/xenstored_core.h
+++ b/tools/xenstore/xenstored_core.h
@@ -168,6 +168,24 @@  struct connection
 };
 extern struct list_head connections;
 
+/*
+ * Header of the node record in the data base.
+ * In the data base the memory of the node is a single memory chunk with the
+ * following format:
+ * struct {
+ *     node_hdr hdr;
+ *     struct xs_permissions perms[hdr.num_perms];
+ *     char data[hdr.datalen];
+ *     char children[hdr.childlen];
+ * };
+ */
+struct node_hdr {
+	uint64_t generation;
+	uint16_t num_perms;
+	uint16_t datalen;
+	uint32_t childlen;
+};
+
 struct node_perms {
 	unsigned int num;
 	struct xs_permissions *p;
@@ -362,7 +380,7 @@  extern xengnttab_handle **xgt_handle;
 int remember_string(struct hashtable *hash, const char *str);
 
 /* Data base access functions. */
-const struct xs_tdb_record_hdr *db_fetch(const char *db_name, size_t *size);
+const struct node_hdr *db_fetch(const char *db_name, size_t *size);
 int db_write(struct connection *conn, const char *db_name, const void *data,
 	     size_t size, struct node_account_data *acc,
 	     enum write_node_mode mode, bool no_quota_check);
diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c
index a90283dcc5..9ca73b9874 100644
--- a/tools/xenstore/xenstored_transaction.c
+++ b/tools/xenstore/xenstored_transaction.c
@@ -357,7 +357,7 @@  static int finalize_transaction(struct connection *conn,
 {
 	struct accessed_node *i, *n;
 	size_t size;
-	const struct xs_tdb_record_hdr *hdr;
+	const struct node_hdr *hdr;
 	uint64_t gen;
 
 	list_for_each_entry_safe(i, n, &trans->accessed, list) {
@@ -394,12 +394,12 @@  static int finalize_transaction(struct connection *conn,
 				 * generation count.
 				 */
 				enum write_node_mode mode;
-				struct xs_tdb_record_hdr *own;
+				struct node_hdr *own;
 
 				talloc_increase_ref_count(hdr);
 				db_delete(conn, i->trans_name, NULL);
 
-				own = (struct xs_tdb_record_hdr *)hdr;
+				own = (struct node_hdr *)hdr;
 				own->generation = ++generation;
 				mode = (i->generation == NO_GENERATION)
 				       ? NODE_CREATE : NODE_MODIFY;