diff mbox series

[4/5] evaluate: check variadic argument types against formatting info

Message ID 20181029153952.13927-5-ben.dooks@codethink.co.uk (mailing list archive)
State Superseded, archived
Headers show
Series [1/5] tokenize: check show_string() for NULL pointer | expand

Commit Message

Ben Dooks Oct. 29, 2018, 3:39 p.m. UTC
The variadic argumnet code did not check any of the variadic arguments
as it did not previously know the possible type. Now we have the possible
formatting information stored in the ctype, we can do some checks on the
printf formatting types.

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
---
Fixes since v1:
- Split out the format-string -> symbol code
- Use symbol_list for the symbols from format parsing
- Changed to follow the new parsing code and ctype use
- Merged the unsigned-int/long types together

Notes:
- Is there a way of better doing the vararg list?
- %p still generates an address-space mismatch
- how do we deal with the kernel's attempt to make printk format all types?
---
 evaluate.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 parse.c    |   2 +-
 2 files changed, 162 insertions(+), 5 deletions(-)

Comments

Luc Van Oostenryck Oct. 29, 2018, 9:19 p.m. UTC | #1
On Mon, Oct 29, 2018 at 03:39:51PM +0000, Ben Dooks wrote:
> The variadic argumnet code did not check any of the variadic arguments
> as it did not previously know the possible type. Now we have the possible
> formatting information stored in the ctype, we can do some checks on the
> printf formatting types.
> 
> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
> ---
> Fixes since v1:
> - Split out the format-string -> symbol code
> - Use symbol_list for the symbols from format parsing
> - Changed to follow the new parsing code and ctype use
> - Merged the unsigned-int/long types together
> 
> Notes:
> - Is there a way of better doing the vararg list?

I'm not totally sure to understand but retireving a
sym from the variadic_types can/should be:
* done with an helper giving the nth element from
  a list (I think I've such an helper somewhere in
  an unposted patch series, I'll look after it).
* more simply done with an iteration:
	PREPARE_PTR_LIST(variadic_types, vtype);
	...
	if (i >= fn->ctype.printf_va_start) {
		target = vtype;
		 NEXT_PTR_LIST(vtype);
	}

> - %p still generates an address-space mismatch
Yes. See my comment in patch 5/5.
I really thing you will need to either special-case it
or more probably store into 'result' some checking function
instead of simply storing a type (see nex point).

> - how do we deal with the kernel's attempt to make printk format all types?
In evaluate_printf_symbol(), instead of having a simple switch
testing 'the' conversion character, I think it will be easier
and much more flexible to use a table which will maps
the conversion string (here only 1-char string but ...)
into a function pointer that will make the check.

The string at entry will allow to use more complex conversion
format (like kernel's "%pI4b", for example).

The checking function will allow to make real type check.
The problem is that as it is now, the checking is done by
compatible_argument_type() by this function is much more
permissive than you wish (for example, a 'long' is compatible
with an 'int' and thus won't give you any warning although
they have a different size (on 64 bit) which is the most
important thing to check in the variadic checks).
Same for "%p" and noderef, like you have discovered.

> ---
>  evaluate.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>  parse.c    |   2 +-
>  2 files changed, 162 insertions(+), 5 deletions(-)
> 
> diff --git a/evaluate.c b/evaluate.c
> index b96696d..2a98a9b 100644
> --- a/evaluate.c
> +++ b/evaluate.c
> @@ -2243,23 +2243,180 @@ static struct symbol *evaluate_alignof(struct expression *expr)
>  	return size_t_ctype;
>  }
>  
> +struct printf_state {
> +	int	len;	/* size of the argument (ie, %d, %ld, %lld, etc.) */
> +};
> +
> +static struct symbol *evaluate_printf_symbol(const char *string, struct printf_state *state)

In sparse 'evaluation' is really 'type evaluation'. I would prefer that
you name this function something like 'parse_printf_conversion/format()'.

> +static int decompose_format_printf(const char *string, struct symbol_list **result)
> +{
> +	struct printf_state state;
> +	int count = 0;
> +
> +	/* TODO - deal with explitic position arguments */
> +	for (; string[0] != '\0'; string++) {
> +		struct symbol *sym;
> +
> +		if (string[0] != '%')
> +			continue;
> +		if (string[1] == '%') {
> +			string++;
> +			continue;
> +		}
> +
> +		state.len = 0;
> +
> +		/* get rid of any formatting width bits */
> +		while (isdigit(string[1]) || string[1] == '+' || string[1] == '-')
> +			string++;
> +
> +		sym = evaluate_printf_symbol(string+1, &state);
> +		if (sym) {
> +			add_symbol(result, sym);
> +			count++;
> +		}
> +
> +		while (string[0] != ' ' && string[0] != '\0')
> +			string++;
> +
> +		string--;
> +	}

Unless you have some further plan for struct printf_state, the variable
'state' is really something local to evaluate_printf_symbol() and
should be moved tere.
Likewise, 'sym' is not really neeed here as you can pass 'result'
directly to evaluate_printf_symbol() and add the symbol there.
And if you then let evaluate_printf_symbol() return the next position
of the string you can simplify part of the string handling here.


> @@ -2281,11 +2438,11 @@ static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>  			sprintf(where, "argument %d", i);
>  			compatible_argument_type(expr, target, p, where);
>  		}
> -
> -		i++;
>  		NEXT_PTR_LIST(argtype);
> +		i++;
>  	} END_FOR_EACH_PTR(expr);
>  	FINISH_PTR_LIST(argtype);
> +
>  	return 1;
>  }

This is not needed, no?
  
> diff --git a/parse.c b/parse.c
> index 9b0d40e..6b0a20b 100644
> --- a/parse.c
> +++ b/parse.c
> @@ -1078,7 +1078,7 @@ static struct token *attribute_format(struct token *token, struct symbol *attr,
>  				fmt_sym = lookup_keyword(token->ident, NS_KEYWORD);
>  
>  			if (!fmt_sym || !fmt_sym->op ||
> -			    fmt_sym->op != &attr_printf_op) {
> +			    fmt_sym->op->type != KW_FORMAT) {
>  				sparse_error(token->pos,
>  					     "unknown format type '%s'\n",
>  					     show_ident(token->ident));

This should be folded with the third patch.

Kind regards,
-- Luc
Ben Dooks Oct. 30, 2018, 9:17 a.m. UTC | #2
On 29/10/18 21:19, Luc Van Oostenryck wrote:
> On Mon, Oct 29, 2018 at 03:39:51PM +0000, Ben Dooks wrote:
>> The variadic argumnet code did not check any of the variadic arguments
>> as it did not previously know the possible type. Now we have the possible
>> formatting information stored in the ctype, we can do some checks on the
>> printf formatting types.
>>
>> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> ---
>> Fixes since v1:
>> - Split out the format-string -> symbol code
>> - Use symbol_list for the symbols from format parsing
>> - Changed to follow the new parsing code and ctype use
>> - Merged the unsigned-int/long types together
>>
>> Notes:
>> - Is there a way of better doing the vararg list?
> 
> I'm not totally sure to understand but retireving a
> sym from the variadic_types can/should be:
> * done with an helper giving the nth element from
>    a list (I think I've such an helper somewhere in
>    an unposted patch series, I'll look after it).
> * more simply done with an iteration:
> 	PREPARE_PTR_LIST(variadic_types, vtype);
> 	...
> 	if (i >= fn->ctype.printf_va_start) {
> 		target = vtype;
> 		 NEXT_PTR_LIST(vtype);
> 	}

I tried the PREPARE_PTR_LIST(variadic_types, vtype); and
the resultant mess of errors was too difficult to untangle

>> - %p still generates an address-space mismatch
> Yes. See my comment in patch 5/5.
> I really thing you will need to either special-case it
> or more probably store into 'result' some checking function
> instead of simply storing a type (see nex point).
> 
>> - how do we deal with the kernel's attempt to make printk format all types?
> In evaluate_printf_symbol(), instead of having a simple switch
> testing 'the' conversion character, I think it will be easier
> and much more flexible to use a table which will maps
> the conversion string (here only 1-char string but ...)
> into a function pointer that will make the check.
> 
> The string at entry will allow to use more complex conversion
> format (like kernel's "%pI4b", for example).
> 
> The checking function will allow to make real type check.
> The problem is that as it is now, the checking is done by
> compatible_argument_type() by this function is much more
> permissive than you wish (for example, a 'long' is compatible
> with an 'int' and thus won't give you any warning although
> they have a different size (on 64 bit) which is the most
> important thing to check in the variadic checks).
> Same for "%p" and noderef, like you have discovered.

I think if we have a de-ref by position for the list code
then we can parse the format string and directly check the
argument list there. It would make dealing with positional
arguments easier too.

>> ---
>>   evaluate.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>   parse.c    |   2 +-
>>   2 files changed, 162 insertions(+), 5 deletions(-)
>>
>> diff --git a/evaluate.c b/evaluate.c
>> index b96696d..2a98a9b 100644
>> --- a/evaluate.c
>> +++ b/evaluate.c
>> @@ -2243,23 +2243,180 @@ static struct symbol *evaluate_alignof(struct expression *expr)
>>   	return size_t_ctype;
>>   }
>>   
>> +struct printf_state {
>> +	int	len;	/* size of the argument (ie, %d, %ld, %lld, etc.) */
>> +};
>> +
>> +static struct symbol *evaluate_printf_symbol(const char *string, struct printf_state *state)
> 
> In sparse 'evaluation' is really 'type evaluation'. I would prefer that
> you name this function something like 'parse_printf_conversion/format()'.

ok, will rename this.

>> +static int decompose_format_printf(const char *string, struct symbol_list **result)
>> +{
>> +	struct printf_state state;
>> +	int count = 0;
>> +
>> +	/* TODO - deal with explitic position arguments */
>> +	for (; string[0] != '\0'; string++) {
>> +		struct symbol *sym;
>> +
>> +		if (string[0] != '%')
>> +			continue;
>> +		if (string[1] == '%') {
>> +			string++;
>> +			continue;
>> +		}
>> +
>> +		state.len = 0;
>> +
>> +		/* get rid of any formatting width bits */
>> +		while (isdigit(string[1]) || string[1] == '+' || string[1] == '-')
>> +			string++;
>> +
>> +		sym = evaluate_printf_symbol(string+1, &state);
>> +		if (sym) {
>> +			add_symbol(result, sym);
>> +			count++;
>> +		}
>> +
>> +		while (string[0] != ' ' && string[0] != '\0')
>> +			string++;
>> +
>> +		string--;
>> +	}
> 
> Unless you have some further plan for struct printf_state, the variable
> 'state' is really something local to evaluate_printf_symbol() and
> should be moved tere.

> Likewise, 'sym' is not really neeed here as you can pass 'result'
> directly to evaluate_printf_symbol() and add the symbol there.
> And if you then let evaluate_printf_symbol() return the next position
> of the string you can simplify part of the string handling here.

I've had to re-work this as i've realised it isn't working as planned 
for %l and %ll :/


I am going to keep count as we should probably add some code to check
the expected argument count with the reality of what was passed to the
fn evaluation code.

>> @@ -2281,11 +2438,11 @@ static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>>   			sprintf(where, "argument %d", i);
>>   			compatible_argument_type(expr, target, p, where);
>>   		}
>> -
>> -		i++;
>>   		NEXT_PTR_LIST(argtype);
>> +		i++;
>>   	} END_FOR_EACH_PTR(expr);
>>   	FINISH_PTR_LIST(argtype);
>> +
>>   	return 1;
>>   }
> 
> This is not needed, no?

Fixed.

>> diff --git a/parse.c b/parse.c
>> index 9b0d40e..6b0a20b 100644
>> --- a/parse.c
>> +++ b/parse.c
>> @@ -1078,7 +1078,7 @@ static struct token *attribute_format(struct token *token, struct symbol *attr,
>>   				fmt_sym = lookup_keyword(token->ident, NS_KEYWORD);
>>   
>>   			if (!fmt_sym || !fmt_sym->op ||
>> -			    fmt_sym->op != &attr_printf_op) {
>> +			    fmt_sym->op->type != KW_FORMAT) {
>>   				sparse_error(token->pos,
>>   					     "unknown format type '%s'\n",
>>   					     show_ident(token->ident));
> 
> This should be folded with the third patch.

Thanks, spotted this and fixed those issues already.
diff mbox series

Patch

diff --git a/evaluate.c b/evaluate.c
index b96696d..2a98a9b 100644
--- a/evaluate.c
+++ b/evaluate.c
@@ -2243,23 +2243,180 @@  static struct symbol *evaluate_alignof(struct expression *expr)
 	return size_t_ctype;
 }
 
+struct printf_state {
+	int	len;	/* size of the argument (ie, %d, %ld, %lld, etc.) */
+};
+
+static struct symbol *evaluate_printf_symbol(const char *string, struct printf_state *state)
+{
+	struct symbol *sym = NULL;
+
+	switch (string[0]) {
+	case 'C':
+		/* TODO - same as lc */
+		break;
+	case 'c':
+		/* TODO - can take l modifier */
+		sym = &char_ctype;
+		break;
+	case 'f':
+	case 'g':
+		sym = &double_ctype;
+		break;
+	case 'h':
+		/* TODO hh */
+		state->len = -1;
+		break;
+	case 'j':	/* ignore intmax/uintmax for the moment */
+		break;
+	case 'L':
+		sym = &ldouble_ctype;
+		break;
+	case 'l':
+		state->len++;
+		break;
+	case 'p':
+		/* TODO - deal with void * not being de-referenced in some cases*/
+		sym = &ptr_ctype_noderef;
+		break;
+	case 'q':
+		state->len = 2;
+		break;
+	case 's':
+		sym = &string_ctype;
+		break;
+	case 'n':
+		/* TODO - actually pointer to integer */
+		sym = &ptr_ctype;
+		break;
+		/* note, d is out of alpha order */
+	case 'd':
+		switch (state->len) {
+		case -1: sym = &short_ctype; break;
+		case 0: sym = &int_ctype; break;
+		case 1: sym = &long_ctype; break;
+		case 2: sym = &llong_ctype; break;
+		case 3: sym = &lllong_ctype; break;
+		}
+		break;
+	case 'u':
+	case 'x':
+	case 'X':
+		switch (state->len) {
+		case -1: sym = &ushort_ctype; break;
+		case 0: sym = &uint_ctype; break;
+		case 1: sym = &ulong_ctype; break;
+		case 2: sym = &ullong_ctype; break;
+		case 3: sym = &ulllong_ctype; break;
+		}
+		break;
+	case 'z':
+	case 'Z':
+		sym = &uint_ctype;	/* TODO */
+		break;
+	}
+
+	return sym;
+}
+
+static int decompose_format_printf(const char *string, struct symbol_list **result)
+{
+	struct printf_state state;
+	int count = 0;
+
+	/* TODO - deal with explitic position arguments */
+	for (; string[0] != '\0'; string++) {
+		struct symbol *sym;
+
+		if (string[0] != '%')
+			continue;
+		if (string[1] == '%') {
+			string++;
+			continue;
+		}
+
+		state.len = 0;
+
+		/* get rid of any formatting width bits */
+		while (isdigit(string[1]) || string[1] == '+' || string[1] == '-')
+			string++;
+
+		sym = evaluate_printf_symbol(string+1, &state);
+		if (sym) {
+			add_symbol(result, sym);
+			count++;
+		}
+
+		while (string[0] != ' ' && string[0] != '\0')
+			string++;
+
+		string--;
+	}
+
+	return count;
+}
+
+
+static int evaluate_format_printf(struct symbol *fn, struct expression *expr, struct symbol_list **result)
+{
+	const char *fmt_string = NULL;
+	int count = -1;
+
+	if (!expr)
+		return -1;
+	if (expr->string && expr->string->length)
+		fmt_string = expr->string->data;
+	if (!fmt_string) {
+		struct symbol *sym = evaluate_expression(expr);
+
+		/* attempt to find initialiser for this */
+		if (sym && sym->initializer && sym->initializer->string)
+			fmt_string = sym->initializer->string->data;
+	}
+
+	if (fmt_string)
+		count = decompose_format_printf(fmt_string, result);
+	return count;
+}
+
 static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
 {
 	struct expression *expr;
 	struct symbol_list *argument_types = fn->arguments;
+	struct symbol_list *variadic_types = NULL;
 	struct symbol *argtype;
 	int i = 1;
 
+	/* if we have variadic type info, copy the original arguments
+	 * first so that the format parsing can modify this local set */
+
 	PREPARE_PTR_LIST(argument_types, argtype);
 	FOR_EACH_PTR (head, expr) {
 		struct expression **p = THIS_ADDRESS(expr);
-		struct symbol *ctype, *target;
+		struct symbol *ctype, *target = NULL;
 		ctype = evaluate_expression(expr);
 
 		if (!ctype)
 			return 0;
 
-		target = argtype;
+		if (i == fn->ctype.printf_msg) {
+			int ret = evaluate_format_printf(fn, *p, &variadic_types);
+			if (ret < 0)
+				warning((*p)->pos, "cannot parse format");
+		}
+
+		if (i >= fn->ctype.printf_va_start) {
+			struct symbol *sym;
+			int arg = i - fn->ctype.printf_va_start;
+
+			FOR_EACH_PTR(variadic_types, sym) {
+				if (arg == 0)
+					target = sym;
+				arg--;
+			} END_FOR_EACH_PTR(sym);
+		} else {
+			target = argtype;
+		}
 		if (!target) {
 			struct symbol *type;
 			int class = classify_type(ctype, &type);
@@ -2281,11 +2438,11 @@  static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
 			sprintf(where, "argument %d", i);
 			compatible_argument_type(expr, target, p, where);
 		}
-
-		i++;
 		NEXT_PTR_LIST(argtype);
+		i++;
 	} END_FOR_EACH_PTR(expr);
 	FINISH_PTR_LIST(argtype);
+
 	return 1;
 }
 
diff --git a/parse.c b/parse.c
index 9b0d40e..6b0a20b 100644
--- a/parse.c
+++ b/parse.c
@@ -1078,7 +1078,7 @@  static struct token *attribute_format(struct token *token, struct symbol *attr,
 				fmt_sym = lookup_keyword(token->ident, NS_KEYWORD);
 
 			if (!fmt_sym || !fmt_sym->op ||
-			    fmt_sym->op != &attr_printf_op) {
+			    fmt_sym->op->type != KW_FORMAT) {
 				sparse_error(token->pos,
 					     "unknown format type '%s'\n",
 					     show_ident(token->ident));