[2/5] evaluate: check variadic argument types against formatting info
diff mbox series

Message ID 20181101181117.3877-3-ben.dooks@codethink.co.uk
State Under Review
Headers show
Series
  • [1/5] ptrlist: add ptr_list_nth_entry()
Related show

Commit Message

Ben Dooks Nov. 1, 2018, 6:11 p.m. UTC
The variadic argumnet code did not check any of the variadic arguments
as it did not previously know the possible type. Now we have the possible
formatting information stored in the ctype, we can do some checks on the
printf formatting types.

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
---
Fixes since v1:
- Split out the format-string -> symbol code
- Use symbol_list for the symbols from format parsing
- Changed to follow the new parsing code and ctype use
- Merged the unsigned-int/long types together

Fixes since v2:
- Check for printf_va_start before checking variadic-list
- Tidy the type code and fix a couple of bugs with %l and %ll
- Fix function names in working through printf arguments.
- Tidy documentation

Fixes since v3:
- Added positional arguments
- Also added precision and width specifiers

Notes:
- Awaiting new pointer-list accessing code
- %p still generates an address-space mismatch
- how do we deal with the kernel's attempt to make printk format all types?

expr: updated handling for formats
---
 evaluate.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)

Comments

Luc Van Oostenryck Nov. 1, 2018, 11:50 p.m. UTC | #1
On Thu, Nov 01, 2018 at 06:11:14PM +0000, Ben Dooks wrote:
> The variadic argumnet code did not check any of the variadic arguments
> as it did not previously know the possible type. Now we have the possible
> formatting information stored in the ctype, we can do some checks on the
> printf formatting types.
> 
> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
> ---
> Fixes since v1:
> - Split out the format-string -> symbol code
> - Use symbol_list for the symbols from format parsing
> - Changed to follow the new parsing code and ctype use
> - Merged the unsigned-int/long types together
> 
> Fixes since v2:
> - Check for printf_va_start before checking variadic-list
> - Tidy the type code and fix a couple of bugs with %l and %ll
> - Fix function names in working through printf arguments.
> - Tidy documentation
> 
> Fixes since v3:
> - Added positional arguments
> - Also added precision and width specifiers
> 
> Notes:
> - Awaiting new pointer-list accessing code
> - %p still generates an address-space mismatch
> - how do we deal with the kernel's attempt to make printk format all types?
> 
> expr: updated handling for formats
> ---
>  evaluate.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 297 insertions(+)
> 
> diff --git a/evaluate.c b/evaluate.c
> index b96696d..ffb4af6 100644
> --- a/evaluate.c
> +++ b/evaluate.c
> @@ -2243,6 +2243,297 @@ static struct symbol *evaluate_alignof(struct expression *expr)
>  	return size_t_ctype;
>  }
>  
> +struct format_type {
> +	const char	*format;
> +	int		(*test)(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff);
> +	void		*data;

Does this need to be a void pointer? Isn't it always a 'struct symbol *'?

> +};
> +
> +struct format_state {
> +	struct expression	*expr;
> +	unsigned int		va_start;
> +	unsigned int		fmt_index;
> +	unsigned int		arg_index;
> +	unsigned int		used_position: 1;
> +};
> +
> +static int printf_fmt_inttype(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	struct symbol *type = data;
> +	*target = type;
> +	return ctype == type;

This will be fine as a first step but it won't work if thz argument
has some modifiers. It will be best, I think, to use something like
is_int_type() and type_difference.

> +}
> +
> +// todo - comparing pointers isn't easy...
> +static int printf_fmt_string(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	*target = &string_ctype;

Same as above for integers. For example what if the arg
is a 'const char *'?

> +	return check_assignment_types(*target, expr, typediff);
> +}
> +
> +static int printf_fmt_pointer(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	*target = &ptr_ctype;
> +	// todo - deal with not caring about address-space //

What about:
	if (is_ptr_type(ctype))
		...

> +static struct format_type *parse_printf_get_fmt(char *msg, char **msgout)
> +{
> +	struct format_type *type;
> +	char *end;
> +	int len;
> +
> +	for (end = msg; *end > ' '; end++);
I don't understand this line. Please explain.

> +	*msgout = end+1;
> +
> +	len = (end - msg);
> +	for (type = printf_fmts; type->format != NULL; type++)
> +		if (!strncmp(msg, type->format, len))
> +			return type;
> +
> +	return NULL;
> +}
> +
> +static int is_printf_flag(char ch)
> +{
> +	return ch == '0' || ch == '+' || ch == '-' || ch == ' ' || ch == '#';
> +}
> +
> +static int printf_check_position(char **fmt)
> +{
> +	char *ptr= *fmt;
> +
> +	while (isdigit(*ptr))
> +		ptr++;
> +	if (*ptr == '$') {
> +		char *pos = *fmt;
> +		*fmt = ptr+1;
> +		return atoi(pos);

Use strtol(), please. Also (and related) "%$d" is invalid (or is
this checked by parse_format_printf_checkpos() ?).

> +	}
> +	return -1;
> +}
> +
> +static void parse_format_printf_checkpos(struct format_state *state, const char *which)
> +{
> +	if (state->used_position)
> +		warning(state->expr->pos,
> +			"format %d: %s: no position specified",
> +			state->arg_index-1, which);
> +}
> +
> +static int parse_format_printf_argfield(char **fmtptr, struct format_state *state, struct expression_list *head, int *pos, const char *which)
> +{
> +	struct expression *expr;
> +	char *fmt = *fmtptr;
> +	int argpos = -1;
> +
> +	/* check for simple digit-string width/precision specifier first */
> +	if (*fmt != '*') {
> +		while (isdigit(*fmt))
> +			fmt++;
> +		*fmtptr = fmt;
> +		return 0;
> +	}
> +
> +	fmt++;
> +	argpos = printf_check_position(&fmt);
> +
> +	if (argpos > 0) {
> +		argpos += state->va_start - 1;
> +		state->used_position = 1;
> +	} else {
> +		argpos = (*pos)++;
> +		state->arg_index++;
> +		parse_format_printf_checkpos(state, which);
> +	}
> +
> +	*fmtptr = fmt;
> +	expr = get_expression_n(head, argpos-1);
> +	if (!expr) {
> +		warning(state->expr->pos, "%s: no argument at position %d", which, argpos);
> +		return 1;
> +	}
> +
> +	/* todo - verify we got an actual integer argument */
> +	return 0;
> +}
> +
> +/* printf format parsing code
> + *
> + * TODO:
> + * - fix up type checking of castable types (such as int vs long vs long long)
> + * - validate all arguments specified are also used...
> + * - possibly remove return code from this function
> + */
> +static int parse_format_printf(const char **fmtstring, struct format_state *state, struct expression_list *head)
> +{
> +	char buff[strlen(*fmtstring)+1];
> +	struct format_type *type;
> +	struct expression *expr;
> +	const char *string = *fmtstring;
> +	char *fmtpost = NULL, *fmt = buff;
> +	int pos = state->arg_index;
> +	int error = 0;
> +	int ret;
> +
> +	/* trivial check for %% */
> +	string++;
> +	if (string[0] == '%') {
> +		*fmtstring = string;
> +		return 0;
> +	}
> +
> +	state->arg_index++;
> +	state->fmt_index++;
> +
> +	strcpy(buff, string);
Is a copy really needed?

> +	ret = printf_check_position(&fmt);
> +	if (ret == 0) {
> +		/* we got an invalid position argument */
> +		error++;
> +	} else if (ret < 0) {
> +		parse_format_printf_checkpos(state, "position");
> +	} else {
> +		state->used_position = 1;
> +		pos = ret + state->va_start - 1;
> +	}
> +
> +	/* get rid of any formatting flag bits */
> +	while (is_printf_flag(*fmt))
> +		fmt++;
> +
> +	/* now there is the posibility of a width specifier */
> +	if (parse_format_printf_argfield(&fmt, state, head, &pos, "width"))
> +		error++;
> +
> +	/* now we might have the precision specifier */
> +	if (*fmt == '.') {
> +		fmt++;
> +		if (parse_format_printf_argfield(&fmt, state, head, &pos, "position"))
> +			error++;
> +	}
> +
> +	type = parse_printf_get_fmt(fmt, &fmtpost);
> +	*fmtstring += fmtpost - buff;
> +	fmtpost[-1] = '\0';
> +
> +	if (!type && fmt[0] == 'p')
> +		type = &printf_fmt_ptr_ref;	/* probably some extension */
> +
> +	if (type) {
> +		struct symbol *ctype, *source, *target = NULL;
> +		const char *typediff = "different types";
> +		int ret;
> +
> +		expr = get_expression_n(head, pos-1);
> +		if (!expr)
> +			return -2;
> +
> +		ctype = evaluate_expression(expr);
> +		if (!ctype)
> +			return -3;
> +
> +		source = degenerate(expr);
> +		ret = (type->test)(type->data, &expr, ctype, &target, &typediff);
> +
> +		if (ret == 0) {
> +			warning(expr->pos, "incorrect type in argument %d (%s)", pos, typediff);
> +			info(expr->pos, "   expected %s", show_typename(target));
> +			info(expr->pos, "   got %s", show_typename(source));
> +		}
> +	} else {
> +		warning(expr->pos, "cannot evaluate type '%s'", buff);
> +		return -1;
> +	}
> +
> +	return 1;
> +}
> +
> +/* attempt to run through a printf format string and work out the types
> + * it specifies. The format is parsed from the __attribute__(format())
> + * in the parser code which stores the positions of the message and arg
> + * start in the ctype.
> + */
> +static void evaluate_format_printf(struct symbol *fn, struct expression_list *head)

Rename 'head' to something like 'args'.

> +{
> +	struct format_state state = { };
> +	struct expression *expr;
> +	const char *fmt_string = NULL;
> +
> +	expr = get_expression_n(head, fn->ctype.printf_msg-1);
> +	if (!expr)
> +		return;
> +	if (expr->string && expr->string->length)
You must first check if (expr->type == EXPR_STRING)

> +		fmt_string = expr->string->data;
> +	if (!fmt_string) {
> +		struct symbol *sym = evaluate_expression(expr);
> +
> +		/* attempt to find initialiser for this */
> +		if (sym && sym->initializer && sym->initializer->string)
> +			fmt_string = sym->initializer->string->data;
Same here.
But don't want only check literal strings, so this last part
is not needed/wrong.

> +	}
> +
> +	state.expr = expr;
> +	state.va_start = fn->ctype.printf_va_start;
> +	state.arg_index = fn->ctype.printf_va_start;
> +
> +	if (!fmt_string) {
> +		warning(expr->pos, "not a format string?");
> +	} else {
> +		const char *string = fmt_string;
> +
> +		for (; string[0] != '\0'; string++) {
> +			if (string[0] != '%')
> +				continue;
> +			parse_format_printf(&string, &state, head);

I tend to think that this would make parse_format_printf() simpler:
			string = parse_format_printf(string, &state, head);
but it's not important.

> +		}
> +	}
> +}
> +
>  static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>  {
>  	struct expression *expr;
> @@ -2250,6 +2541,12 @@ static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>  	struct symbol *argtype;
>  	int i = 1;
>  
> +	/* do this first, otherwise the arugment info may get lost or changed
> +	 * later on in the evaluation loop.
> +	 */
Please, add an explanation (which info can be lost or changed and how).
I would much much prefer to make the format checks *after* the normal
evaluation loop; This would allow to have the args already evaluated
(in the sparse usual sense) with their corresponding warnings (for the
non-variadic part) coming before the format warnings.

> +	if (fn->ctype.printf_va_start)
> +		evaluate_format_printf(fn, head);
> +
>  	PREPARE_PTR_LIST(argument_types, argtype);
>  	FOR_EACH_PTR (head, expr) {
>  		struct expression **p = THIS_ADDRESS(expr);

Kind regards,
-- Luc
Ben Dooks Nov. 2, 2018, 4:43 p.m. UTC | #2
On 01/11/18 23:50, Luc Van Oostenryck wrote:
> On Thu, Nov 01, 2018 at 06:11:14PM +0000, Ben Dooks wrote:
>> The variadic argumnet code did not check any of the variadic arguments
>> as it did not previously know the possible type. Now we have the possible
>> formatting information stored in the ctype, we can do some checks on the
>> printf formatting types.
>>
>> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> ---
>> Fixes since v1:
>> - Split out the format-string -> symbol code
>> - Use symbol_list for the symbols from format parsing
>> - Changed to follow the new parsing code and ctype use
>> - Merged the unsigned-int/long types together
>>
>> Fixes since v2:
>> - Check for printf_va_start before checking variadic-list
>> - Tidy the type code and fix a couple of bugs with %l and %ll
>> - Fix function names in working through printf arguments.
>> - Tidy documentation
>>
>> Fixes since v3:
>> - Added positional arguments
>> - Also added precision and width specifiers
>>
>> Notes:
>> - Awaiting new pointer-list accessing code
>> - %p still generates an address-space mismatch
>> - how do we deal with the kernel's attempt to make printk format all types?
>>
>> expr: updated handling for formats
>> ---
>>   evaluate.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 297 insertions(+)
>>
>> diff --git a/evaluate.c b/evaluate.c
>> index b96696d..ffb4af6 100644
>> --- a/evaluate.c
>> +++ b/evaluate.c
>> @@ -2243,6 +2243,297 @@ static struct symbol *evaluate_alignof(struct expression *expr)
>>   	return size_t_ctype;
>>   }
>>   
>> +struct format_type {
>> +	const char	*format;
>> +	int		(*test)(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff);
>> +	void		*data;
> 
> Does this need to be a void pointer? Isn't it always a 'struct symbol *'?

I'm going to pass the "struct format_type" for now, in case any other 
info is needed.


>> +};
>> +
>> +struct format_state {
>> +	struct expression	*expr;
>> +	unsigned int		va_start;
>> +	unsigned int		fmt_index;
>> +	unsigned int		arg_index;
>> +	unsigned int		used_position: 1;
>> +};
>> +
>> +static int printf_fmt_inttype(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	struct symbol *type = data;
>> +	*target = type;
>> +	return ctype == type;
> 
> This will be fine as a first step but it won't work if thz argument
> has some modifiers. It will be best, I think, to use something like
> is_int_type() and type_difference.

Ok, I will have a look at this once all the othe changes are done

>> +}
>> +
>> +// todo - comparing pointers isn't easy...
>> +static int printf_fmt_string(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	*target = &string_ctype;
> 
> Same as above for integers. For example what if the arg
> is a 'const char *'?
> 
>> +	return check_assignment_types(*target, expr, typediff);
>> +}
>> +
>> +static int printf_fmt_pointer(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	*target = &ptr_ctype;
>> +	// todo - deal with not caring about address-space //
> 
> What about:
> 	if (is_ptr_type(ctype))
> 		...
> 
>> +static struct format_type *parse_printf_get_fmt(char *msg, char **msgout)
>> +{
>> +	struct format_type *type;
>> +	char *end;
>> +	int len;
>> +
>> +	for (end = msg; *end > ' '; end++);
> I don't understand this line. Please explain.

looks for the end of the format.
> 
>> +	*msgout = end+1;
>> +
>> +	len = (end - msg);
>> +	for (type = printf_fmts; type->format != NULL; type++)
>> +		if (!strncmp(msg, type->format, len))
>> +			return type;
>> +
>> +	return NULL;
>> +}
>> +
>> +static int is_printf_flag(char ch)
>> +{
>> +	return ch == '0' || ch == '+' || ch == '-' || ch == ' ' || ch == '#';
>> +}
>> +
>> +static int printf_check_position(char **fmt)
>> +{
>> +	char *ptr= *fmt;
>> +
>> +	while (isdigit(*ptr))
>> +		ptr++;
>> +	if (*ptr == '$') {
>> +		char *pos = *fmt;
>> +		*fmt = ptr+1;
>> +		return atoi(pos);
> 
> Use strtol(), please. Also (and related) "%$d" is invalid (or is
> this checked by parse_format_printf_checkpos() ?).

I'll add an isdigit(*ptr) to the start

>> +	}
>> +	return -1;
>> +}
>> +
>> +static void parse_format_printf_checkpos(struct format_state *state, const char *which)
>> +{
>> +	if (state->used_position)
>> +		warning(state->expr->pos,
>> +			"format %d: %s: no position specified",
>> +			state->arg_index-1, which);
>> +}
>> +
>> +static int parse_format_printf_argfield(char **fmtptr, struct format_state *state, struct expression_list *head, int *pos, const char *which)
>> +{
>> +	struct expression *expr;
>> +	char *fmt = *fmtptr;
>> +	int argpos = -1;
>> +
>> +	/* check for simple digit-string width/precision specifier first */
>> +	if (*fmt != '*') {
>> +		while (isdigit(*fmt))
>> +			fmt++;
>> +		*fmtptr = fmt;
>> +		return 0;
>> +	}
>> +
>> +	fmt++;
>> +	argpos = printf_check_position(&fmt);
>> +
>> +	if (argpos > 0) {
>> +		argpos += state->va_start - 1;
>> +		state->used_position = 1;
>> +	} else {
>> +		argpos = (*pos)++;
>> +		state->arg_index++;
>> +		parse_format_printf_checkpos(state, which);
>> +	}
>> +
>> +	*fmtptr = fmt;
>> +	expr = get_expression_n(head, argpos-1);
>> +	if (!expr) {
>> +		warning(state->expr->pos, "%s: no argument at position %d", which, argpos);
>> +		return 1;
>> +	}
>> +
>> +	/* todo - verify we got an actual integer argument */
>> +	return 0;
>> +}
>> +
>> +/* printf format parsing code
>> + *
>> + * TODO:
>> + * - fix up type checking of castable types (such as int vs long vs long long)
>> + * - validate all arguments specified are also used...
>> + * - possibly remove return code from this function
>> + */
>> +static int parse_format_printf(const char **fmtstring, struct format_state *state, struct expression_list *head)
>> +{
>> +	char buff[strlen(*fmtstring)+1];
>> +	struct format_type *type;
>> +	struct expression *expr;
>> +	const char *string = *fmtstring;
>> +	char *fmtpost = NULL, *fmt = buff;
>> +	int pos = state->arg_index;
>> +	int error = 0;
>> +	int ret;
>> +
>> +	/* trivial check for %% */
>> +	string++;
>> +	if (string[0] == '%') {
>> +		*fmtstring = string;
>> +		return 0;
>> +	}
>> +
>> +	state->arg_index++;
>> +	state->fmt_index++;
>> +
>> +	strcpy(buff, string);
> Is a copy really needed?

It made it easier to then modify the string and dump it when debugging.

Now i've removed the other bits that did modify it I think the code 
should work without the need for a writable copy.

>> +	ret = printf_check_position(&fmt);
>> +	if (ret == 0) {
>> +		/* we got an invalid position argument */
>> +		error++;
>> +	} else if (ret < 0) {
>> +		parse_format_printf_checkpos(state, "position");
>> +	} else {
>> +		state->used_position = 1;
>> +		pos = ret + state->va_start - 1;
>> +	}
>> +
>> +	/* get rid of any formatting flag bits */
>> +	while (is_printf_flag(*fmt))
>> +		fmt++;
>> +
>> +	/* now there is the posibility of a width specifier */
>> +	if (parse_format_printf_argfield(&fmt, state, head, &pos, "width"))
>> +		error++;
>> +
>> +	/* now we might have the precision specifier */
>> +	if (*fmt == '.') {
>> +		fmt++;
>> +		if (parse_format_printf_argfield(&fmt, state, head, &pos, "position"))
>> +			error++;
>> +	}
>> +
>> +	type = parse_printf_get_fmt(fmt, &fmtpost);
>> +	*fmtstring += fmtpost - buff;
>> +	fmtpost[-1] = '\0';
>> +
>> +	if (!type && fmt[0] == 'p')
>> +		type = &printf_fmt_ptr_ref;	/* probably some extension */
>> +
>> +	if (type) {
>> +		struct symbol *ctype, *source, *target = NULL;
>> +		const char *typediff = "different types";
>> +		int ret;
>> +
>> +		expr = get_expression_n(head, pos-1);
>> +		if (!expr)
>> +			return -2;
>> +
>> +		ctype = evaluate_expression(expr);
>> +		if (!ctype)
>> +			return -3;
>> +
>> +		source = degenerate(expr);
>> +		ret = (type->test)(type->data, &expr, ctype, &target, &typediff);
>> +
>> +		if (ret == 0) {
>> +			warning(expr->pos, "incorrect type in argument %d (%s)", pos, typediff);
>> +			info(expr->pos, "   expected %s", show_typename(target));
>> +			info(expr->pos, "   got %s", show_typename(source));
>> +		}
>> +	} else {
>> +		warning(expr->pos, "cannot evaluate type '%s'", buff);
>> +		return -1;
>> +	}
>> +
>> +	return 1;
>> +}
>> +
>> +/* attempt to run through a printf format string and work out the types
>> + * it specifies. The format is parsed from the __attribute__(format())
>> + * in the parser code which stores the positions of the message and arg
>> + * start in the ctype.
>> + */
>> +static void evaluate_format_printf(struct symbol *fn, struct expression_list *head)
> 
> Rename 'head' to something like 'args'.

Ok, good idea.

> 
>> +{
>> +	struct format_state state = { };
>> +	struct expression *expr;
>> +	const char *fmt_string = NULL;
>> +
>> +	expr = get_expression_n(head, fn->ctype.printf_msg-1);
>> +	if (!expr)
>> +		return;
>> +	if (expr->string && expr->string->length)
> You must first check if (expr->type == EXPR_STRING)
> 
>> +		fmt_string = expr->string->data;
>> +	if (!fmt_string) {
>> +		struct symbol *sym = evaluate_expression(expr);
>> +
>> +		/* attempt to find initialiser for this */
>> +		if (sym && sym->initializer && sym->initializer->string)
>> +			fmt_string = sym->initializer->string->data;
> Same here.
> But don't want only check literal strings, so this last part
> is not needed/wrong.

{
	char __msg[] = "%s %s";
	printf(msg, "hello", "world");
}

the above is valid, so thought it should be included in the check.

> 
>> +	}
>> +
>> +	state.expr = expr;
>> +	state.va_start = fn->ctype.printf_va_start;
>> +	state.arg_index = fn->ctype.printf_va_start;
>> +
>> +	if (!fmt_string) {
>> +		warning(expr->pos, "not a format string?");
>> +	} else {
>> +		const char *string = fmt_string;
>> +
>> +		for (; string[0] != '\0'; string++) {
>> +			if (string[0] != '%')
>> +				continue;
>> +			parse_format_printf(&string, &state, head);
> 
> I tend to think that this would make parse_format_printf() simpler:
> 			string = parse_format_printf(string, &state, head);
> but it's not important.

I'm still wondering if there should be an overall error check or not.
I might remove this in the final verison.

>> +		}
>> +	}
>> +}
>> +
>>   static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>>   {
>>   	struct expression *expr;
>> @@ -2250,6 +2541,12 @@ static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>>   	struct symbol *argtype;
>>   	int i = 1;
>>   
>> +	/* do this first, otherwise the arugment info may get lost or changed
>> +	 * later on in the evaluation loop.
>> +	 */
> Please, add an explanation (which info can be lost or changed and how).
> I would much much prefer to make the format checks *after* the normal
> evaluation loop; This would allow to have the args already evaluated
> (in the sparse usual sense) with their corresponding warnings (for the
> non-variadic part) coming before the format warnings.

The struct expressions seem to get turned into NULLs... it was an
annoyance. I can re-check with the new code.

>> +	if (fn->ctype.printf_va_start)
>> +		evaluate_format_printf(fn, head);
>> +
>>   	PREPARE_PTR_LIST(argument_types, argtype);
>>   	FOR_EACH_PTR (head, expr) {
>>   		struct expression **p = THIS_ADDRESS(expr);
> 
> Kind regards,
> -- Luc
>

Patch
diff mbox series

diff --git a/evaluate.c b/evaluate.c
index b96696d..ffb4af6 100644
--- a/evaluate.c
+++ b/evaluate.c
@@ -2243,6 +2243,297 @@  static struct symbol *evaluate_alignof(struct expression *expr)
 	return size_t_ctype;
 }
 
+struct format_type {
+	const char	*format;
+	int		(*test)(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff);
+	void		*data;
+};
+
+struct format_state {
+	struct expression	*expr;
+	unsigned int		va_start;
+	unsigned int		fmt_index;
+	unsigned int		arg_index;
+	unsigned int		used_position: 1;
+};
+
+static int printf_fmt_inttype(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	struct symbol *type = data;
+	*target = type;
+	return ctype == type;
+}
+
+// todo - comparing pointers isn't easy...
+static int printf_fmt_string(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	*target = &string_ctype;
+	return check_assignment_types(*target, expr, typediff);
+}
+
+static int printf_fmt_pointer(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	*target = &ptr_ctype;
+	// todo - deal with not caring about address-space //
+	return check_assignment_types(*target, expr, typediff);
+}
+
+static int printf_fmt_print_pointer(void *data, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	*target = &ptr_ctype;
+	return check_assignment_types(*target, expr, typediff);
+}
+
+struct format_type printf_fmts[] = {
+	{ "c",	.test = printf_fmt_inttype, &char_ctype },
+	// todo lc (wchar_t)
+	// todo h/hh (char types)
+	{ "f",	.test = printf_fmt_inttype, &double_ctype },
+	{ "g",	.test = printf_fmt_inttype, &double_ctype },
+	// todo j (intmax/unitmax)
+	{ "s",	.test = printf_fmt_string, },
+	// todo n (pointer to integer)
+	{ "p",	.test = printf_fmt_print_pointer, },
+	{ "px", .test = printf_fmt_print_pointer, },
+	// todo q (modifier?)
+	{ "d",	.test = printf_fmt_inttype, &int_ctype },
+	{ "ld", .test = printf_fmt_inttype, &long_ctype },
+	{ "lld", .test = printf_fmt_inttype, &llong_ctype },
+	{ "u",	.test = printf_fmt_inttype, &uint_ctype },
+	{ "lu", .test = printf_fmt_inttype, &ulong_ctype },
+	{ "llu", .test = printf_fmt_inttype, &ullong_ctype },
+	{ "L",	.test = printf_fmt_inttype, &ldouble_ctype },
+	{ "x",	.test = printf_fmt_inttype, &uint_ctype },
+	{ "lx", .test = printf_fmt_inttype, &ulong_ctype },
+	{ "llx", .test = printf_fmt_inttype, &ullong_ctype },
+	{ "X",	.test = printf_fmt_inttype, &int_ctype },
+	{ "lX", .test = printf_fmt_inttype, &ulong_ctype },
+	{ "llX", .test = printf_fmt_inttype, &ullong_ctype },
+	{ "L", .test = printf_fmt_inttype, &ldouble_ctype },
+	// todo z/Z sizeof ?
+	{ },
+};
+
+static struct format_type printf_fmt_ptr_ref = { "p", .test = printf_fmt_pointer, };
+
+static struct expression *get_expression_n(struct expression_list *head, int nr)
+{
+	return ptr_list_nth_entry((struct ptr_list *)head, nr);
+}
+
+static struct format_type *parse_printf_get_fmt(char *msg, char **msgout)
+{
+	struct format_type *type;
+	char *end;
+	int len;
+
+	for (end = msg; *end > ' '; end++);
+	*msgout = end+1;
+
+	len = (end - msg);
+	for (type = printf_fmts; type->format != NULL; type++)
+		if (!strncmp(msg, type->format, len))
+			return type;
+
+	return NULL;
+}
+
+static int is_printf_flag(char ch)
+{
+	return ch == '0' || ch == '+' || ch == '-' || ch == ' ' || ch == '#';
+}
+
+static int printf_check_position(char **fmt)
+{
+	char *ptr= *fmt;
+
+	while (isdigit(*ptr))
+		ptr++;
+	if (*ptr == '$') {
+		char *pos = *fmt;
+		*fmt = ptr+1;
+		return atoi(pos);
+	}
+	return -1;
+}
+
+static void parse_format_printf_checkpos(struct format_state *state, const char *which)
+{
+	if (state->used_position)
+		warning(state->expr->pos,
+			"format %d: %s: no position specified",
+			state->arg_index-1, which);
+}
+
+static int parse_format_printf_argfield(char **fmtptr, struct format_state *state, struct expression_list *head, int *pos, const char *which)
+{
+	struct expression *expr;
+	char *fmt = *fmtptr;
+	int argpos = -1;
+
+	/* check for simple digit-string width/precision specifier first */
+	if (*fmt != '*') {
+		while (isdigit(*fmt))
+			fmt++;
+		*fmtptr = fmt;
+		return 0;
+	}
+
+	fmt++;
+	argpos = printf_check_position(&fmt);
+
+	if (argpos > 0) {
+		argpos += state->va_start - 1;
+		state->used_position = 1;
+	} else {
+		argpos = (*pos)++;
+		state->arg_index++;
+		parse_format_printf_checkpos(state, which);
+	}
+
+	*fmtptr = fmt;
+	expr = get_expression_n(head, argpos-1);
+	if (!expr) {
+		warning(state->expr->pos, "%s: no argument at position %d", which, argpos);
+		return 1;
+	}
+
+	/* todo - verify we got an actual integer argument */
+	return 0;
+}
+
+/* printf format parsing code
+ *
+ * TODO:
+ * - fix up type checking of castable types (such as int vs long vs long long)
+ * - validate all arguments specified are also used...
+ * - possibly remove return code from this function
+ */
+static int parse_format_printf(const char **fmtstring, struct format_state *state, struct expression_list *head)
+{
+	char buff[strlen(*fmtstring)+1];
+	struct format_type *type;
+	struct expression *expr;
+	const char *string = *fmtstring;
+	char *fmtpost = NULL, *fmt = buff;
+	int pos = state->arg_index;
+	int error = 0;
+	int ret;
+
+	/* trivial check for %% */
+	string++;
+	if (string[0] == '%') {
+		*fmtstring = string;
+		return 0;
+	}
+
+	state->arg_index++;
+	state->fmt_index++;
+
+	strcpy(buff, string);
+	ret = printf_check_position(&fmt);
+	if (ret == 0) {
+		/* we got an invalid position argument */
+		error++;
+	} else if (ret < 0) {
+		parse_format_printf_checkpos(state, "position");
+	} else {
+		state->used_position = 1;
+		pos = ret + state->va_start - 1;
+	}
+
+	/* get rid of any formatting flag bits */
+	while (is_printf_flag(*fmt))
+		fmt++;
+
+	/* now there is the posibility of a width specifier */
+	if (parse_format_printf_argfield(&fmt, state, head, &pos, "width"))
+		error++;
+
+	/* now we might have the precision specifier */
+	if (*fmt == '.') {
+		fmt++;
+		if (parse_format_printf_argfield(&fmt, state, head, &pos, "position"))
+			error++;
+	}
+
+	type = parse_printf_get_fmt(fmt, &fmtpost);
+	*fmtstring += fmtpost - buff;
+	fmtpost[-1] = '\0';
+
+	if (!type && fmt[0] == 'p')
+		type = &printf_fmt_ptr_ref;	/* probably some extension */
+
+	if (type) {
+		struct symbol *ctype, *source, *target = NULL;
+		const char *typediff = "different types";
+		int ret;
+
+		expr = get_expression_n(head, pos-1);
+		if (!expr)
+			return -2;
+
+		ctype = evaluate_expression(expr);
+		if (!ctype)
+			return -3;
+
+		source = degenerate(expr);
+		ret = (type->test)(type->data, &expr, ctype, &target, &typediff);
+
+		if (ret == 0) {
+			warning(expr->pos, "incorrect type in argument %d (%s)", pos, typediff);
+			info(expr->pos, "   expected %s", show_typename(target));
+			info(expr->pos, "   got %s", show_typename(source));
+		}
+	} else {
+		warning(expr->pos, "cannot evaluate type '%s'", buff);
+		return -1;
+	}
+
+	return 1;
+}
+
+/* attempt to run through a printf format string and work out the types
+ * it specifies. The format is parsed from the __attribute__(format())
+ * in the parser code which stores the positions of the message and arg
+ * start in the ctype.
+ */
+static void evaluate_format_printf(struct symbol *fn, struct expression_list *head)
+{
+	struct format_state state = { };
+	struct expression *expr;
+	const char *fmt_string = NULL;
+
+	expr = get_expression_n(head, fn->ctype.printf_msg-1);
+	if (!expr)
+		return;
+	if (expr->string && expr->string->length)
+		fmt_string = expr->string->data;
+	if (!fmt_string) {
+		struct symbol *sym = evaluate_expression(expr);
+
+		/* attempt to find initialiser for this */
+		if (sym && sym->initializer && sym->initializer->string)
+			fmt_string = sym->initializer->string->data;
+	}
+
+	state.expr = expr;
+	state.va_start = fn->ctype.printf_va_start;
+	state.arg_index = fn->ctype.printf_va_start;
+
+	if (!fmt_string) {
+		warning(expr->pos, "not a format string?");
+	} else {
+		const char *string = fmt_string;
+
+		for (; string[0] != '\0'; string++) {
+			if (string[0] != '%')
+				continue;
+			parse_format_printf(&string, &state, head);
+		}
+	}
+}
+
 static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
 {
 	struct expression *expr;
@@ -2250,6 +2541,12 @@  static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
 	struct symbol *argtype;
 	int i = 1;
 
+	/* do this first, otherwise the arugment info may get lost or changed
+	 * later on in the evaluation loop.
+	 */
+	if (fn->ctype.printf_va_start)
+		evaluate_format_printf(fn, head);
+
 	PREPARE_PTR_LIST(argument_types, argtype);
 	FOR_EACH_PTR (head, expr) {
 		struct expression **p = THIS_ADDRESS(expr);