diff mbox series

[3/5] evaluate: check variadic argument types against formatting info

Message ID 20190925100015.31510-4-ben.dooks@codethink.co.uk (mailing list archive)
State Superseded, archived
Headers show
Series [1/5] validation: ignore temporary ~ files | expand

Commit Message

Ben Dooks Sept. 25, 2019, 10 a.m. UTC
The variadic argumnet code did not check any of the variadic arguments
as it did not previously know the possible type. Now we have the possible
formatting information stored in the ctype, we can do some checks on the
printf formatting types.

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
---
Fixes since v1:
- Split out the format-string -> symbol code
- Use symbol_list for the symbols from format parsing
- Changed to follow the new parsing code and ctype use
- Merged the unsigned-int/long types together

Fixes since v2:
- Check for printf_va_start before checking variadic-list
- Tidy the type code and fix a couple of bugs with %l and %ll
- Fix function names in working through printf arguments.
- Tidy documentation

Fixes since v3:
- Added positional arguments
- Also added precision and width specifiers

Fixes since v4:
- Stop copying the format string
- Removed void data pointer
- Suggested code cleanups

Fixes since v5:
- Rewritten format parsing code
- Updates for handling kernel printk formatting
- Fix parsing issues with ')' characters

Fixes since v6;
- Evaluate aftre all standard args are done
- Fix 'L' parsing

Fixes since v7:
- Updated comment format
- Remove static variable from get_fmt()

Notes:
- %p still generates an address-space mismatch
- how do we deal with the kernel's attempt to make printk format all types?

hack: ) will end format too
---
 evaluate.c | 443 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 443 insertions(+)

Comments

Luc Van Oostenryck Oct. 20, 2019, 3:42 p.m. UTC | #1
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> diff --git a/evaluate.c b/evaluate.c
> --- a/evaluate.c
> +++ b/evaluate.c
> @@ -2319,13 +2319,452 @@ static struct symbol *evaluate_alignof(struct expression *expr)
>  	return size_t_ctype;
>  }
>  
> +struct format_type {
> +	const char	*format;
> +	int		(*test)(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff);
> +	struct symbol	*data;
> +};
> +
> +struct format_state {
> +	struct expression	*expr;
> +	unsigned int		va_start;

the prefix 'va_' is useless.

> +static int printf_fmt_print_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	int ret;
> +	*target = &ptr_ctype;
> +	ret =check_assignment_types(*target, expr, typediff);
> +	if (ret == 0) {
> +		/* if just printing, ignore address-space mismatches */
> +		if (strcmp(*typediff, "different address spaces") == 0)

That's terrible.
It would be better to copy the ctype and then mask out the address spaces.

> +			ret = 1;
> +	}
> +	return ret;
> +}
> +
> +static struct format_type printf_fmt_ptr_ref = { "p", .test = printf_fmt_pointer, };

Please use a designator for all members:
	.format = "p",
	.test = ....

> +static struct expression *get_expression_n(struct expression_list *args, int nr)

get_nth_expression() ?

> +static void parse_format_printf_checkpos(struct format_state *state, const char *which)
> +{
> +	if (state->used_position)
> +		warning(state->expr->pos,
> +			"format %d: %s: no position specified",
> +			state->arg_index-1, which);

Please, use braces when the body doesn't hold on a single line.

> +static int parse_format_printf_argfield(const char **fmtptr, struct format_state *state, struct expression_list *args, int *pos, const char *which)
> +{

...

> +	/* check the vale we got was int/uint type */

typo: value

> +	ctype = evaluate_expression(expr);
> +	if (ctype) {
> +		struct symbol *source, *target = &int_ctype;
> +
> +		source = degenerate(expr);
> +
> +		if (source != &int_ctype && source != &uint_ctype) {

Is there a good reason to allow uint (the standard only allow a plain int)?

> +/*
> + * printf format parsing code
> + *
> + * this code currently does not:
> + * - check castable types (such as int vs long vs long long)
> + * - validate all arguments specified are also used...
> + */
> +static int parse_format_printf(const char **fmtstring,
> +			       struct format_state *state,
> +			       struct expression_list *args)
> +{
> +	struct format_type ftype;
> +	struct format_type *type;
> +	struct expression *expr;
> +	const char *fmt = *fmtstring;
> +	const char *fmtpost = NULL;
> +	int pos = state->arg_index;
> +	int error = 0;
> +	int ret;

It would be nice to have some comments about the variables, for example
what is the purpose of fmtpost.


> +		source = degenerate(expr);
> +		ret = (type->test)(type, &expr, ctype, &target, &typediff);

The first set of parentheses are unneeded. Please remove them.

> +	} else {
> +		/* try and find the end of this */
> +		fmtpost = *fmtstring;
> +		while (*fmtpost > ' ')

This merits a comment. Also, what is 'this'?

> +static void evaluate_format_printf(const char *fmt_string, struct symbol *fn, struct expression_list *head)
> +{
> +		if (fail > 0)
> +			/* format string may have '\n' etc embedded in it */
> +			warning(expr->pos, "cannot evaluate format string");

If fail > 0, a specific warnings has already been issued, right?
then this warning should be removed.

>  static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>  {
>  	struct expression *expr;
>  	struct symbol_list *argument_types = fn->arguments;
> +	const char *fmt_string = NULL;
>  	struct symbol *argtype;
>  	int i = 1;
>  
> +	/*
> +	 * do this first, otherwise the arugment info may get lost or changed

typo: argument.
Maybe change the message to something like:
	Do first part of (printf) format checking. This need to be done
	here, ...

-- Luc
Luc Van Oostenryck Oct. 20, 2019, 4:40 p.m. UTC | #2
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> +static int printf_fmt_string(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	*target = &string_ctype;

This should be const_string_ctype and a test should be added for "%s"
with a non-const char pointer/array.

> +static int printf_fmt_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	*target = &ptr_ctype;

Same here with const_ptr_ctype (but I've not tested it).

> +static int printf_fmt_print_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	int ret;
> +	*target = &ptr_ctype;

Same here.

-- Luc
Luc Van Oostenryck Oct. 20, 2019, 4:55 p.m. UTC | #3
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> +		if (*ptr == 'x' || *ptr == 'X' || *ptr == 'u' || *ptr == 'o') {
> +			ptr++;
> +			type->test = printf_fmt_numtype;
> +			switch (szmod) {
> +			case -1:
> +				type->data = &ushort_ctype;
> +				break;
> +			case 0:
> +				type->data = &uint_ctype;
> +				break;
> +			case 1:
> +				type->data = &ulong_ctype;
> +				break;
> +			case 2:
> +				type->data = &ullong_ctype;
> +				break;
> +			default:
> +				type->test = NULL;
> +			}
> +		} else if (*ptr == 'i' || *ptr == 'd') {
> +			ptr++;
> +			type->test = printf_fmt_numtype;
> +			switch (szmod) {
> +			case -1:
> +				type->data = &short_ctype;
> +				break;
> +			case 0:
> +				type->data = &int_ctype;
> +				break;
> +			case 1:
> +				type->data = &long_ctype;
> +				break;
> +			case 2:
> +				type->data = &llong_ctype;
> +				break;
> +			default:
> +				type->test = NULL;
> +			}

When testing this on the kernel, I've a bunch of
	warning: incorrect type in argument . (different types)
	   expected unsigned int
	   got int
This will quickly be quite annoying.

I've also a bunch of:
	warning: incorrect type in argument . (different types)
	   expected int
	   got int
but I can't investigate more for now.

Putting this aside, the series seesm to already been pretty good.
Congrats!

-- Luc
Luc Van Oostenryck Oct. 20, 2019, 7:12 p.m. UTC | #4
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> +static int printf_fmt_numtype(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
> +{
> +	struct symbol *type = fmt->data;
> +	*target = type;
> +	return ctype == type;

Comparing these pointer will never be the correct way to compare the
types. You have to use something like check_assignment_types().
Currently, a simple test like:
	void print(const char *, ...) __attribute__((format(printf, 1, 2)));
	static void foo(unsigned int u)
	{
		print("%x\n", u);
	}

gives a warning like:
	warning: incorrect type in argument 2 (different types)
	   expected unsigned int
	   got unsigned int u

-- Luc
Luc Van Oostenryck Oct. 20, 2019, 8:07 p.m. UTC | #5
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> +static struct format_type *parse_printf_get_fmt(struct format_type *type, const char *msg, const char **msgout)
> +{

...

> +	} else if (*ptr == 'p') {
> +		ptr++;
> +		type->test = printf_fmt_print_pointer;
> +		//todo - check if there's anything after these?
> +		if (*ptr == 'x' || *ptr == 'X') {
> +			ptr++;
> +		} else if (isalpha(*ptr)) {
> +			// probably sxomething that /is/ being de-referenced
> +			ptr++;
> +			type->test = printf_fmt_pointer;
> +		}

This needs an explanation for the isalpha() and why x/X is special cased.

> +	} else if (*ptr == 'z') {
> +		ptr++;
> +		if (*ptr == 'd') {
> +			ptr++;
> +			type->test = printf_fmt_numtype;
> +			type->data = &long_ctype;
> +		} else if (*ptr == 'u' || *ptr == 'x') {
> +			ptr++;
> +			type->test = printf_fmt_numtype;
> +			type->data = &ulong_ctype;

These should use size_t_ctype & ssize_t_type.
They also work with 'i', 'o' & 'X'.

> +		} else if (is_float_spec(*ptr)) {
> +			type->test = printf_fmt_numtype;
> +			type->data = &double_ctype;

if (szmod == 1) type->data = &ldouble_ctype;

-- Luc
Luc Van Oostenryck Oct. 20, 2019, 10:01 p.m. UTC | #6
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> +static void evaluate_format_printf(const char *fmt_string, struct symbol *fn, struct expression_list *head)
> +{
> +	struct format_state state = { };
> +	struct expression *expr;
> +
> +	expr = get_expression_n(head, fn->ctype.printf_msg-1);
> +	if (!expr)
> +		return;
> +
> +	state.expr = expr;
> +	state.va_start = fn->ctype.printf_va_start;
> +	state.arg_index = fn->ctype.printf_va_start;
> +
> +	if (!fmt_string) {
> +		warning(expr->pos, "not a format string?");
> +	} else {
> +		const char *string = fmt_string;
> +		int fail = 0;
> +
> +		for (; string[0] != '\0'; string++) {
> +			if (string[0] != '%')
> +				continue;
> +			if (parse_format_printf(&string, &state, head) < 0)
> +				fail++;
> +			string--;

This last statement is wrong, it just needs to be removed.

-- Luc
Luc Van Oostenryck Oct. 21, 2019, 9:30 a.m. UTC | #7
On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> +static const char *get_printf_fmt(struct symbol *fn, struct expression_list *head)
> +{
> +	struct expression *expr;
> +	const char *fmt_string = NULL;
> +
> +	expr = get_expression_n(head, fn->ctype.printf_msg-1);
> +	if (!expr)
> +		return NULL;
> +	if (expr->string && expr->string->length)

expr->string is only valid if expr->type == EXPR_STRING,
so this must be checked first.
> +		fmt_string = expr->string->data;
> +	if (!fmt_string) {
> +		struct symbol *sym = evaluate_expression(expr);

evaluate_expression() only returns the type of the expression, not
the underlying symbol (if there is one). But yes, confusingly, both
are struct symbol *.
You need to check if expr->type == EXPR_SYMBOL and then simply
use expr->symbol.

> +
> +		/* attempt to find initialiser for this */
> +		if (sym && sym->initializer && sym->initializer->string)

Same here with ->string & EXPR_STRING.

> +			fmt_string = sym->initializer->string->data;
> +	}

I think the code here above should be replaced by something like:
	if (!evaluate_expression(expr))
		return NULL;
	if (expr->type == EXPR_PREOP && expr->op == '*')
		expr = expr->unop;
	if (expr->type == EXPR_SYMBOL) {
		struct symbol *sym = expr->symbol;
		if (sym && sym->initializer)
			expr = sym->initializer;
	}
	if (expr->type == EXPR_STRING)
		return expr->string->data;
	return NULL;

This is not tested but it solves a nasty situation with a test like:
	void __attribute__((format(printf, 1, 2))) prt(char *, ...);
	static int x;
	static inline void fun(void) {
		prt("%08x\n", x);
	}
	static void foo(void) {
		fun;
		fun();
	}

Don't ask the details about this specific test, I don't fully understand
it myself, but it's a reduced case for a failure on the kernel code
where a warning "not a format string?" was wrongly issued.
Please add this one in your tests.

-- Lu
Luc Van Oostenryck Oct. 21, 2019, 11:47 a.m. UTC | #8
On Mon, Oct 21, 2019 at 12:01:30AM +0200, Luc Van Oostenryck wrote:
> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> > +static void evaluate_format_printf(const char *fmt_string, struct symbol *fn, struct expression_list *head)
> > +{
> > +	struct format_state state = { };
> > +	struct expression *expr;
> > +
> > +	expr = get_expression_n(head, fn->ctype.printf_msg-1);
> > +	if (!expr)
> > +		return;
> > +
> > +	state.expr = expr;
> > +	state.va_start = fn->ctype.printf_va_start;
> > +	state.arg_index = fn->ctype.printf_va_start;
> > +
> > +	if (!fmt_string) {
> > +		warning(expr->pos, "not a format string?");
> > +	} else {
> > +		const char *string = fmt_string;
> > +		int fail = 0;
> > +
> > +		for (; string[0] != '\0'; string++) {
> > +			if (string[0] != '%')
> > +				continue;
> > +			if (parse_format_printf(&string, &state, head) < 0)
> > +				fail++;
> > +			string--;
> 
> This last statement is wrong, it just needs to be removed.

It's more subtle than that: the string++ should only be done
when stripping the chars before the '%'.
Thus the loop should be something like:
		while (string[0]) {
			if (string[0] != '%') {
				// strip everything before '%'
				string++;
				continue;
			}
			if (parse_format_printf(&string, &state, head) < 0)
				fail++;
		}

-- Luc
Ben Dooks Oct. 22, 2019, 10 a.m. UTC | #9
On 20/10/2019 16:42, Luc Van Oostenryck wrote:
> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>> diff --git a/evaluate.c b/evaluate.c
>> --- a/evaluate.c
>> +++ b/evaluate.c
>> @@ -2319,13 +2319,452 @@ static struct symbol *evaluate_alignof(struct expression *expr)
>>   	return size_t_ctype;
>>   }
>>   
>> +struct format_type {
>> +	const char	*format;
>> +	int		(*test)(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff);
>> +	struct symbol	*data;
>> +};
>> +
>> +struct format_state {
>> +	struct expression	*expr;
>> +	unsigned int		va_start;
> 
> the prefix 'va_' is useless.

I would prefer to keep it as it clearly states this is for a va_list.

>> +static int printf_fmt_print_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	int ret;
>> +	*target = &ptr_ctype;
>> +	ret =check_assignment_types(*target, expr, typediff);
>> +	if (ret == 0) {
>> +		/* if just printing, ignore address-space mismatches */
>> +		if (strcmp(*typediff, "different address spaces") == 0)
> 
> That's terrible.
> It would be better to copy the ctype and then mask out the address spaces.

I'll try and sort this

>> +			ret = 1;
>> +	}
>> +	return ret;
>> +}
>> +
>> +static struct format_type printf_fmt_ptr_ref = { "p", .test = printf_fmt_pointer, };
> 
> Please use a designator for all members:
> 	.format = "p",
> 	.test = ....

ok, fixed.

> 
>> +static struct expression *get_expression_n(struct expression_list *args, int nr)
> 
> get_nth_expression() ?

ok, changed.

> 
>> +static void parse_format_printf_checkpos(struct format_state *state, const char *which)
>> +{
>> +	if (state->used_position)
>> +		warning(state->expr->pos,
>> +			"format %d: %s: no position specified",
>> +			state->arg_index-1, which);
> 
> Please, use braces when the body doesn't hold on a single line.

ok, done

>> +static int parse_format_printf_argfield(const char **fmtptr, struct format_state *state, struct expression_list *args, int *pos, const char *which)
>> +{
> 
> ...
> 
>> +	/* check the vale we got was int/uint type */
> 
> typo: value

ok, fixed


>> +	ctype = evaluate_expression(expr);
>> +	if (ctype) {
>> +		struct symbol *source, *target = &int_ctype;
>> +
>> +		source = degenerate(expr);
>> +
>> +		if (source != &int_ctype && source != &uint_ctype) {
> 
> Is there a good reason to allow uint (the standard only allow a plain int)?

I think just testing with gcc showed it tended to not care about signed.

>> +/*
>> + * printf format parsing code
>> + *
>> + * this code currently does not:
>> + * - check castable types (such as int vs long vs long long)
>> + * - validate all arguments specified are also used...
>> + */
>> +static int parse_format_printf(const char **fmtstring,
>> +			       struct format_state *state,
>> +			       struct expression_list *args)
>> +{
>> +	struct format_type ftype;
>> +	struct format_type *type;
>> +	struct expression *expr;
>> +	const char *fmt = *fmtstring;
>> +	const char *fmtpost = NULL;
>> +	int pos = state->arg_index;
>> +	int error = 0;
>> +	int ret;
> 
> It would be nice to have some comments about the variables, for example
> what is the purpose of fmtpost.

ok, done

> 
>> +		source = degenerate(expr);
>> +		ret = (type->test)(type, &expr, ctype, &target, &typediff);
> 
> The first set of parentheses are unneeded. Please remove them.

ok, done

>> +	} else {
>> +		/* try and find the end of this */
>> +		fmtpost = *fmtstring;
>> +		while (*fmtpost > ' ')
> 
> This merits a comment. Also, what is 'this'?

changed to:
/* try and find the end of this format string by looking for a space*/


>> +static void evaluate_format_printf(const char *fmt_string, struct symbol *fn, struct expression_list *head)
>> +{
>> +		if (fail > 0)
>> +			/* format string may have '\n' etc embedded in it */
>> +			warning(expr->pos, "cannot evaluate format string");
> 
> If fail > 0, a specific warnings has already been issued, right?
> then this warning should be removed.

I think there are a place or two where there are no warnings.
I'll go check.

> 
>>   static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
>>   {
>>   	struct expression *expr;
>>   	struct symbol_list *argument_types = fn->arguments;
>> +	const char *fmt_string = NULL;
>>   	struct symbol *argtype;
>>   	int i = 1;
>>   
>> +	/*
>> +	 * do this first, otherwise the arugment info may get lost or changed
> 
> typo: argument.
> Maybe change the message to something like:
> 	Do first part of (printf) format checking. This need to be done
> 	here, ...
> 

ok, thanks.
Ben Dooks Oct. 22, 2019, 10:26 a.m. UTC | #10
On 20/10/2019 17:40, Luc Van Oostenryck wrote:
> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>> +static int printf_fmt_string(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	*target = &string_ctype;
> 
> This should be const_string_ctype and a test should be added for "%s"
> with a non-const char pointer/array.

ok.

>> +static int printf_fmt_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	*target = &ptr_ctype;
> 
> Same here with const_ptr_ctype (but I've not tested it).

ok


>> +static int printf_fmt_print_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	int ret;
>> +	*target = &ptr_ctype;
> 
>
ok
Ben Dooks Oct. 22, 2019, 10:31 a.m. UTC | #11
On 20/10/2019 17:55, Luc Van Oostenryck wrote:
> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>> +		if (*ptr == 'x' || *ptr == 'X' || *ptr == 'u' || *ptr == 'o') {
>> +			ptr++;
>> +			type->test = printf_fmt_numtype;
>> +			switch (szmod) {
>> +			case -1:
>> +				type->data = &ushort_ctype;
>> +				break;
>> +			case 0:
>> +				type->data = &uint_ctype;
>> +				break;
>> +			case 1:
>> +				type->data = &ulong_ctype;
>> +				break;
>> +			case 2:
>> +				type->data = &ullong_ctype;
>> +				break;
>> +			default:
>> +				type->test = NULL;
>> +			}
>> +		} else if (*ptr == 'i' || *ptr == 'd') {
>> +			ptr++;
>> +			type->test = printf_fmt_numtype;
>> +			switch (szmod) {
>> +			case -1:
>> +				type->data = &short_ctype;
>> +				break;
>> +			case 0:
>> +				type->data = &int_ctype;
>> +				break;
>> +			case 1:
>> +				type->data = &long_ctype;
>> +				break;
>> +			case 2:
>> +				type->data = &llong_ctype;
>> +				break;
>> +			default:
>> +				type->test = NULL;
>> +			}
> 
> When testing this on the kernel, I've a bunch of
> 	warning: incorrect type in argument . (different types)
> 	   expected unsigned int
> 	   got int
> This will quickly be quite annoying.
> 
> I've also a bunch of:
> 	warning: incorrect type in argument . (different types)
> 	   expected int
> 	   got int
> but I can't investigate more for now.

I might spend some time fixing those up... there's plenty of warning
fixes to go through.

I've not yet run my test on a current kernel, but printf type seems
to be one place where people have been a bit sloppy.

Thanks!
Ben Dooks Oct. 22, 2019, 10:38 a.m. UTC | #12
On 20/10/2019 20:12, Luc Van Oostenryck wrote:
> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>> +static int printf_fmt_numtype(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
>> +{
>> +	struct symbol *type = fmt->data;
>> +	*target = type;
>> +	return ctype == type;
> 
> Comparing these pointer will never be the correct way to compare the
> types. You have to use something like check_assignment_types().
> Currently, a simple test like:
> 	void print(const char *, ...) __attribute__((format(printf, 1, 2)));
> 	static void foo(unsigned int u)
> 	{
> 		print("%x\n", u);
> 	}
> 
> gives a warning like:
> 	warning: incorrect type in argument 2 (different types)
> 	   expected unsigned int
> 	   got unsigned int u
> 
> -- Luc

ok, thanks for pointing this out.
Ben Dooks Oct. 22, 2019, 10:52 a.m. UTC | #13
On 20/10/2019 21:07, Luc Van Oostenryck wrote:
> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>> +static struct format_type *parse_printf_get_fmt(struct format_type *type, const char *msg, const char **msgout)
>> +{
> 
> ...
> 
>> +	} else if (*ptr == 'p') {
>> +		ptr++;
>> +		type->test = printf_fmt_print_pointer;
>> +		//todo - check if there's anything after these?
>> +		if (*ptr == 'x' || *ptr == 'X') {
>> +			ptr++;
>> +		} else if (isalpha(*ptr)) {
>> +			// probably sxomething that /is/ being de-referenced
>> +			ptr++;
>> +			type->test = printf_fmt_pointer;
>> +		}
> 
> This needs an explanation for the isalpha() and why x/X is special cased.

ok, added
		/* check for pointer being printed as hex explicitly */

and
		/* probably some extra specifiers after %p */


>> +	} else if (*ptr == 'z') {
>> +		ptr++;
>> +		if (*ptr == 'd') {
>> +			ptr++;
>> +			type->test = printf_fmt_numtype;
>> +			type->data = &long_ctype;
>> +		} else if (*ptr == 'u' || *ptr == 'x') {
>> +			ptr++;
>> +			type->test = printf_fmt_numtype;
>> +			type->data = &ulong_ctype;
> 
> These should use size_t_ctype & ssize_t_type.
> They also work with 'i', 'o' & 'X'.

ok, thanks

>> +		} else if (is_float_spec(*ptr)) {
>> +			type->test = printf_fmt_numtype;
>> +			type->data = &double_ctype;
> 
> if (szmod == 1) type->data = &ldouble_ctype;

ok, changed.

szmod == 1 ? &ldouble_ctype :  &double_ctype;
Ben Dooks Oct. 22, 2019, 11:16 a.m. UTC | #14
On 22/10/2019 11:26, Ben Dooks wrote:
> On 20/10/2019 17:40, Luc Van Oostenryck wrote:
>> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>>> +static int printf_fmt_string(struct format_type *fmt, struct 
>>> expression **expr, struct symbol *ctype, struct symbol **target, 
>>> const char **typediff)
>>> +{
>>> +    *target = &string_ctype;
>>
>> This should be const_string_ctype and a test should be added for "%s"
>> with a non-const char pointer/array.
> 
> ok.

I'm now getting weird issue with the tests failing. Example:

-varargs-format-addrspace1.c:12:32:    expected const char *
+varargs-format-addrspace1.c:12:32:    expected char const *


>>> +static int printf_fmt_pointer(struct format_type *fmt, struct 
>>> expression **expr, struct symbol *ctype, struct symbol **target, 
>>> const char **typediff)
>>> +{
>>> +    *target = &ptr_ctype;
>>
>> Same here with const_ptr_ctype (but I've not tested it).
> 
> ok
> 
> 
>>> +static int printf_fmt_print_pointer(struct format_type *fmt, struct 
>>> expression **expr, struct symbol *ctype, struct symbol **target, 
>>> const char **typediff)
>>> +{
>>> +    int ret;
>>> +    *target = &ptr_ctype;
>>
>>
> ok
> 
> 
>
Luc Van Oostenryck Oct. 22, 2019, 8:49 p.m. UTC | #15
On Tue, Oct 22, 2019 at 12:16:57PM +0100, Ben Dooks wrote:
> On 22/10/2019 11:26, Ben Dooks wrote:
> > On 20/10/2019 17:40, Luc Van Oostenryck wrote:
> > > On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
> > > > +static int printf_fmt_string(struct format_type *fmt,
> > > > struct expression **expr, struct symbol *ctype, struct
> > > > symbol **target, const char **typediff)
> > > > +{
> > > > +    *target = &string_ctype;
> > > 
> > > This should be const_string_ctype and a test should be added for "%s"
> > > with a non-const char pointer/array.
> > 
> > ok.
> 
> I'm now getting weird issue with the tests failing. Example:
> 
> -varargs-format-addrspace1.c:12:32:    expected const char *
> +varargs-format-addrspace1.c:12:32:    expected char const *

Well, they're the same types and Sparse print them like this.
I would also prefer to have them with the qualifier in front
and that can be 'fixed' but it is orthogonal with this format checking.
Please use 'char const *' for the moment.

-- Luc
Luc Van Oostenryck Oct. 22, 2019, 9:01 p.m. UTC | #16
On Tue, Oct 22, 2019 at 11:31:27AM +0100, Ben Dooks wrote:
> 
> I might spend some time fixing those up... there's plenty of warning
> fixes to go through.
> 
> I've not yet run my test on a current kernel, but printf type seems
> to be one place where people have been a bit sloppy.

With the changes I sent you and using check_assignment_types() for
printf_fmt_numtype() I get only 3 warnings in the same drivers,
all for trying to print a bitwise type (which should probably be
acceptable for the %x format).

But yes, checking the signedness will most probably gives a lot of
warnings. I think this shouldn't be done without support for the
flag '-Wformat-signedness'.

-- Luc
Ben Dooks Oct. 23, 2019, 3:41 p.m. UTC | #17
On 21/10/2019 12:47, Luc Van Oostenryck wrote:
> On Mon, Oct 21, 2019 at 12:01:30AM +0200, Luc Van Oostenryck wrote:
>> On Wed, Sep 25, 2019 at 11:00:13AM +0100, Ben Dooks wrote:
>>> +static void evaluate_format_printf(const char *fmt_string, struct symbol *fn, struct expression_list *head)
>>> +{
>>> +	struct format_state state = { };
>>> +	struct expression *expr;
>>> +
>>> +	expr = get_expression_n(head, fn->ctype.printf_msg-1);
>>> +	if (!expr)
>>> +		return;
>>> +
>>> +	state.expr = expr;
>>> +	state.va_start = fn->ctype.printf_va_start;
>>> +	state.arg_index = fn->ctype.printf_va_start;
>>> +
>>> +	if (!fmt_string) {
>>> +		warning(expr->pos, "not a format string?");
>>> +	} else {
>>> +		const char *string = fmt_string;
>>> +		int fail = 0;
>>> +
>>> +		for (; string[0] != '\0'; string++) {
>>> +			if (string[0] != '%')
>>> +				continue;
>>> +			if (parse_format_printf(&string, &state, head) < 0)
>>> +				fail++;
>>> +			string--;
>>
>> This last statement is wrong, it just needs to be removed.
> 
> It's more subtle than that: the string++ should only be done
> when stripping the chars before the '%'.
> Thus the loop should be something like:
> 		while (string[0]) {
> 			if (string[0] != '%') {
> 				// strip everything before '%'
> 				string++;
> 				continue;
> 			}
> 			if (parse_format_printf(&string, &state, head) < 0)
> 				fail++;
> 		}

ok, will make this change and test.
diff mbox series

Patch

diff --git a/evaluate.c b/evaluate.c
index 3268333..b7c78a0 100644
--- a/evaluate.c
+++ b/evaluate.c
@@ -2319,13 +2319,452 @@  static struct symbol *evaluate_alignof(struct expression *expr)
 	return size_t_ctype;
 }
 
+struct format_type {
+	const char	*format;
+	int		(*test)(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff);
+	struct symbol	*data;
+};
+
+struct format_state {
+	struct expression	*expr;
+	unsigned int		va_start;
+	unsigned int		fmt_index;
+	unsigned int		arg_index;
+	unsigned int		used_position: 1;
+};
+
+static int printf_fmt_numtype(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	struct symbol *type = fmt->data;
+	*target = type;
+	return ctype == type;
+}
+
+static int printf_fmt_string(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	*target = &string_ctype;
+	return check_assignment_types(*target, expr, typediff);
+}
+
+static int printf_fmt_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	*target = &ptr_ctype;
+        return check_assignment_types(*target, expr, typediff);
+}
+
+static int printf_fmt_print_pointer(struct format_type *fmt, struct expression **expr, struct symbol *ctype, struct symbol **target, const char **typediff)
+{
+	int ret;
+	*target = &ptr_ctype;
+	ret =check_assignment_types(*target, expr, typediff);
+	if (ret == 0) {
+		/* if just printing, ignore address-space mismatches */
+		if (strcmp(*typediff, "different address spaces") == 0)
+			ret = 1;
+	}
+	return ret;
+}
+
+static struct format_type printf_fmt_ptr_ref = { "p", .test = printf_fmt_pointer, };
+
+static struct expression *get_expression_n(struct expression_list *args, int nr)
+{
+	return ptr_list_nth_entry((struct ptr_list *)args, nr);
+}
+
+static int is_float_spec(char t)
+{
+	return t == 'f' || t == 'g' || t == 'F' || t == 'G';
+}
+
+static struct format_type *parse_printf_get_fmt(struct format_type *type, const char *msg, const char **msgout)
+{
+	const char *ptr = msg;
+	int szmod=0;
+
+	type->test = NULL;
+	*msgout = ptr;
+
+	if (*ptr == 's') {
+		ptr++;
+		type->test = printf_fmt_string;
+	} else if (*ptr == 'c') {
+		ptr++;
+		type->test = printf_fmt_numtype;
+		type->data = &char_ctype;
+	} else if (*ptr == 'p') {
+		ptr++;
+		type->test = printf_fmt_print_pointer;
+		//todo - check if there's anything after these?
+		if (*ptr == 'x' || *ptr == 'X') {
+			ptr++;
+		} else if (isalpha(*ptr)) {
+			// probably sxomething that /is/ being de-referenced
+			ptr++;
+			type->test = printf_fmt_pointer;
+		}
+	} else if (*ptr == 'z') {
+		ptr++;
+		if (*ptr == 'd') {
+			ptr++;
+			type->test = printf_fmt_numtype;
+			type->data = &long_ctype;
+		} else if (*ptr == 'u' || *ptr == 'x') {
+			ptr++;
+			type->test = printf_fmt_numtype;
+			type->data = &ulong_ctype;
+		}
+	} else {
+		if (*ptr == 'l') {
+			szmod++;
+			ptr++;
+			if (*ptr == 'l') {
+				szmod++;
+				ptr++;
+			}
+		} else {
+			if (*ptr == 'h') { // short/char to int
+				szmod = -1;
+				ptr++;
+				if (*ptr == 'h')  // promotion from char
+					ptr++;
+			}
+			if (*ptr == 't') {  // ptrdiff_t
+				szmod = 2;
+				ptr++;
+			}
+			if (*ptr == 'j') { // intmax_t
+				// todo - replace iwth intmax_ctype when added
+				szmod = 1;
+				ptr++;
+			}
+		}
+
+		if (*ptr == 'x' || *ptr == 'X' || *ptr == 'u' || *ptr == 'o') {
+			ptr++;
+			type->test = printf_fmt_numtype;
+			switch (szmod) {
+			case -1:
+				type->data = &ushort_ctype;
+				break;
+			case 0:
+				type->data = &uint_ctype;
+				break;
+			case 1:
+				type->data = &ulong_ctype;
+				break;
+			case 2:
+				type->data = &ullong_ctype;
+				break;
+			default:
+				type->test = NULL;
+			}
+		} else if (*ptr == 'i' || *ptr == 'd') {
+			ptr++;
+			type->test = printf_fmt_numtype;
+			switch (szmod) {
+			case -1:
+				type->data = &short_ctype;
+				break;
+			case 0:
+				type->data = &int_ctype;
+				break;
+			case 1:
+				type->data = &long_ctype;
+				break;
+			case 2:
+				type->data = &llong_ctype;
+				break;
+			default:
+				type->test = NULL;
+			}
+		} else if (*ptr == 'L' && is_float_spec(ptr[1])) {
+			type->test = printf_fmt_numtype;
+			type->data = &ldouble_ctype;
+			ptr += 2;
+		} else if (is_float_spec(*ptr)) {
+			type->test = printf_fmt_numtype;
+			type->data = &double_ctype;
+			ptr++;
+		} else if (*ptr == 'n') {	/* pointer to an de-referenced int/etc */
+			// todo - we should construct pointer to int/etc //
+			// also should not have any flags or widths for this
+			type->test = printf_fmt_pointer;
+			ptr++;
+		} else {
+			// anything else here?
+		}
+	}
+
+	if (type->test == NULL)
+		return NULL;
+
+	*msgout = ptr;
+	return type;
+}
+
+static int is_printf_flag(char ch)
+{
+	return ch == '0' || ch == '+' || ch == '-' || ch == ' ' || ch == '#';
+}
+
+static int printf_check_position(const char **fmt)
+{
+	const char *ptr= *fmt;
+
+	if (!isdigit(*ptr))
+		return -1;
+	while (isdigit(*ptr))
+		ptr++;
+	if (*ptr == '$') {
+		const char *pos = *fmt;
+		*fmt = ptr+1;
+		return strtoul(pos, NULL, 10);
+	}
+	return -1;
+}
+
+static void parse_format_printf_checkpos(struct format_state *state, const char *which)
+{
+	if (state->used_position)
+		warning(state->expr->pos,
+			"format %d: %s: no position specified",
+			state->arg_index-1, which);
+}
+
+static int parse_format_printf_argfield(const char **fmtptr, struct format_state *state, struct expression_list *args, int *pos, const char *which)
+{
+	struct expression *expr;
+	struct symbol *ctype;
+	const char *fmt = *fmtptr;
+	int argpos = -1;
+
+	/* check for simple digit-string width/precision specifier first */
+	if (*fmt != '*') {
+		while (isdigit(*fmt))
+			fmt++;
+		*fmtptr = fmt;
+		return 0;
+	}
+
+	fmt++;
+	argpos = printf_check_position(&fmt);
+
+	if (argpos > 0) {
+		argpos += state->va_start - 1;
+		state->used_position = 1;
+	} else {
+		argpos = (*pos)++;
+		state->arg_index++;
+		parse_format_printf_checkpos(state, which);
+	}
+
+	*fmtptr = fmt;
+	expr = get_expression_n(args, argpos-1);
+	if (!expr) {
+		warning(state->expr->pos, "%s: no argument at position %d", which, argpos);
+		return 1;
+	}
+
+	/* check the vale we got was int/uint type */
+	ctype = evaluate_expression(expr);
+	if (ctype) {
+		struct symbol *source, *target = &int_ctype;
+
+		source = degenerate(expr);
+
+		if (source != &int_ctype && source != &uint_ctype) {
+			warning(expr->pos, "incorrect type for %s argument %d", which, argpos);
+			info(expr->pos, "   expected %s", show_typename(target));
+			info(expr->pos, "   got %s", show_typename(source));
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * printf format parsing code
+ *
+ * this code currently does not:
+ * - check castable types (such as int vs long vs long long)
+ * - validate all arguments specified are also used...
+ */
+static int parse_format_printf(const char **fmtstring,
+			       struct format_state *state,
+			       struct expression_list *args)
+{
+	struct format_type ftype;
+	struct format_type *type;
+	struct expression *expr;
+	const char *fmt = *fmtstring;
+	const char *fmtpost = NULL;
+	int pos = state->arg_index;
+	int error = 0;
+	int ret;
+
+	if (!fmt) {
+		warning(state->expr->pos, "no format string passed");
+		return -1;
+	}
+
+	/* trivial check for %% */
+	fmt++;
+	if (fmt[0] == '%') {
+		*fmtstring = fmt+1;
+		return 0;
+	}
+
+	state->arg_index++;
+	state->fmt_index++;
+
+	ret = printf_check_position(&fmt);
+	if (ret == 0) {
+		/* we got an invalid position argument */
+		error++;
+	} else if (ret < 0) {
+		parse_format_printf_checkpos(state, "position");
+	} else {
+		state->used_position = 1;
+		pos = ret + state->va_start - 1;
+	}
+
+	/* get rid of any formatting flag bits */
+	while (is_printf_flag(*fmt))
+		fmt++;
+
+	/* now there is the posibility of a width specifier */
+	if (parse_format_printf_argfield(&fmt, state, args, &pos, "width"))
+		error++;
+
+	/* now we might have the precision specifier */
+	if (*fmt == '.') {
+		fmt++;
+		if (parse_format_printf_argfield(&fmt, state, args, &pos, "position"))
+			error++;
+	}
+
+	type = parse_printf_get_fmt(&ftype, fmt, &fmtpost);
+
+	if (!type && fmt[0] == 'p')
+		type = &printf_fmt_ptr_ref;	/* probably some extension */
+
+	if (type) {
+		struct symbol *ctype, *source, *target = NULL;
+		const char *typediff = "different types";
+		int ret;
+
+		*fmtstring = fmtpost;
+		expr = get_expression_n(args, pos-1);
+		if (!expr) {
+			/* no argument, but otherwise valid argument string */
+			warning(state->expr->pos, "no argument at position '%d'", pos);
+			return 0;
+		}
+
+		ctype = evaluate_expression(expr);
+		if (!ctype)
+			return -3;
+
+		source = degenerate(expr);
+		ret = (type->test)(type, &expr, ctype, &target, &typediff);
+		if (!target)	/* shouldn't happen, but catch anyway */
+			return -4;
+
+		if (ret == 0) {
+			warning(expr->pos, "incorrect type in argument %d (%s)", pos, typediff);
+			info(expr->pos, "   expected %s", show_typename(target));
+			info(expr->pos, "   got %s", show_typename(source));
+		}
+	} else {
+		/* try and find the end of this */
+		fmtpost = *fmtstring;
+		while (*fmtpost > ' ')
+			fmtpost++;
+		warning(state->expr->pos, "cannot evaluate type '%.*s'",
+			(int)(fmtpost - *fmtstring), *fmtstring);
+		*fmtstring += 1;
+		return -1;
+	}
+
+	return 1;
+}
+
+static const char *get_printf_fmt(struct symbol *fn, struct expression_list *head)
+{
+	struct expression *expr;
+	const char *fmt_string = NULL;
+
+	expr = get_expression_n(head, fn->ctype.printf_msg-1);
+	if (!expr)
+		return NULL;
+	if (expr->string && expr->string->length)
+		fmt_string = expr->string->data;
+	if (!fmt_string) {
+		struct symbol *sym = evaluate_expression(expr);
+
+		/* attempt to find initialiser for this */
+		if (sym && sym->initializer && sym->initializer->string)
+			fmt_string = sym->initializer->string->data;
+	}
+
+	return fmt_string;
+}
+
+/*
+ * attempt to run through a printf format string and work out the types
+ * it specifies. The format is parsed from the __attribute__(format())
+ * in the parser code which stores the positions of the message and arg
+ * start in the ctype.
+ */
+static void evaluate_format_printf(const char *fmt_string, struct symbol *fn, struct expression_list *head)
+{
+	struct format_state state = { };
+	struct expression *expr;
+
+	expr = get_expression_n(head, fn->ctype.printf_msg-1);
+	if (!expr)
+		return;
+
+	state.expr = expr;
+	state.va_start = fn->ctype.printf_va_start;
+	state.arg_index = fn->ctype.printf_va_start;
+
+	if (!fmt_string) {
+		warning(expr->pos, "not a format string?");
+	} else {
+		const char *string = fmt_string;
+		int fail = 0;
+
+		for (; string[0] != '\0'; string++) {
+			if (string[0] != '%')
+				continue;
+			if (parse_format_printf(&string, &state, head) < 0)
+				fail++;
+			string--;
+		}
+
+		if (fail > 0)
+			/* format string may have '\n' etc embedded in it */
+			warning(expr->pos, "cannot evaluate format string");
+	}
+}
+
 static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
 {
 	struct expression *expr;
 	struct symbol_list *argument_types = fn->arguments;
+	const char *fmt_string = NULL;
 	struct symbol *argtype;
 	int i = 1;
 
+	/*
+	 * do this first, otherwise the arugment info may get lost or changed
+	 * later on in the evaluation loop by degenerate()
+	 */
+	if (fn->ctype.printf_va_start)
+		fmt_string = get_printf_fmt(fn, head);
+
 	PREPARE_PTR_LIST(argument_types, argtype);
 	FOR_EACH_PTR (head, expr) {
 		struct expression **p = THIS_ADDRESS(expr);
@@ -2362,6 +2801,10 @@  static int evaluate_arguments(struct symbol *fn, struct expression_list *head)
 		NEXT_PTR_LIST(argtype);
 	} END_FOR_EACH_PTR(expr);
 	FINISH_PTR_LIST(argtype);
+
+	if (fn->ctype.printf_va_start)
+		evaluate_format_printf(fmt_string, fn, head);
+
 	return 1;
 }