diff mbox series

Add the seq builtin and improve some things in printf.c

Message ID 20181009030842.71754-1-husseydevin@gmail.com (mailing list archive)
State Rejected
Delegated to: Herbert Xu
Headers show
Series Add the seq builtin and improve some things in printf.c | expand

Commit Message

Devin Hussey Oct. 9, 2018, 3:08 a.m. UTC
From: Devin Hussey <husseydevin@gmail.com>

I added the seq builtin.

usage: seq [-w] [-f format] [-s string] [-t string] [first [incr]] last

Some notes:
    - 100% from scratch, aside from the small num_width algorithm I took from
      StackOverflow and credited accordingly.
    - I optimize the math by using integer math if we don't have any decimals,
      intmax_t is large enough to support the number range, and if the user
      didn't specify a custom format. GNU Coreutils also does this.
      - If we use integers, we precalculate the width and snprintf a format
        specifer from that for the -w option.
    - I reuse some of the printf code to act as the validator for the format
      string. I want to separate this more for the best possible const
      correctness, though (I use a cast), and so the conditionals are a little
      more clear.
    - It functionally matches the one supplied with macOS and presumably BSD,
      but it's faster: It is reliably slightly faster on my system (a wimpy
      2009 MacBook) to call this
          ./dash -c 'seq 1 10000'
      than it is to call
          seq 1 10000
      directly, even multiple times, from a bash shell.

Other changes:
    - I fixed a few silly gotos in the file, replacing them with negated
      conditional blocks or early returns.
    - I isolated the printf parsing into its own function so we can reuse it.
    - I reordered the static functions to go above the main functions. This
      makes it easier to follow.
    - I changed conv_escape to return the number of characters read instead
      of the string itself so it can accept a constant string without any
      ugly casting.

Signed-off-by: Devin Hussey <husseydevin@gmail.com>
---
 src/bltin/printf.c  | 451 +++++++++++++++++++++++++++++++++++---------
 src/builtins.def.in |   1 +
 2 files changed, 361 insertions(+), 91 deletions(-)

Comments

Herbert Xu Nov. 19, 2018, 10:34 a.m. UTC | #1
Devin Hussey <husseydevin@gmail.com> wrote:
> From: Devin Hussey <husseydevin@gmail.com>
> 
> I added the seq builtin.
> 
> usage: seq [-w] [-f format] [-s string] [-t string] [first [incr]] last
> 
> Some notes:
>    - 100% from scratch, aside from the small num_width algorithm I took from
>      StackOverflow and credited accordingly.
>    - I optimize the math by using integer math if we don't have any decimals,
>      intmax_t is large enough to support the number range, and if the user
>      didn't specify a custom format. GNU Coreutils also does this.
>      - If we use integers, we precalculate the width and snprintf a format
>        specifer from that for the -w option.
>    - I reuse some of the printf code to act as the validator for the format
>      string. I want to separate this more for the best possible const
>      correctness, though (I use a cast), and so the conditionals are a little
>      more clear.
>    - It functionally matches the one supplied with macOS and presumably BSD,
>      but it's faster: It is reliably slightly faster on my system (a wimpy
>      2009 MacBook) to call this
>          ./dash -c 'seq 1 10000'
>      than it is to call
>          seq 1 10000
>      directly, even multiple times, from a bash shell.
> 
> Other changes:
>    - I fixed a few silly gotos in the file, replacing them with negated
>      conditional blocks or early returns.
>    - I isolated the printf parsing into its own function so we can reuse it.
>    - I reordered the static functions to go above the main functions. This
>      makes it easier to follow.
>    - I changed conv_escape to return the number of characters read instead
>      of the string itself so it can accept a constant string without any
>      ugly casting.
> 
> Signed-off-by: Devin Hussey <husseydevin@gmail.com>

Sorry, I'm not adding this builtin to dash as this conflicts with
the goal of dash trying to be minimal.  Perhaps bash or mksh would
be a better option?

Thanks,
diff mbox series

Patch

diff --git a/src/bltin/printf.c b/src/bltin/printf.c
index 7785735..7d67514 100644
--- a/src/bltin/printf.c
+++ b/src/bltin/printf.c
@@ -37,17 +37,18 @@ 
 #include <limits.h>
 #include <stdarg.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <unistd.h>
 
 static int	 conv_escape_str(char *, char **);
-static char	*conv_escape(char *, int *);
+static size_t	 conv_escape(const char *, int *);
 static int	 getchr(void);
 static double	 getdouble(void);
 static uintmax_t getuintmax(int);
 static char	*getstr(void);
 static char	*mklong(const char *, const char *);
-static void      check_conversion(const char *, const char *);
+static void    	 check_conversion(const char *, const char *);
 
 static int	rval;
 static char  **gargv;
@@ -105,109 +106,129 @@  static int print_escape_str(const char *f, int *param, int *array, char *s)
 
 	q[-1] = (!!((f[1] - 's') | done) - 1) & f[2];
 	total += !!q[-1];
-	if (f[1] == 's')
-		goto easy;
 
-	p = makestrspace(len, q);
-	memset(p, 'X', total);
-	p[total] = 0;
+	if (f[1] != 's') {
+		p = makestrspace(len, q);
+		memset(p, 'X', total);
+		p[total] = 0;
 
-	q = stackblock();
-	total = ASPF(&p, f, p);
+		q = stackblock();
+		total = ASPF(&p, f, p);
 
-	len = strchrnul(p, 'X') - p;
-	memcpy(p + len, q, strspn(p + len, "X"));
+		len = strchrnul(p, 'X') - p;
+		memcpy(p + len, q, strspn(p + len, "X"));
+	}
 
-easy:
 	out1mem(p, total);
 
 	popstackmark(&smark);
 	return done;
 }
 
-
-int printfcmd(int argc, char *argv[])
+/*
+ * Either parse or validate a printf string.
+ *
+ * If seq is zero, it will print the formatted characters, functioning for
+ * the printf builtin. format will be modified.
+ *
+ * If seq is non-zero, it will validate a format string for seq, which
+ * only allows EeFfGg, without doing anything else. format will not be modified.
+ */
+static int parse_format(char *format, int seq)
 {
-	char *fmt;
-	char *format;
+#define SKIP1	"#-+ 0"
+#define SKIP2	"*0123456789"
 	int ch;
+	/* whether we found a format argument in format for seq */
+	int found = 0;
+	char *fmt;
+	/*
+	 * Basic algorithm is to scan the format string for conversion
+	 * specifications -- once one is found, find out if the field
+	 * width or precision is a '*'; if it is, gather up value. 
+	 * Note, format strings are reused as necessary to use up the
+	 * provided arguments, arguments of zero/null string are 
+	 * provided to use up the format string.
+	 */
 
-	rval = 0;
-
-	nextopt(nullstr);
-
-	argv = argptr;
-	format = *argv;
-
-	if (!format)
-		error("usage: printf format [arg ...]");
-
-	gargv = ++argv;
+	/* find next format specification */
+	for (fmt = format; (ch = *fmt++) ;) {
 
-#define SKIP1	"#-+ 0"
-#define SKIP2	"*0123456789"
-	do {
-		/*
-		 * Basic algorithm is to scan the format string for conversion
-		 * specifications -- once one is found, find out if the field
-		 * width or precision is a '*'; if it is, gather up value. 
-		 * Note, format strings are reused as necessary to use up the
-		 * provided arguments, arguments of zero/null string are 
-		 * provided to use up the format string.
-		 */
+		const char *start;
+		char nextch;
+		int array[2];
+		int *param;
 
-		/* find next format specification */
-		for (fmt = format; (ch = *fmt++) ;) {
-			char *start;
-			char nextch;
-			int array[2];
-			int *param;
-
-			if (ch == '\\') {
-				int c_ch;
-				fmt = conv_escape(fmt, &c_ch);
-				ch = c_ch;
-				goto pc;
-			}
-			if (ch != '%' || (*fmt == '%' && (++fmt || 1))) {
+		if (ch == '\\') {
+			int c_ch;
+			fmt += conv_escape(fmt, &c_ch);
+			ch = c_ch;
+			goto pc;
+		}
+		if (ch != '%' || (*fmt == '%' && (++fmt || 1))) {
 pc:
+			/* We only output in printf mode */
+			if (!seq)
 				putchar(ch);
-				continue;
-			}
+			continue;
+		}
 
-			/* Ok - we've found a format specification,
-			   Save its address for a later printf(). */
-			start = fmt - 1;
-			param = array;
+		/* Ok - we've found a format specification,
+		   Save its address for a later printf(). */
+		start = fmt - 1;
+		param = array;
+
+		/* skip to field width */
+		fmt += strspn(fmt, SKIP1);
+		if (*fmt == '*') {
+			++fmt;
+			*param++ = getuintmax(1);
+		} else {
+			/* skip to possible '.',
+			 * get following precision
+			 */
+			fmt += strspn(fmt, SKIP2);
+		}
 
-			/* skip to field width */
-			fmt += strspn(fmt, SKIP1);
+		if (*fmt == '.') {
+			++fmt;
 			if (*fmt == '*') {
 				++fmt;
 				*param++ = getuintmax(1);
-			} else {
-				/* skip to possible '.',
-				 * get following precision
-				 */
+			} else
 				fmt += strspn(fmt, SKIP2);
-			}
+		}
 
-			if (*fmt == '.') {
-				++fmt;
-				if (*fmt == '*') {
-					++fmt;
-					*param++ = getuintmax(1);
-				} else
-					fmt += strspn(fmt, SKIP2);
-			}
+		ch = *fmt;
+		if (!ch)
+			error("missing format character");
 
-			ch = *fmt;
-			if (!ch)
-				error("missing format character");
+		/* When invoked as seq, we can only use EeFfGg, and we only use
+		 * it once. We actually printf later, this is just a syntax check. */
+		if (seq) {
+			switch (ch) {
+			case 'e':
+			case 'E':
+			case 'f':
+			case 'F':
+			case 'g':
+			case 'G':
+				/* Only allow it once. */
+				if (!found) {
+					found = 1;
+					break;
+				}
+				/* else FALLTHROUGH */
+			default:
+				error("'%s': invalid format string", format);
+				break;
+			}
+		} else {
 			/* null terminate format string to we can use it
 			   as an argument to printf. */
 			nextch = fmt[1];
 			fmt[1] = 0;
+
 			switch (ch) {
 
 			case 'b':
@@ -215,7 +236,8 @@  pc:
 				/* escape if a \c was encountered */
 				if (print_escape_str(start, param, array,
 						     getstr()))
-					goto out;
+					return rval;
+
 				*fmt = 'b';
 				break;
 			case 'c': {
@@ -261,13 +283,13 @@  pc:
 			}
 			*++fmt = nextch;
 		}
-	} while (gargv != argv && *gargv);
+	}
+	if (seq)
+		return found;
 
-out:
-	return rval;
+	return 0;
 }
 
-
 /*
  * Print SysV echo(1) style escape string 
  *	Halts processing string if a \c escape is encountered.
@@ -303,7 +325,7 @@  conv_escape_str(char *str, char **sp)
 			str++;
 
 		/* Finally test for sequences valid in the format string */
-		str = conv_escape(str - 1, &c);
+		str += conv_escape(str - 1, &c);
 	} while (STPUTC(c, cp), (char)ch);
 
 	*sp = cp;
@@ -314,11 +336,11 @@  conv_escape_str(char *str, char **sp)
 /*
  * Print "standard" escape characters 
  */
-static char *
-conv_escape(char *str, int *conv_ch)
+static size_t conv_escape(const char *str, int *conv_ch)
 {
 	int value;
 	int ch;
+	size_t i = 0;
 
 	ch = *str;
 
@@ -333,8 +355,8 @@  conv_escape(char *str, int *conv_ch)
 		value = 0;
 		do {
 			value <<= 3;
-			value += octtobin(*str++);
-		} while (isodigit(*str) && --ch);
+			value += octtobin(str[i++]);
+		} while (isodigit(str[0]) && --ch);
 		goto out;
 
 	case '\\':	value = '\\';	break;	/* backslash */
@@ -347,10 +369,10 @@  conv_escape(char *str, int *conv_ch)
 	case 'v':	value = '\v';	break;	/* vertical-tab */
 	}
 
-	str++;
+	i++;
 out:
 	*conv_ch = value;
-	return str;
+	return i;
 }
 
 static char *
@@ -405,17 +427,17 @@  getuintmax(int sign)
 
 	cp = *gargv;
 	if (cp == NULL)
-		goto out;
+		return val;
 	gargv++;
 
 	val = (unsigned char) cp[1];
 	if (*cp == '\"' || *cp == '\'')
-		goto out;
+		return val;
 
 	errno = 0;
 	val = sign ? strtoimax(cp, &ep, 0) : strtoumax(cp, &ep, 0);
 	check_conversion(cp, ep);
-out:
+
 	return val;
 }
 
@@ -454,6 +476,253 @@  check_conversion(const char *s, const char *ep)
 	}
 }
 
+
+/* seq stuff here */
+
+#define PO10_LIMIT (INTMAX_MAX/10)
+
+/*
+ * Gets the width in digits of an integer. We return uint8_t to truncate the value.
+ * Credit goes to Curd at https://stackoverflow.com/a/4143288 for the original algorithm
+ * which I modified slightly.
+ */
+static uint8_t num_width(intmax_t i)
+{
+	intmax_t n = 1, po10 = 10;
+
+	/* Flip the sign and increase n for the sign */
+	if (i < 0) {
+		i = -i;
+		++n;
+	}
+
+	while (i >= po10) {
+		++n;
+		if (po10 > PO10_LIMIT)
+			break;
+		po10 *= 10;
+	}
+	return (uint8_t)n;
+}
+
+static inline void sequsage(void)
+{
+	error("usage: seq [-w] [-f format] [-s string] [-t string] [first [incr]] last");
+}
+
+/* Checks if a double is in the range of intmax_t. */
+static inline int in_int_range(const double x)
+{
+	return (x >= INTMAX_MIN && x <= INTMAX_MAX);
+}
+
+/* Whether we want to use int math or double math. */
+static int seq_use_int = 0;
+
+/* strtod which automatically errors on failure and sets seq_use_int if needed. */
+static inline double strtod_seq(char *str)
+{
+	char *end;
+	double ret = strtod(str, &end);
+
+	/* strtod will set end to the first non-matching character.
+	 * If end is equal to str or end[0] is not a null terminator,
+	 * we have a bad number. */
+	if (end == str || end[0] != '\0')
+		error("%s: invalid number", str);
+
+	/* Check if we have a '.' in the number, if so, force double math. */
+	if (seq_use_int && strchr(str, '.') != NULL) {
+		seq_use_int = 0;
+	}
+	return ret;
+}
+
+/*
+ * The seqcmd builtin.
+ */
+int seqcmd(int argc, char **argv)
+{
+	const char *fmt = NULL;
+	char *sep = NULL;
+	char *endstr = NULL;
+
+	double first = 1.0;
+	double incr = 1.0;
+	double last = 0.0;
+
+	int seq_wflag = 0;
+	seq_use_int = 1;
+
+	if (argc == 1)
+		sequsage();
+
+	--argc; ++argv;
+
+	int c;
+
+	opterr = 0; optind = 0; optreset = 1;
+	while ((c = getopt(argc, argv, "+f:s:t:w")) != -1) {
+		switch (c) {
+		/* Format string */
+		case 'f':
+			/* no empty strings allowed */
+			if (optarg[0] == '\0') {
+				sequsage();
+			} else {
+				fmt = optarg;
+				/* We force using double math if they specified format */
+				seq_use_int = 0;
+			}
+			break;
+		/* Separator */
+		case 's':
+			sep = optarg;
+			break;
+		/* Terminator */
+		case 't':
+			endstr = optarg;
+			break;
+		/* Constant width */
+		case 'w':
+			seq_wflag = 1;
+			break;
+		case '?':
+			/* Negative numbers trigger getopt. We force exit the loop and rewind when this happens. */
+			if (isdigit(optopt)) {
+				--optind;
+				goto out;
+			}
+			/* else FALLTHROUGH */
+		default:
+			sequsage();
+			break;
+		}
+	}
+out:
+	argc -= optind; argv += optind;
+
+	if (argc < 1 || argc > 3)
+		sequsage();
+
+	/* first is always the first number when there is more than 1 argument */
+	if (argc > 1) {
+		first = strtod_seq(*argv);
+		--argc; ++argv;
+	}
+
+	/* incr is in the middle. */
+	if (argc == 2) {
+		incr = strtod_seq(*argv);
+		/* Don't allow negative or zero values for incr. */
+		if (incr < 0.0) {
+			error("increment/decrement must be greater than zero");
+		}
+		--argc; ++argv;
+	}
+
+	/* the last argument is always the last number */
+	last = strtod_seq(*argv);
+
+	/* check if we have whole numbers larger than what can be represented in intmax_t */
+	if (seq_use_int) {
+		seq_use_int = in_int_range(first) && in_int_range(last);
+	}
+
+	/* If first is less than last, we go backwards. */
+	int forwards = first <= last;
+	if (!forwards) {
+		incr *= -1.0;
+	}
+
+	const char default_sep = '\n';
+
+	if (fmt == NULL) {
+		/* If the user did not specify a format and only used whole numbers, we can cast away the
+		 * double and use faster int math. glibc also does this. */
+		if (seq_use_int) {
+			intmax_t int_first = (intmax_t)first;
+			intmax_t int_last  = (intmax_t)last;
+			const intmax_t int_incr  = (const intmax_t)incr;
+
+			/* We snprintf the width format into the buffer. */
+			/* % 2 5 5 sizeof(PRIdMAX) - we have an extra digit even though int64_t is max 20 digits. */
+			char fmt_w_int[5 + sizeof(PRIdMAX)];
+
+			if (seq_wflag) {
+				/* Calculate the width we need for this string. We use uint8_t to be safe. */
+				const uint8_t first_width = num_width(int_first);
+				const uint8_t last_width = num_width(int_last);
+				const uint8_t width = (first_width > last_width) ? first_width : last_width;
+
+				/* snprintf the custom format string, resulting in %<width>d. */
+				snprintf(fmt_w_int, sizeof(fmt_w_int), "%%%u" PRIdMAX, width);
+			}
+			/* Loop through the numbers with int math. */
+			for (intmax_t i = int_first; (forwards) ? (i <= int_last) : (i >= int_last); i += int_incr) {
+				/* printf either the string we just made above, or "%" PRIdMAX if not. */
+				printf(seq_wflag ? fmt_w_int : "%" PRIdMAX, i);
+				/* separator */
+				if (sep == NULL)
+					putchar(default_sep);
+				else
+					print_escape_str("%s", NULL, NULL, sep);
+			}
+			/* Everything else is double math */
+			goto end;
+		} else {
+			/* -w uses %e, otherwise, it uses %g. */
+			fmt = (seq_wflag) ? "%e" : "%g";
+		}
+	} else {
+		/* We reuse the code in printf's code to validate our format string.
+		 * It is safe, albeit ugly, to cast away the const. We don't change it and it isn't readonly
+		 * anyways (it will be from argv). */
+		parse_format((char *)fmt, 1);
+	}
+
+	/* Loop through the numbers with double math. */
+	for (double i = first; forwards ? (i <= last) : (i >= last); i += incr) {
+		printf(fmt, i);
+		/* separator */
+		if (sep == NULL)
+			putchar(default_sep);
+		else
+			print_escape_str("%s", NULL, NULL, sep);
+	}
+
+end:
+	/* Print the ending string */
+	if (endstr != NULL)
+		print_escape_str("%s", NULL, NULL, endstr);
+	return 0;
+}
+
+int printfcmd(int argc, char *argv[])
+{
+	char *format;
+
+	rval = 0;
+
+	nextopt(nullstr);
+
+	argv = argptr;
+	format = *argv;
+
+	if (!format)
+		error("usage: printf format [arg ...]");
+
+	gargv = ++argv;
+
+	do {
+		int ret = parse_format(format, 0);
+		if (ret != 0)
+			return ret;
+	} while (gargv != argv && *gargv);
+
+	return rval;
+}
+
 int
 echocmd(int argc, char **argv)
 {
diff --git a/src/builtins.def.in b/src/builtins.def.in
index 95e420c..e15811b 100644
--- a/src/builtins.def.in
+++ b/src/builtins.def.in
@@ -76,6 +76,7 @@  printfcmd	printf
 pwdcmd		-u pwd
 readcmd		-u read
 returncmd	-s return
+seqcmd		-u seq
 setcmd		-s set
 shiftcmd	-s shift
 timescmd	-s times