diff mbox

[v3] parser: Add syntax stack for recursive parsing

Message ID 20180308161402.GA9757@gondor.apana.org.au (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Headers show

Commit Message

Herbert Xu March 8, 2018, 4:14 p.m. UTC
On Thu, Mar 08, 2018 at 12:36:04PM +0100, Harald van Dijk wrote:
>
> The first line of this part of my patch is about something else:
> 
>   x=\"; cat <<EOF
>   ${x#"\""}
>   EOF
> 
> This shouldn't print anything.

Right.  So here is a new patch with that change thrown in.

---8<---
Without a stack of syntaxes we cannot correctly these two cases
together:

        "${a#'$$'}"
        "${a#"${b-'$$'}"}"

A recursive parser also helps in some other corner cases such
as nested arithmetic expansion with paratheses.

This patch adds a syntax stack allocated from the stack using
alloca.  As a side-effect this allows us to remove the naked
backslashes for patterns within double-quotes, which means that
EXP_QPAT also has to go.

This patch also fixes removes any backslashes that precede right
braces when they are present within a parameter expansion context,
and backslashes that precede double quotes within inner double
quotes inside a parameter expansion in a here-document context.

The idea of a recursive parser is based on a patch by Harald van
Dijk.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Comments

Harald van Dijk March 8, 2018, 6:35 p.m. UTC | #1
On 08/03/2018 17:14, Herbert Xu wrote:
> On Thu, Mar 08, 2018 at 12:36:04PM +0100, Harald van Dijk wrote:
>>
>> The first line of this part of my patch is about something else:
>>
>>    x=\"; cat <<EOF
>>    ${x#"\""}
>>    EOF
>>
>> This shouldn't print anything.
> 
> Right.  So here is a new patch with that change thrown in.

Related:

   x=*; cat <<EOF
   ${x#'*'}
   EOF

This shouldn't print anything either: because the * is quoted, it should 
be taken as a literal and removed from $x.

Re-testing, I see this didn't work properly with my patch either.

Cheers,
Harald van Dijk
--
To unsubscribe from this list: send the line "unsubscribe dash" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/expand.c b/src/expand.c
index 2a50830..903e250 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -83,7 +83,7 @@ 
 #define RMESCAPE_HEAP	0x10	/* Malloc strings instead of stalloc */
 
 /* Add CTLESC when necessary. */
-#define QUOTES_ESC	(EXP_FULL | EXP_CASE | EXP_QPAT)
+#define QUOTES_ESC	(EXP_FULL | EXP_CASE)
 /* Do not skip NUL characters. */
 #define QUOTES_KEEPNUL	EXP_TILDE
 
@@ -333,16 +333,6 @@  addquote:
 		case CTLESC:
 			startloc++;
 			length++;
-
-			/*
-			 * Quoted parameter expansion pattern: remove quote
-			 * unless inside inner quotes or we have a literal
-			 * backslash.
-			 */
-			if (((flag | inquotes) & (EXP_QPAT | EXP_QUOTED)) ==
-			    EXP_QPAT && *p != '\\')
-				break;
-
 			goto addquote;
 		case CTLVAR:
 			p = evalvar(p, flag | inquotes);
@@ -651,8 +641,7 @@  subevalvar(char *p, char *str, int strloc, int subtype, int startloc, int varfla
 	char *(*scan)(char *, char *, char *, char *, int , int);
 
 	argstr(p, EXP_TILDE | (subtype != VSASSIGN && subtype != VSQUESTION ?
-			       (flag & (EXP_QUOTED | EXP_QPAT) ?
-			        EXP_QPAT : EXP_CASE) : 0));
+			       EXP_CASE : 0));
 	STPUTC('\0', expdest);
 	argbackq = saveargbackq;
 	startp = stackblock() + startloc;
@@ -1644,7 +1633,6 @@  char *
 _rmescapes(char *str, int flag)
 {
 	char *p, *q, *r;
-	unsigned inquotes;
 	int notescaped;
 	int globbing;
 
@@ -1674,24 +1662,23 @@  _rmescapes(char *str, int flag)
 			q = mempcpy(q, str, len);
 		}
 	}
-	inquotes = 0;
 	globbing = flag & RMESCAPE_GLOB;
 	notescaped = globbing;
 	while (*p) {
 		if (*p == (char)CTLQUOTEMARK) {
-			inquotes = ~inquotes;
 			p++;
 			notescaped = globbing;
 			continue;
 		}
+		if (*p == '\\') {
+			/* naked back slash */
+			notescaped = 0;
+			goto copy;
+		}
 		if (*p == (char)CTLESC) {
 			p++;
 			if (notescaped)
 				*q++ = '\\';
-		} else if (*p == '\\' && !inquotes) {
-			/* naked back slash */
-			notescaped = 0;
-			goto copy;
 		}
 		notescaped = globbing;
 copy:
diff --git a/src/expand.h b/src/expand.h
index 26dc5b4..90f5328 100644
--- a/src/expand.h
+++ b/src/expand.h
@@ -55,7 +55,6 @@  struct arglist {
 #define	EXP_VARTILDE	0x4	/* expand tildes in an assignment */
 #define	EXP_REDIR	0x8	/* file glob for a redirection (1 match only) */
 #define EXP_CASE	0x10	/* keeps quotes around for CASE pattern */
-#define EXP_QPAT	0x20	/* pattern in quoted parameter expansion */
 #define EXP_VARTILDE2	0x40	/* expand tildes after colons only */
 #define EXP_WORD	0x80	/* expand word in parameter expansion */
 #define EXP_QUOTED	0x100	/* expand word in double quotes */
diff --git a/src/parser.c b/src/parser.c
index 382658e..3aeb9f6 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -80,6 +80,18 @@  struct heredoc {
 	int striptabs;		/* if set, strip leading tabs */
 };
 
+struct synstack {
+	const char *syntax;
+	struct synstack *prev;
+	struct synstack *next;
+	int innerdq;
+	int varpushed;
+	int dblquote;
+	int varnest;		/* levels of variables expansion */
+	int parenlevel;		/* levels of parens in arithmetic */
+	int dqvarnest;		/* levels of variables expansion within double quotes */
+};
+
 
 
 struct heredoc *heredoclist;	/* list of here documents to read */
@@ -847,6 +859,21 @@  static int pgetc_eatbnl(void)
 	return c;
 }
 
+static void synstack_push(struct synstack **stack, struct synstack *next,
+			  const char *syntax)
+{
+	memset(next, 0, sizeof(*next));
+	next->syntax = syntax;
+	next->next = *stack;
+	(*stack)->prev = next;
+	*stack = next;
+}
+
+static void synstack_pop(struct synstack **stack)
+{
+	*stack = (*stack)->next;
+}
+
 
 
 /*
@@ -876,24 +903,15 @@  readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 	size_t len;
 	struct nodelist *bqlist;
 	int quotef;
-	int dblquote;
-	int varnest;	/* levels of variables expansion */
-	int arinest;	/* levels of arithmetic expansion */
-	int parenlevel;	/* levels of parens in arithmetic */
-	int dqvarnest;	/* levels of variables expansion within double quotes */
 	int oldstyle;
-	/* syntax before arithmetic */
-	char const *uninitialized_var(prevsyntax);
+	/* syntax stack */
+	struct synstack synbase = { .syntax = syntax };
+	struct synstack *synstack = &synbase;
 
-	dblquote = 0;
 	if (syntax == DQSYNTAX)
-		dblquote = 1;
+		synstack->dblquote = 1;
 	quotef = 0;
 	bqlist = NULL;
-	varnest = 0;
-	arinest = 0;
-	parenlevel = 0;
-	dqvarnest = 0;
 
 	STARTSTACKSTR(out);
 	loop: {	/* for each line, until end of word */
@@ -901,7 +919,7 @@  readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 		if (c == '\034' && doprompt
 		 && attyset() && ! equal(termval(), "emacs")) {
 			attyline();
-			if (syntax == BASESYNTAX)
+			if (synstack->syntax == BASESYNTAX)
 				return readtoken();
 			c = pgetc();
 			goto loop;
@@ -910,9 +928,9 @@  readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 		CHECKEND();	/* set c to PEOF if at end of here document */
 		for (;;) {	/* until end of line or end of word */
 			CHECKSTRSPACE(4, out);	/* permit 4 calls to USTPUTC */
-			switch(syntax[c]) {
+			switch(synstack->syntax[c]) {
 			case CNL:	/* '\n' */
-				if (syntax == BASESYNTAX)
+				if (synstack->syntax == BASESYNTAX)
 					goto endword;	/* exit outer loop */
 				USTPUTC(c, out);
 				nlprompt();
@@ -922,7 +940,7 @@  readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 				USTPUTC(c, out);
 				break;
 			case CCTL:
-				if (eofmark == NULL || dblquote)
+				if (eofmark == NULL || synstack->dblquote)
 					USTPUTC(CTLESC, out);
 				USTPUTC(c, out);
 				break;
@@ -937,13 +955,18 @@  readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 					nlprompt();
 				} else {
 					if (
-						dblquote &&
+						synstack->dblquote &&
 						c != '\\' && c != '`' &&
 						c != '$' && (
 							c != '"' ||
-							eofmark != NULL
+							(eofmark != NULL &&
+							 !synstack->varnest)
+						) && (
+							c != '}' ||
+							!synstack->varnest
 						)
 					) {
+						USTPUTC(CTLESC, out);
 						USTPUTC('\\', out);
 					}
 					USTPUTC(CTLESC, out);
@@ -952,55 +975,64 @@  readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 				}
 				break;
 			case CSQUOTE:
-				syntax = SQSYNTAX;
+				synstack->syntax = SQSYNTAX;
 quotemark:
 				if (eofmark == NULL) {
 					USTPUTC(CTLQUOTEMARK, out);
 				}
 				break;
 			case CDQUOTE:
-				syntax = DQSYNTAX;
-				dblquote = 1;
+				synstack->syntax = DQSYNTAX;
+				synstack->dblquote = 1;
+toggledq:
+				if (synstack->varnest)
+					synstack->innerdq ^= 1;
 				goto quotemark;
 			case CENDQUOTE:
-				if (eofmark && !varnest)
+				if (eofmark && !synstack->varnest) {
 					USTPUTC(c, out);
-				else {
-					if (dqvarnest == 0) {
-						syntax = BASESYNTAX;
-						dblquote = 0;
-					}
-					quotef++;
-					goto quotemark;
+					break;
 				}
-				break;
+
+				if (synstack->dqvarnest == 0) {
+					synstack->syntax = BASESYNTAX;
+					synstack->dblquote = 0;
+				}
+
+				quotef++;
+
+				if (c == '"')
+					goto toggledq;
+
+				goto quotemark;
 			case CVAR:	/* '$' */
 				PARSESUB();		/* parse substitution */
 				break;
 			case CENDVAR:	/* '}' */
-				if (varnest > 0) {
-					varnest--;
-					if (dqvarnest > 0) {
-						dqvarnest--;
-					}
+				if (!synstack->innerdq &&
+				    synstack->varnest > 0) {
+					if (!--synstack->varnest &&
+					    synstack->varpushed)
+						synstack_pop(&synstack);
+					else if (synstack->dqvarnest > 0)
+						synstack->dqvarnest--;
 					USTPUTC(CTLENDVAR, out);
 				} else {
 					USTPUTC(c, out);
 				}
 				break;
 			case CLP:	/* '(' in arithmetic */
-				parenlevel++;
+				synstack->parenlevel++;
 				USTPUTC(c, out);
 				break;
 			case CRP:	/* ')' in arithmetic */
-				if (parenlevel > 0) {
+				if (synstack->parenlevel > 0) {
 					USTPUTC(c, out);
-					--parenlevel;
+					--synstack->parenlevel;
 				} else {
 					if (pgetc() == ')') {
 						USTPUTC(CTLENDARI, out);
-						if (!--arinest)
-							syntax = prevsyntax;
+						synstack_pop(&synstack);
 					} else {
 						/*
 						 * unbalanced parens
@@ -1019,7 +1051,7 @@  quotemark:
 			case CIGN:
 				break;
 			default:
-				if (varnest == 0)
+				if (synstack->varnest == 0)
 					goto endword;	/* exit outer loop */
 				if (c != PEOA) {
 					USTPUTC(c, out);
@@ -1029,11 +1061,11 @@  quotemark:
 		}
 	}
 endword:
-	if (syntax == ARISYNTAX)
+	if (synstack->syntax == ARISYNTAX)
 		synerror("Missing '))'");
-	if (syntax != BASESYNTAX && eofmark == NULL)
+	if (synstack->syntax != BASESYNTAX && eofmark == NULL)
 		synerror("Unterminated quoted string");
-	if (varnest != 0) {
+	if (synstack->varnest != 0) {
 		/* { */
 		synerror("Missing '}'");
 	}
@@ -1210,6 +1242,8 @@  parsesub: {
 			PARSEBACKQNEW();
 		}
 	} else {
+		const char *newsyn = synstack->syntax;
+
 		USTPUTC(CTLVAR, out);
 		typeloc = out - (char *)stackblock();
 		STADJUST(1, out);
@@ -1260,6 +1294,8 @@  varname:
 		}
 
 		if (subtype == 0) {
+			int cc = c;
+
 			switch (c) {
 			case ':':
 				subtype = VSNUL;
@@ -1273,27 +1309,40 @@  varname:
 				break;
 			case '%':
 			case '#':
-				{
-					int cc = c;
-					subtype = c == '#' ? VSTRIMLEFT :
-							     VSTRIMRIGHT;
-					c = pgetc_eatbnl();
-					if (c == cc)
-						subtype++;
-					else
-						pungetc();
-					break;
-				}
+				subtype = c == '#' ? VSTRIMLEFT :
+						     VSTRIMRIGHT;
+				c = pgetc_eatbnl();
+				if (c == cc)
+					subtype++;
+				else
+					pungetc();
+
+				newsyn = BASESYNTAX;
+				break;
 			}
 		} else {
 badsub:
 			pungetc();
 		}
+
+		if (newsyn == ARISYNTAX && subtype > VSNORMAL)
+			newsyn = DQSYNTAX;
+
+		if (newsyn != synstack->syntax) {
+			synstack_push(&synstack,
+				      synstack->prev ?:
+				      alloca(sizeof(*synstack)),
+				      newsyn);
+
+			synstack->varpushed++;
+			synstack->dblquote = newsyn != BASESYNTAX;
+		}
+
 		*((char *)stackblock() + typeloc) = subtype;
 		if (subtype != VSNORMAL) {
-			varnest++;
-			if (dblquote)
-				dqvarnest++;
+			synstack->varnest++;
+			if (synstack->dblquote)
+				synstack->dqvarnest++;
 		}
 		STPUTC('=', out);
 	}
@@ -1352,7 +1401,7 @@  parsebackq: {
 					continue;
 				}
                                 if (pc != '\\' && pc != '`' && pc != '$'
-                                    && (!dblquote || pc != '"'))
+                                    && (!synstack->dblquote || pc != '"'))
                                         STPUTC('\\', pout);
 				if (pc > PEOA) {
 					break;
@@ -1428,10 +1477,10 @@  done:
  */
 parsearith: {
 
-	if (++arinest == 1) {
-		prevsyntax = syntax;
-		syntax = ARISYNTAX;
-	}
+	synstack_push(&synstack,
+		      synstack->prev ?: alloca(sizeof(*synstack)),
+		      ARISYNTAX);
+	synstack->dblquote = 1;
 	USTPUTC(CTLARI, out);
 	goto parsearith_return;
 }