From patchwork Sun Apr 28 03:56:58 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645827 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6292123BB for ; Sun, 28 Apr 2024 03:56:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276607; cv=none; b=JIb4lM879Vw+bcGAU1xAxEK8U9yq+wNSjClVAQh3nvJXd7RyPa7jmxZRucSbZXN9hdpjY5MKU3cB5as7dHI6ZBk73Qb6Wh++QYmWQ96Ns+WsL2MBRYsi7P5Dl0YeJEJtbWRYurEleT0kCe9NYXr1ckHFJy8OFpXL4RQHeZaLygE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276607; c=relaxed/simple; bh=PTuY1FwaYoW0+ftpuGe503VHsrk4LYewOCTtoEdnLtQ=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; b=nWxixvcd53cWe3VgOsWEzD2ReHuNi8G5lhmh5YbVVzzn6Kw4wKSeFVV3RmBjULE0n2VA63bsL4eHj4EuO6ypk0zgROYZUDVn5CM9HcJeRC5PnEKGp+1AEvTf3PdWJ2xpredsLyTED8vyE+av3kicsxW2l0K2VzYtHozY2DDvYS4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0veZ-007PZ9-38; Sun, 28 Apr 2024 11:56:41 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:56:58 +0800 Date: Sun, 28 Apr 2024 11:56:58 +0800 Message-Id: 
In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 1/8] shell: Call setlocale To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Call setlocale to initialise locale settings for libc. Signed-off-by: Herbert Xu --- src/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main.c b/src/main.c index 7beb280..1e192f8 100644 --- a/src/main.c +++ b/src/main.c @@ -32,6 +32,7 @@ * SUCH DAMAGE. */ +#include #include #include #include @@ -101,6 +102,9 @@ main(int argc, char **argv) #if PROFILE monitor(4, etext, profile_buf, sizeof profile_buf, 50); #endif + + setlocale(LC_ALL, ""); + state = 0; if (unlikely(setjmp(main_handler.loc))) { int e; From patchwork Sun Apr 28 03:57:00 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645828 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9C7C21876 for ; Sun, 28 Apr 2024 03:56:45 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276608; cv=none; b=p0KoKSXWuBcKtyXUaSUi1envf6xRIm6vhAo05TJYZ7OrDFFIEAG4yuX2Jt2t5UijnK45RmhjuzSY1gzYCcbaylylNX3LZU3lQxPAyMdI/jcY6G/4AMj0cKjUexsfQFz3xLMvNO/XmX+KwCdTEshjS1atqVurFwK3IiQ8xIHfZPE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276608; c=relaxed/simple; bh=QSNOuGchN7VIKs3Dii5wq9S5Z13unfd2KegrKIUOKMs=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; b=syFM0n40g4TEF+76QIdiLECBsf1mtt5FK+8VJLRs/DgOaHwW5IECI4/O+D5gvYEwl+r1g10a8yOpF3dkVfar76oL5IZaed845u6Eyqmn+2wLQ8Iqv38vG0x9bEMBxf7cxxHU9C3W5SxeQCyEWDjR4O1VlXn8y754/clpJorc+6w= 
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0vec-007PZL-0x; Sun, 28 Apr 2024 11:56:43 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:00 +0800 Date: Sun, 28 Apr 2024 11:57:00 +0800 Message-Id: <6bb4ba1fe8c3bf97b993aa037dda946ff6947d86.1714276539.git.herbert@gondor.apana.org.au> In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 2/8] shell: Use strcoll instead of strcmp where applicable To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Use strcoll instead of strcmp so that the locale is taken into account when sorting strings during pathname expansion, and for the built-in test(1) string comparison operators. 
Signed-off-by: Herbert Xu --- src/bltin/test.c | 8 ++++---- src/expand.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bltin/test.c b/src/bltin/test.c index fd8a43b..2db4d0f 100644 --- a/src/bltin/test.c +++ b/src/bltin/test.c @@ -353,13 +353,13 @@ binop(void) /* NOTREACHED */ #endif case STREQ: - return strcmp(opnd1, opnd2) == 0; + return strcoll(opnd1, opnd2) == 0; case STRNE: - return strcmp(opnd1, opnd2) != 0; + return strcoll(opnd1, opnd2) != 0; case STRLT: - return strcmp(opnd1, opnd2) < 0; + return strcoll(opnd1, opnd2) < 0; case STRGT: - return strcmp(opnd1, opnd2) > 0; + return strcoll(opnd1, opnd2) > 0; case INTEQ: return getn(opnd1) == getn(opnd2); case INTNE: diff --git a/src/expand.c b/src/expand.c index 0db2b29..9ac981e 100644 --- a/src/expand.c +++ b/src/expand.c @@ -1476,7 +1476,7 @@ msort(struct strlist *list, int len) p = msort(p, len - half); /* sort second half */ lpp = &list; for (;;) { - if (strcmp(p->text, q->text) < 0) { + if (strcoll(p->text, q->text) < 0) { *lpp = p; lpp = &p->next; if ((p = *lpp) == NULL) { From patchwork Sun Apr 28 03:57:02 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645829 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A882023BB for ; Sun, 28 Apr 2024 03:56:48 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276611; cv=none; b=WLI0PF2avbkcreRuCV45jihBO1g7/R8sphQ9RS1n1VL+VQIdt9e/A213EKML4OaleA0LxSO9xzjkvrt64NwfGBOYthMxAVucCtIrYSwlNhlp+OURza4oIZexHxWnD8THcX5awu3+dEN09RzqCzxZbOD7oDOhj2eMpXI0K552pGo= ARC-Message-Signature: i=1; a=rsa-sha256; 
d=subspace.kernel.org; s=arc-20240116; t=1714276611; c=relaxed/simple; bh=WM4K4qsej20xU9qC+ULpNgYM938hmoTCpH8V5sWVgEA=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; b=cPtWR3OSqIeCPlJTX85IjjsnkG2E6wCqP/5BRVz0QcgsmXi7DYg8SFipDsHnq4aZUPb68FOvkrgJodpqBbQmIVTGxckUhBHlAcBbC4SaAQR+5XWSKVbjMYtbahs9Ny4MiGbFeB4XUGQn7+SSSWtDjSLSkGNklDmmiza6EC9k3m0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0vee-007PZX-1r; Sun, 28 Apr 2024 11:56:45 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:02 +0800 Date: Sun, 28 Apr 2024 11:57:02 +0800 Message-Id: <6f609a418ff19ab6ffc7b63cc3f2575a101e455d.1714276539.git.herbert@gondor.apana.org.au> In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 3/8] expand: Count multi-byte characters for VSLENGTH To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Count multi-byte characters in variables rather than bytes and return that as the length expansion. Signed-off-by: Herbert Xu --- src/expand.c | 62 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/src/expand.c b/src/expand.c index 9ac981e..ad186b0 100644 --- a/src/expand.c +++ b/src/expand.c @@ -53,6 +53,7 @@ #endif #include #include +#include /* * Routines to expand arguments to commands. 
We have to deal with @@ -796,6 +797,18 @@ really_record: return p; } +static char *chtodest(int c, int flags, char *out) +{ + const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX; + + if ((flags & QUOTES_ESC) && + ((syntax[c] == CCTL) || + (flags & EXP_QUOTED && syntax[c] == CBACK))) + USTPUTC(CTLESC, out); + USTPUTC(c, out); + + return out; +} /* * Put a string on the stack. @@ -803,38 +816,48 @@ really_record: static size_t memtodest(const char *p, size_t len, int flags) { - const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX; + size_t count = 0; char *q; - char *s; + int c; if (unlikely(!len)) return 0; q = makestrspace(len * 2, expdest); - s = q; do { - int c = (signed char)*p++; - if (c) { - if ((flags & QUOTES_ESC) && - ((syntax[c] == CCTL) || - (flags & EXP_QUOTED && syntax[c] == CBACK))) - USTPUTC(CTLESC, q); - } else if (!(flags & EXP_KEEPNUL)) + c = (signed char)*p++; + + if (c) + count++; + else if (!(flags & EXP_KEEPNUL)) continue; - USTPUTC(c, q); + + if (c < 0) { + mbstate_t mbs = {}; + + p--; + do { + q = chtodest(c, flags, q); + } while (mbrlen(p++, 1, &mbs) == -2 && + (c = *p, --len)); + if (!len) + break; + continue; + } + + q = chtodest(c, flags, q); } while (--len); expdest = q; - return q - s; + return count; } static size_t strtodest(const char *p, int flags) { size_t len = strlen(p); - memtodest(p, len, flags); - return len; + return memtodest(p, len, flags); } @@ -856,6 +879,7 @@ varvalue(char *name, int varflags, int flags, int quoted) int discard = (subtype == VSPLUS || subtype == VSLENGTH) | (flags & EXP_DISCARD); ssize_t len = 0; + size_t start; char c; if (!subtype) { @@ -865,9 +889,9 @@ varvalue(char *name, int varflags, int flags, int quoted) sh_error("Bad substitution"); } - flags |= EXP_KEEPNUL; flags &= discard ? 
~QUOTES_ESC : ~0; sep = (flags & EXP_FULL) << CHAR_BIT; + start = expdest - (char *)stackblock(); switch (*name) { case '$': @@ -927,7 +951,7 @@ param: if (*ap && sep) { len++; - memtodest(&sepc, 1, flags); + memtodest(&sepc, 1, flags | EXP_KEEPNUL); } } break; @@ -957,7 +981,7 @@ value: } if (discard) - STADJUST(-len, expdest); + expdest = (char *)stackblock() + start; return len; } @@ -1758,11 +1782,13 @@ casematch(union node *pattern, char *val) static size_t cvtnum(intmax_t num, int flags) { + size_t start = expdest - (char *)stackblock(); int len = max_int_length(sizeof(num)); char buf[len]; len = fmtstr(buf, len, "%" PRIdMAX, num); - return memtodest(buf, len, flags); + memtodest(buf, len, flags); + return (expdest - (char *)stackblock()) - start; } STATIC void From patchwork Sun Apr 28 03:57:05 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645830 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B7C822F43 for ; Sun, 28 Apr 2024 03:56:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276613; cv=none; b=n53ZQe0jgmUD67Ez4L4POVwwXI187aiEFsZxBWkqQxD+Jnvsd4vfzNKcUcTuRKRh1vNzf0cHrvYSnI0L5PCWyQBNKF6fInZq542F4BvlO3+BZ7b2idAGaG2kPQdMb9yLGaJyAf+SSEO1tj7qMBguqiI7CY53ThHelf0xkz2K3CY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276613; c=relaxed/simple; bh=pNKNV+W2oj7oHc53rjrXu/H5VZuqawu5yMMwiCJx1Dk=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; 
b=s7aOSpACERjPd9r0uVkx++u1W2C1m4kkk4F1qJ2LCUvRunCEYdy4E+dCcyKrDkfKbXcY9ug4vKuiNv7nZZt1nYMilHfVNVqCnN4PtqXo03fzxljeeWKI9ifd8uWCGX/ilZXtVDkF3jKF9SNJW0WOON/Vs2HysOKXdjKg8GLsbcs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0veg-007PZl-2w; Sun, 28 Apr 2024 11:56:47 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:05 +0800 Date: Sun, 28 Apr 2024 11:57:05 +0800 Message-Id: <008ebecbab03a2504589f69ae9c2ed1353f7b6a3.1714276539.git.herbert@gondor.apana.org.au> In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 4/8] expand: Process multi-byte characters in subevalvar To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: When trimming variables in subevalvar, process multi-byte characters as one unit instead of their constituent bytes. Signed-off-by: Herbert Xu --- src/expand.c | 192 ++++++++++++++++++++++++++++++++++--------------- src/expand.h | 1 + src/mystring.c | 2 +- src/parser.h | 1 + 4 files changed, 136 insertions(+), 60 deletions(-) diff --git a/src/expand.c b/src/expand.c index ad186b0..60a51b1 100644 --- a/src/expand.c +++ b/src/expand.c @@ -32,27 +32,27 @@ * SUCH DAMAGE. 
*/ -#include -#include -#include +#include #include -#include -#ifdef HAVE_GETPWNAM -#include -#endif -#include -#include -#include -#include -#include #ifdef HAVE_FNMATCH #include #endif #ifdef HAVE_GLOB #include #endif -#include +#include +#include +#ifdef HAVE_GETPWNAM +#include +#endif +#include #include +#include +#include +#include +#include +#include +#include #include /* @@ -550,8 +550,10 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend, loc = startp; loc2 = rmesc; do { - int match; const char *s = loc2; + unsigned ml; + int match; + c = *loc2; if (zero) { *loc2 = '\0'; @@ -560,12 +562,26 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend, match = pmatch(str, s); *loc2 = c; if (match) - return loc; - if (quotes && *loc == (char)CTLESC) + return quotes ? loc : loc2; + + if (!c) + break; + + if (*loc != (char)CTLMBCHAR) { + if (*loc == (char)CTLESC) + loc++; loc++; - loc++; - loc2++; - } while (c); + loc2++; + continue; + } + + if (*++loc == (char)CTLESC) + loc++; + + ml = (unsigned char)*loc; + loc += ml + 3; + loc2 += ml; + } while (1); return 0; } @@ -573,14 +589,16 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend, static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend, char *str, int quotes, int zero ) { - int esc = 0; + size_t esc = 0; char *loc; char *loc2; for (loc = endp, loc2 = rmescend; loc >= startp; loc2--) { - int match; - char c = *loc2; const char *s = loc2; + char c = *loc2; + unsigned ml; + int match; + if (zero) { *loc2 = '\0'; s = rmesc; @@ -588,17 +606,23 @@ static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend, match = pmatch(str, s); *loc2 = c; if (match) - return loc; + return quotes ? 
loc : loc2; loc--; - if (quotes) { - if (--esc < 0) { - esc = esclen(startp, loc); - } - if (esc % 2) { - esc--; - loc--; - } + if (!esc--) + esc = esclen(startp, loc); + if (esc % 2) { + esc--; + loc--; + continue; } + if (*loc != (char)CTLMBCHAR) + continue; + + ml = (unsigned char)*--loc; + loc -= ml + 2; + if (*loc == (char)CTLESC) + loc--; + loc2 -= ml - 1; } return 0; } @@ -652,14 +676,11 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc, nstrloc = str - (char *)stackblock(); } - rmesc = startp; - if (quotes) { - rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW); - if (rmesc != startp) - rmescend = expdest; - startp = stackblock() + startloc; - str = stackblock() + nstrloc; - } + rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW); + if (rmesc != startp) + rmescend = expdest; + startp = stackblock() + startloc; + str = stackblock() + nstrloc; rmescend--; /* zero = subtype == VSTRIMLEFT || subtype == VSTRIMLEFTMAX */ @@ -669,16 +690,29 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc, endp = stackblock() + strloc - 1; loc = scan(startp, endp, rmesc, rmescend, str, quotes, zero); - if (loc) { - if (zero) { - memmove(startp, loc, endp - loc); - loc = startp + (endp - loc); + if (!loc) { + if (quotes) { + rmesc = startp; + rmescend = endp; } - *loc = '\0'; - } else - loc = endp; + } else if (!quotes) { + if (zero) + rmesc = loc; + else + rmescend = loc; + } else if (zero) { + rmesc = loc; + rmescend = endp; + } else { + rmesc = startp; + rmescend = loc; + } + + memmove(startp, rmesc, rmescend - rmesc); + loc = startp + (rmescend - rmesc); out: + *loc = '\0'; amount = loc - expdest; STADJUST(amount, expdest); @@ -704,6 +738,7 @@ evalvar(char *p, int flag) ssize_t varlen; int discard; int quoted; + int mbchar; varflags = *p++ & ~VSBIT; subtype = varflags & VSTYPE; @@ -713,8 +748,18 @@ evalvar(char *p, int flag) startloc = expdest - (char *)stackblock(); p = strchr(p, '=') + 1; + mbchar = 0; + 
switch (subtype) { + case VSTRIMLEFT: + case VSTRIMLEFTMAX: + case VSTRIMRIGHT: + case VSTRIMRIGHTMAX: + mbchar = EXP_MBCHAR; + break; + } + again: - varlen = varvalue(var, varflags, flag, quoted); + varlen = varvalue(var, varflags, flag | mbchar, quoted); if (varflags & VSNUL) varlen--; @@ -801,7 +846,7 @@ static char *chtodest(int c, int flags, char *out) { const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX; - if ((flags & QUOTES_ESC) && + if ((flags & (QUOTES_ESC | EXP_MBCHAR)) && ((syntax[c] == CCTL) || (flags & EXP_QUOTED && syntax[c] == CBACK))) USTPUTC(CTLESC, out); @@ -823,9 +868,13 @@ static size_t memtodest(const char *p, size_t len, int flags) if (unlikely(!len)) return 0; - q = makestrspace(len * 2, expdest); + /* CTLMBCHAR, 2, c, c, 2, CTLMBCHAR */ + q = makestrspace(len * 3, expdest); do { + mbstate_t mbs = {}; + size_t ml; + c = (signed char)*p++; if (c) @@ -833,19 +882,30 @@ static size_t memtodest(const char *p, size_t len, int flags) else if (!(flags & EXP_KEEPNUL)) continue; - if (c < 0) { - mbstate_t mbs = {}; + if (c >= 0) + goto copy; - p--; - do { - q = chtodest(c, flags, q); - } while (mbrlen(p++, 1, &mbs) == -2 && - (c = *p, --len)); - if (!len) - break; - continue; + ml = mbrlen(p - 1, len, &mbs); + if (ml == -1 || ml == -2 || ml < 2 || ml > MB_LEN_MAX) + goto copy; + + if ((flags & (QUOTES_ESC | EXP_MBCHAR))) { + USTPUTC(CTLMBCHAR, q); + USTPUTC(ml, q); } + q = mempcpy(q, p - 1, ml); + + if ((flags & (QUOTES_ESC | EXP_MBCHAR))) { + USTPUTC(ml, q); + USTPUTC(CTLMBCHAR, q); + } + + p += ml - 1; + len -= ml - 1; + continue; + +copy: q = chtodest(c, flags, q); } while (--len); @@ -1720,6 +1780,8 @@ _rmescapes(char *str, int flag) inquotes = 0; notescaped = globbing; while (*p) { + unsigned ml; + if (*p == (char)CTLQUOTEMARK) { p++; inquotes ^= globbing; @@ -1743,6 +1805,18 @@ add_escape: } } notescaped = globbing; + + if (*p != (char)CTLMBCHAR) + goto copy; + + if (*++p == (char)CTLESC) + p++; + + ml = (unsigned char)*p++; + q = 
mempcpy(q, p, ml); + p += ml + 2; + continue; + copy: *q++ = *p++; } diff --git a/src/expand.h b/src/expand.h index 49a18f9..e5a990e 100644 --- a/src/expand.h +++ b/src/expand.h @@ -60,6 +60,7 @@ struct arglist { #define EXP_QUOTED 0x100 /* expand word in double quotes */ #define EXP_KEEPNUL 0x200 /* do not skip NUL characters */ #define EXP_DISCARD 0x400 /* discard result of expansion */ +#define EXP_MBCHAR 0x800 /* mark multi-byte characters */ struct jmploc; diff --git a/src/mystring.c b/src/mystring.c index 5eace6c..77b457c 100644 --- a/src/mystring.c +++ b/src/mystring.c @@ -67,7 +67,7 @@ const char cqchars[] = { #ifdef HAVE_FNMATCH '^', #endif - CTLESC, CTLQUOTEMARK, 0 + CTLESC, CTLMBCHAR, CTLQUOTEMARK, 0 }; const char illnum[] = "Illegal number: %s"; const char homestr[] = "HOME"; diff --git a/src/parser.h b/src/parser.h index 433573d..14bfc4f 100644 --- a/src/parser.h +++ b/src/parser.h @@ -44,6 +44,7 @@ union node; #define CTLVAR -126 /* variable defn */ #define CTLENDVAR -125 #define CTLBACKQ -124 +#define CTLMBCHAR -123 #define CTLARI -122 /* arithmetic expression */ #define CTLENDARI -121 #define CTLQUOTEMARK -120 From patchwork Sun Apr 28 03:57:07 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645831 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 529932566 for ; Sun, 28 Apr 2024 03:56:53 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276615; cv=none; 
b=TC4ShQefqLPr51n/6MuCVvMb7ZY69F0d7/fcfg+hiB61n1yRurHW0EtPQBC+UZJk5xJ6k95jEE5XMXR4RTf+A9ZHgQAW+AVMG4gVC2RhwlVkIrvrXmqiRJtYx0TAs3DSX0lxrllClOMVMeypLpB0UvXWuUXpB+/9G/wfVj+vUAQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276615; c=relaxed/simple; bh=+o2MMlHVZgTfWjXs652tU8pzdcHBFnXBRWwTu28m+Vc=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; b=osAXdZNU77MtdlyWIAnArNIxPXEatXvPL5v3nZ9TDTztSZB6hfRJTRMeNOXqE+Qw2Ua/MbKNgCWwpv914BgTp4THQ1YhpBZDOoKQXd2kOO7AMQkOXrsjFfD+xC4BEKYEtANJxGhrvDbr4yxRm7mffqC5SODk41q97aLobZ8Qrv0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0vej-007PZz-0j; Sun, 28 Apr 2024 11:56:50 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:07 +0800 Date: Sun, 28 Apr 2024 11:57:07 +0800 Message-Id: In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 5/8] expand: Process multi-byte characters in expmeta To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: When glob(3) is not in use, make sure that expmeta processes multi-byte characters correctly. 
Signed-off-by: Herbert Xu --- src/expand.c | 107 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/src/expand.c b/src/expand.c index 60a51b1..0e85025 100644 --- a/src/expand.c +++ b/src/expand.c @@ -84,6 +84,7 @@ #define RMESCAPE_GLOB 0x2 /* Add backslashes for glob */ #define RMESCAPE_GROW 0x8 /* Grow strings instead of stalloc */ #define RMESCAPE_HEAP 0x10 /* Malloc strings instead of stalloc */ +#define RMESCAPE_EMETA 0x20 /* Remove backslashes too */ /* Add CTLESC when necessary. */ #define QUOTES_ESC (EXP_FULL | EXP_CASE) @@ -1347,15 +1348,13 @@ expandmeta(struct strlist *str) savelastp = exparg.lastp; INTOFF; - p = preglob(str->text, RMESCAPE_ALLOC | RMESCAPE_HEAP); + p = str->text; len = strlen(p); expdir_max = len + PATH_MAX; expdir = ckmalloc(expdir_max); expmeta(p, len, 0); ckfree(expdir); - if (p != str->text) - ckfree(p); INTON; if (exparg.lastp == savelastp) { /* @@ -1376,6 +1375,41 @@ nometa: } } +static void expmeta_rmescapes(char *enddir, char *name) +{ + preglob(strcpy(enddir, name), RMESCAPE_EMETA); +} + +static unsigned mbcharlen(char *p) +{ + int esc = 0; + + if (*++p == (char)CTLESC) + esc++; + + return esc + 3 + (unsigned char)p[esc]; +} + +static int skipesc(char *p) +{ + int esc = 0; + + if (p[esc] == (char)CTLMBCHAR) + return esc + mbcharlen(p); + + if (*p == (char)CTLESC) + esc++; + + if (p[esc] == '\\' && p[esc + 1]) { + esc++; + if (p[esc] == (char)CTLMBCHAR) + return esc + mbcharlen(p + esc); + if (p[esc] == (char)CTLESC) + esc++; + } + + return esc; +} /* * Do metacharacter (i.e. *, ?, [...]) expansion. 
@@ -1385,17 +1419,18 @@ STATIC void expmeta(char *name, unsigned name_len, unsigned expdir_len) { char *enddir = expdir + expdir_len; - char *p; + struct stat64 statb; + struct dirent64 *dp; const char *cp; - char *start; char *endname; int metaflag; - struct stat64 statb; - DIR *dirp; - struct dirent64 *dp; - int atend; int matchdot; + char *start; + DIR *dirp; + char *pat; + char *p; int esc; + int c; metaflag = 0; start = name; @@ -1407,8 +1442,7 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len) if (*q == '!') q++; for (;;) { - if (*q == '\\') - q++; + q += skipesc(q); if (*q == '/' || *q == '\0') break; if (*++q == ']') { @@ -1417,8 +1451,8 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len) } } } else { - if (*p == '\\' && p[1]) - esc++; + esc = skipesc(p); + if (p[esc] == '/') { if (metaflag) break; @@ -1429,24 +1463,18 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len) if (metaflag == 0) { /* we've reached the end of the file name */ if (!expdir_len) return; - p = name; - do { - if (*p == '\\' && p[1]) - p++; - *enddir++ = *p; - } while (*p++); + expmeta_rmescapes(enddir, name); if (lstat64(expdir, &statb) >= 0) addfname(expdir); return; } endname = p; if (name < start) { - p = name; - do { - if (*p == '\\' && p[1]) - p++; - *enddir++ = *p++; - } while (p < start); + c = *start; + *start = 0; + expmeta_rmescapes(enddir, name); + *start = c; + enddir += strlen(enddir); } *enddir = 0; cp = expdir; @@ -1455,25 +1483,26 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len) cp = "."; if ((dirp = opendir(cp)) == NULL) return; - if (*endname == 0) { - atend = 1; - } else { - atend = 0; + c = *endname; + if (c) { *endname = '\0'; endname += esc + 1; } name_len -= endname - name; matchdot = 0; p = start; + if (*p == (char)CTLESC) + p++; if (*p == '\\') p++; if (*p == '.') matchdot++; + pat = preglob(start, RMESCAPE_ALLOC | RMESCAPE_HEAP); while (! int_pending() && (dp = readdir64(dirp)) != NULL) { if (dp->d_name[0] == '.' 
&& ! matchdot) continue; - if (pmatch(start, dp->d_name)) { - if (atend) { + if (pmatch(pat, dp->d_name)) { + if (!c) { scopy(dp->d_name, enddir); addfname(expdir); } else { @@ -1496,9 +1525,11 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len) } } } + if (pat != start) + ckfree(pat); closedir(dirp); - if (! atend) - endname[-esc - 1] = esc ? '\\' : '/'; + if (c) + endname[-esc - 1] = c; } #endif /* HAVE_GLOB */ @@ -1743,6 +1774,7 @@ _rmescapes(char *str, int flag) int notescaped; int globbing; int inquotes; + int expmeta; p = strpbrk(str, cqchars); if (!p) { @@ -1751,6 +1783,7 @@ _rmescapes(char *str, int flag) q = p; r = str; globbing = flag & RMESCAPE_GLOB; + expmeta = (flag & RMESCAPE_EMETA) ? RMESCAPE_GLOB : 0; if (flag & RMESCAPE_ALLOC) { size_t len = p - str; @@ -1790,6 +1823,10 @@ _rmescapes(char *str, int flag) if (*p == '\\') { /* naked back slash */ notescaped ^= globbing; + if (expmeta & ~notescaped) { + p++; + continue; + } goto copy; } if (FNMATCH_IS_ENABLED && *p == '^') @@ -1797,7 +1834,9 @@ _rmescapes(char *str, int flag) if (*p == (char)CTLESC) { p++; add_escape: - if (notescaped) + if (expmeta) + ; + else if (notescaped) *q++ = '\\'; else if (inquotes) { *q++ = '\\'; From patchwork Sun Apr 28 03:57:09 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645832 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 41AC62900 for ; Sun, 28 Apr 2024 03:56:55 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276617; cv=none; 
b=KyvoSDL7ByzIR9c+cTtNVAUdeicR0j8JA+qAURnIaHDUETDquZ6t4hGkXFg4MuZCqJ+JsHOZO42fPLR2tuZTE3V3ThFsYErBCL80b2KE/xV6LsFGbeiKLrY44bQLUk8QK+QzaDMSG+2Vffwr4zu4869QvyfrEXcJ6lzIRzWZ808= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276617; c=relaxed/simple; bh=qd9G7hnmDp8igWOW5LymgjpCbWab9ETYgYteQgf2G8Y=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; b=gd3c/a3SkPeBvAVPeNU+1tEwEEg6ul4L337XWQ0h+UJtJ6rCfrNIhBUqx2+HCZ/4YZWerxInQIjB755xcP4aM3rJogx3PpdOMQC8LX8wXypQ733hGa5xqkK3D9hqxtHurGnp8sFj2UO4LilgjCbEUZacJenQEUXhxHbayYkjv6w= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0vel-007PaZ-1i; Sun, 28 Apr 2024 11:56:52 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:09 +0800 Date: Sun, 28 Apr 2024 11:57:09 +0800 Message-Id: In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 6/8] expand: Support multi-byte characters during field splitting To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: When multi-byte characters are used in IFS, they will be used for field splitting. Signed-off-by: Herbert Xu --- src/expand.c | 201 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 140 insertions(+), 61 deletions(-) diff --git a/src/expand.c b/src/expand.c index 0e85025..dd2b71e 100644 --- a/src/expand.c +++ b/src/expand.c @@ -54,6 +54,7 @@ #include #include #include +#include /* * Routines to expand arguments to commands. 
We have to deal with @@ -164,6 +165,30 @@ esclen(const char *start, const char *p) { return esc; } +static __attribute__((noinline)) unsigned mbnext(const char *p) +{ + unsigned start = 0; + unsigned end = 0; + unsigned ml; + int c; + + c = p[end++]; + + switch (c) { + case CTLMBCHAR: + if (p[end] == CTLESC) + end++; + ml = (unsigned char)p[end++]; + start = end; + end = ml + 2; + break; + case CTLESC: + start++; + break; + } + + return start | end << 8; +} static inline const char *getpwhome(const char *name) { @@ -552,6 +577,7 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend, loc2 = rmesc; do { const char *s = loc2; + unsigned mb; unsigned ml; int match; @@ -568,19 +594,9 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend, if (!c) break; - if (*loc != (char)CTLMBCHAR) { - if (*loc == (char)CTLESC) - loc++; - loc++; - loc2++; - continue; - } - - if (*++loc == (char)CTLESC) - loc++; - - ml = (unsigned char)*loc; - loc += ml + 3; + mb = mbnext(loc); + loc += (mb & 0xff) + (mb >> 8); + ml = (mb >> 8) > 3 ? (mb >> 8) - 2 : 1; loc2 += ml; } while (1); return 0; @@ -930,18 +946,22 @@ static size_t strtodest(const char *p, int flags) STATIC ssize_t varvalue(char *name, int varflags, int flags, int quoted) { + int subtype = varflags & VSTYPE; + const char *seps; + ssize_t len = 0; + unsigned seplen; + size_t start; + int discard; + char sepc; + char **ap; + int sep; int num; char *p; int i; - int sep; - char sepc; - char **ap; - int subtype = varflags & VSTYPE; - int discard = (subtype == VSPLUS || subtype == VSLENGTH) | - (flags & EXP_DISCARD); - ssize_t len = 0; - size_t start; - char c; + int c; + + discard = (subtype == VSPLUS || subtype == VSLENGTH) | + (flags & EXP_DISCARD); if (!subtype) { if (discard) @@ -1004,15 +1024,27 @@ numvar: sep &= ~quoted; sep |= ifsset() ? 
(unsigned char)(c & ifsval()[0]) : ' '; param: - sepc = sep; if (!(ap = shellparam.p)) return -1; + sepc = sep; + seps = &sepc; + seplen = 1; + if (sepc < 0) { + mbstate_t mbs = {}; + size_t ml; + + ml = mbrlen(ifsval(), strlen(ifsval()), &mbs); + if (ml != -1 && ml != -2 && ml > 1) { + seps = ifsval(); + seplen = ml; + } + } while ((p = *ap++)) { len += strtodest(p, flags); if (*ap && sep) { len++; - memtodest(&sepc, 1, flags | EXP_KEEPNUL); + memtodest(seps, seplen, flags | EXP_KEEPNUL); } } break; @@ -1074,7 +1106,54 @@ recordregion(int start, int end, int nulonly) ifslastp->nulonly = nulonly; } +static __attribute__((noinline)) unsigned ifsisifs( + const char *p, unsigned ml, const char *ifs, size_t ifslen) +{ + bool isdefifs = false; + size_t slen = ifslen; + const char *s = ifs; + wchar_t c = *p; + bool isifs; + isifs = !c; + if (isifs) { + p = ifs; + c = *p; + slen = 0; + } + + while (slen) { + mbstate_t mbst = {}; + size_t ifsml; + wchar_t c2; + + if ((signed char)*s > 0 || + (ifsml = mbrtowc(&c2, s, slen, &mbst), + ifsml == -2 || ifsml == -1 || ifsml < 2)) { + if (c == *s) { + isifs = true; + break; + } + s++; + slen--; + continue; + } + + if (ifsml == ml && !memcmp(p, s, ifsml)) { + isifs = true; + c = c2; + break; + } + + s += ifsml; + slen -= ifsml; + } + + if (isifs) + isdefifs = iswspace(c); + + return isifs | isdefifs << 1; +} /* * Break the argument string into pieces based upon IFS and add the @@ -1086,16 +1165,16 @@ recordregion(int start, int end, int nulonly) void ifsbreakup(char *string, int maxargs, struct arglist *arglist) { + const char *ifs, *realifs; struct ifsregion *ifsp; struct strlist *sp; + char *r = NULL; + size_t ifslen; char *start; + int nulonly; + int ifsspc; char *p; char *q; - char *r = NULL; - const char *ifs, *realifs; - int ifsspc; - int nulonly; - start = string; if (ifslastp != NULL) { @@ -1110,21 +1189,27 @@ ifsbreakup(char *string, int maxargs, struct arglist *arglist) afternul = nulonly; nulonly = ifsp->nulonly; ifs = 
nulonly ? nullstr : realifs; + ifslen = strlen(ifs); ifsspc = 0; while (p < string + ifsp->endoff) { - int c; - bool isifs; + unsigned ifschar; + unsigned sisifs; bool isdefifs; + unsigned ml; + bool isifs; q = p; - c = *p++; - if (c == (char)CTLESC) - c = *p++; - isifs = strchr(ifs, c); - isdefifs = false; - if (isifs) - isdefifs = strchr(defifs, c); + ifschar = mbnext(p); + p += ifschar & 0xff; + ml = (ifschar >> 8) > 3 ? + (ifschar >> 8) - 2 : 0; + + sisifs = ifsisifs(p, ml, ifs, ifslen); + p += ifschar >> 8; + + isifs = sisifs & 1; + isdefifs = sisifs >> 1; /* If only reading one more argument: * If we have exactly one field, @@ -1380,32 +1465,24 @@ static void expmeta_rmescapes(char *enddir, char *name) preglob(strcpy(enddir, name), RMESCAPE_EMETA); } -static unsigned mbcharlen(char *p) -{ - int esc = 0; - - if (*++p == (char)CTLESC) - esc++; - - return esc + 3 + (unsigned char)p[esc]; -} - static int skipesc(char *p) { + unsigned short mb; int esc = 0; - if (p[esc] == (char)CTLMBCHAR) - return esc + mbcharlen(p); + mb = mbnext(p); + if ((mb >> 8) > 3) + return (mb & 0xff) + (mb >> 8) - 1; - if (*p == (char)CTLESC) - esc++; + esc = mb & 0xff; if (p[esc] == '\\' && p[esc + 1]) { esc++; - if (p[esc] == (char)CTLMBCHAR) - return esc + mbcharlen(p + esc); - if (p[esc] == (char)CTLESC) - esc++; + mb = mbnext(p + esc); + if ((mb >> 8) > 3) + return esc + (mb & 0xff) + (mb >> 8) - 1; + + esc += mb & 0xff; } return esc; @@ -1813,6 +1890,7 @@ _rmescapes(char *str, int flag) inquotes = 0; notescaped = globbing; while (*p) { + unsigned mb; unsigned ml; if (*p == (char)CTLQUOTEMARK) { @@ -1845,13 +1923,14 @@ add_escape: } notescaped = globbing; - if (*p != (char)CTLMBCHAR) + mb = mbnext(p); + ml = mb >> 8; + + if (ml <= 3) goto copy; - if (*++p == (char)CTLESC) - p++; - - ml = (unsigned char)*p++; + ml -= 2; + p += mb & 0xff; q = mempcpy(q, p, ml); p += ml + 2; continue; From patchwork Sun Apr 28 03:57:11 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 
Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645833 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 75F122900 for ; Sun, 28 Apr 2024 03:56:57 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276619; cv=none; b=RQhvvdIXEQzkyC/XXQxVMo7orYWMgnhyRyTYJRpfrfS0xO24rQBV5SYYK5ff7JhtaN5Cr6K8tTxvSuifS2bD01RrzLgpk/EgmrwThSZscr51nUyftvDqh/eR9Rz0T9/dTtl+/fuqKwmN9ys+J9PCoir8G4C1T0rqQY4oejw6f94= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276619; c=relaxed/simple; bh=A2FhhB1B6M9VpjHmg4QUwZ8lF7X6HxYALBXcqawPvrI=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; b=KjnYsX7FuhfAsxeIGY1Kr782a6D5g3yhZgDNPgIyXM9FgXrrLTPI3WJ4ZmzxDTH0QC1vLtSTpIW8ofSHqwDZh3BCTMgK7wgFteM0Yz6769D5TYNHWoBhMplbb4/gjg1bLEFunFpaHy6jjGadB4FWsPlIo0UP/rUMo7NFXSHaRG8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0ven-007PbI-2b; Sun, 28 Apr 2024 11:56:54 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:11 +0800 Date: Sun, 28 Apr 2024 11:57:11 +0800 Message-Id: <9a1c18b16b066510266ed9f14ec954840221e7c1.1714276539.git.herbert@gondor.apana.org.au> In-Reply-To: 
References: From: Herbert Xu Subject: [v2 PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: In order to parse multi-byte characters which may be up to MB_LEN_MAX bytes long, allow enough calls to pungetc to undo a single multi-byte character. Also add a function pungetn to do multiple pungetc calls in a row. Signed-off-by: Herbert Xu --- src/input.c | 58 ++++++++++++++++++++++++++++++++++------------------- src/input.h | 11 +++++----- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/input.c b/src/input.c index fb9858f..c7805ad 100644 --- a/src/input.c +++ b/src/input.c @@ -56,7 +56,7 @@ #include "main.h" #include "myhistedit.h" -#define IBUFSIZ (BUFSIZ + 1) +#define IBUFSIZ (BUFSIZ + PUNGETC_MAX + 1) MKINIT struct parsefile basepf; /* top level input file */ @@ -83,13 +83,16 @@ INIT { } RESET { + int c; + /* clear input buffer */ popallfiles(); - basepf.unget = 0; - while (basepf.lastc[0] != '\n' && - basepf.lastc[0] != PEOF && - !int_pending()) - pgetc(); + + c = PEOF; + if (basepf.nextc - basebuf > basepf.unget) + c = basepf.nextc[-basepf.unget]; + while (c != '\n' && c != PEOF && !int_pending()) + c = pgetc(); } FORKRESET { @@ -131,17 +134,20 @@ static int __pgetc(void) { int c; - if (parsefile->unget) - return parsefile->lastc[--parsefile->unget]; + if (parsefile->unget) { + long unget = -(long)(unsigned)parsefile->unget--; + + if (parsefile->nleft < 0) + return preadbuffer(); + + return parsefile->nextc[unget]; + } if (--parsefile->nleft >= 0) c = (signed char)*parsefile->nextc++; else c = preadbuffer(); - parsefile->lastc[1] = parsefile->lastc[0]; - parsefile->lastc[0] = c; - return c; } @@ -176,9 +182,16 @@ static int stdin_clear_nonblock(void) static int preadfd(void) { + char *buf = parsefile->buf; + int unget; int nr; - char *buf = parsefile->buf; - parsefile->nextc = buf; + + unget = parsefile->nextc - buf; + if 
(unget > PUNGETC_MAX) + unget = PUNGETC_MAX; + + memmove(buf, parsefile->nextc - unget, unget); + parsefile->nextc = buf += unget; retry: #ifndef SMALL @@ -196,8 +209,8 @@ retry: nr = 0; else { nr = el_len; - if (nr > IBUFSIZ - 1) - nr = IBUFSIZ - 1; + if (nr > BUFSIZ) + nr = BUFSIZ; memcpy(buf, rl_cp, nr); if (nr != el_len) { el_len -= nr; @@ -209,9 +222,9 @@ retry: } else #endif if (parsefile->fd) - nr = read(parsefile->fd, buf, IBUFSIZ - 1); + nr = read(parsefile->fd, buf, BUFSIZ); else { - unsigned len = IBUFSIZ - 1; + unsigned len = BUFSIZ; nr = 0; @@ -348,6 +361,11 @@ done: return (signed char)*parsefile->nextc++; } +void pungetn(int n) +{ + parsefile->unget += n; +} + /* * Undo a call to pgetc. Only two characters may be pushed back. * PEOF may be pushed back. @@ -356,7 +374,7 @@ done: void pungetc(void) { - parsefile->unget++; + pungetn(1); } /* @@ -383,7 +401,6 @@ pushstring(char *s, void *ap) sp->prevnleft = parsefile->nleft; sp->unget = parsefile->unget; sp->spfree = parsefile->spfree; - memcpy(sp->lastc, parsefile->lastc, sizeof(sp->lastc)); sp->ap = (struct alias *)ap; if (ap) { ((struct alias *)ap)->flag |= ALIASINUSE; @@ -413,7 +430,6 @@ static void popstring(void) parsefile->nextc = sp->prevstring; parsefile->nleft = sp->prevnleft; parsefile->unget = sp->unget; - memcpy(parsefile->lastc, sp->lastc, sizeof(sp->lastc)); /*dprintf("*** calling popstring: restoring to '%s'\n", parsenextc);*/ parsefile->strpush = sp->prev; parsefile->spfree = sp; @@ -457,7 +473,7 @@ setinputfd(int fd, int push) } parsefile->fd = fd; if (parsefile->buf == NULL) - parsefile->buf = ckmalloc(IBUFSIZ); + parsefile->nextc = parsefile->buf = ckmalloc(IBUFSIZ); input_set_lleft(parsefile, parsefile->nleft = 0); plinno = 1; } diff --git a/src/input.h b/src/input.h index 1ff5773..5b4a045 100644 --- a/src/input.h +++ b/src/input.h @@ -34,12 +34,16 @@ * @(#)input.h 8.2 (Berkeley) 5/4/95 */ +#include + #ifdef SMALL #define IS_DEFINED_SMALL 1 #else #define IS_DEFINED_SMALL 0 #endif 
+#define PUNGETC_MAX (MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + /* PEOF (the end of file marker) is defined in syntax.h */ enum { @@ -59,9 +63,6 @@ struct strpush { /* Delay freeing so we can stop nested aliases. */ struct strpush *spfree; - /* Remember last two characters for pungetc. */ - int lastc[2]; - /* Number of outstanding calls to pungetc. */ int unget; }; @@ -87,9 +88,6 @@ struct parsefile { /* Delay freeing so we can stop nested aliases. */ struct strpush *spfree; - /* Remember last two characters for pungetc. */ - int lastc[2]; - /* Number of outstanding calls to pungetc. */ int unget; }; @@ -106,6 +104,7 @@ extern struct parsefile *parsefile; int pgetc(void); int pgetc2(void); void pungetc(void); +void pungetn(int); void pushstring(char *, void *); int setinputfile(const char *, int); void setinputstring(char *); From patchwork Sun Apr 28 03:57:14 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645834 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F328223BB for ; Sun, 28 Apr 2024 03:56:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276622; cv=none; b=b69Bnr4UJO/XtA92EvuBKNVpGxxWiNUgbvLTklecn8sdWpJOc35dWy3Ys3rd/VpIZPsew6FQa0HafWJnw9ObTlwFUay/XT6akMnis0IBVSWuebBGGlFrTTJUPi1pgKGfvp20vDw5hujyzDNwyk+zeuiCf9sWphC0BbWPJ/eIIjY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714276622; c=relaxed/simple; bh=9DeyUWy+7ZsfY7+O+R9fx+V9wy1CXnGISLbJ6MORgTQ=; h=Date:Message-Id:In-Reply-To:References:From:Subject:To; 
b=hey1BAzoTJ+3lCghFbWDpJiEbqsb0XRlNfneeyVWgcNk2I1G9TlVWZfy+24tF5Bme0EELj/HGdq6rNswCAdRclrBFKVRzEnKld3M2xJObEpQvtvthunaNjxgUvQiNloRESxk44pKdh6Q0aI5QYkKaucQzsWhVdrU96/VQa6dKn0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0veq-007Pc7-0F; Sun, 28 Apr 2024 11:56:57 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sun, 28 Apr 2024 11:57:14 +0800 Date: Sun, 28 Apr 2024 11:57:14 +0800 Message-Id: <6384c8226045aca00ee06249b456ab123a09d0ee.1714276539.git.herbert@gondor.apana.org.au> In-Reply-To: References: From: Herbert Xu Subject: [v2 PATCH 8/8] parser: Add support for multi-byte characters To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Add the requisite markers for multi-byte characters so that the expansion code can recognise them. Also allow wide blank characters to terminate words. 
Signed-off-by: Herbert Xu --- src/expand.c | 19 ++++++++ src/parser.c | 127 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 121 insertions(+), 25 deletions(-) diff --git a/src/expand.c b/src/expand.c index dd2b71e..402289f 100644 --- a/src/expand.c +++ b/src/expand.c @@ -265,6 +265,7 @@ static char *argstr(char *p, int flag) CTLESC, CTLVAR, CTLBACKQ, + CTLMBCHAR, CTLARI, CTLENDARI, 0 @@ -289,6 +290,8 @@ tilde: start: startloc = expdest - (char *)stackblock(); for (;;) { + unsigned ml; + unsigned mb; int end; length += strcspn(p + length, reject); @@ -351,6 +354,22 @@ addquote: startloc++; } break; + case CTLMBCHAR: + c = (signed char)*p--; + mb = mbnext(p); + ml = (mb >> 8) - 2; + if (flag & QUOTES_ESC) { + length = (mb >> 8) + (mb & 0xff); + if (c == (char)CTLESC) + startloc += length; + break; + } + if (c == CTLESC) + startloc += ml; + p += mb & 0xff; + expdest = stnputs(p, ml, expdest); + p += mb >> 8; + break; case CTLESC: startloc++; length++; diff --git a/src/parser.c b/src/parser.c index 27611f0..c23cc9b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -36,7 +36,11 @@ #include #endif +#include +#include #include +#include +#include #include "shell.h" #include "parser.h" @@ -876,7 +880,53 @@ static void synstack_pop(struct synstack **stack) *stack = (*stack)->next; } +static unsigned getmbc(int c, char *out, int mode) +{ + char *const start = out; + mbstate_t mbst = {}; + unsigned ml = 0; + size_t ml2; + wchar_t wc; + char *mbc; + if (likely(c >= 0)) + return 0; + + mbc = (mode & 3) < 2 ? 
out + 2 + (mode == 1) : out; + mbc[ml] = c; + while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) { + if (ml >= MB_LEN_MAX) + break; + c = pgetc(); + if (c == PEOF) + break; + mbc[ml] = c; + } + + if (ml2 == 1 && ml > 1) { + if (mode == 4 && iswblank(wc)) + return 1; + + if ((mode & 3) < 2) { + USTPUTC(CTLMBCHAR, out); + if (mode == 1) + USTPUTC(CTLESC, out); + USTPUTC(ml, out); + } + STADJUST(ml, out); + if ((mode & 3) < 2) { + USTPUTC(ml, out); + USTPUTC(CTLMBCHAR, out); + } + + return out - start; + } + + if (ml > 1) + pungetn(ml - 1); + + return 0; +} /* * If eofmark is NULL, read a word or a redirection symbol. If eofmark @@ -929,12 +979,27 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) } #endif CHECKEND(); /* set c to PEOF if at end of here document */ - for (;;) { /* until end of line or end of word */ - CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */ + /* Until end of line or end of word */ + for (;; c = pgetc_top(synstack)) { + int fieldsplitting; + unsigned ml; + + /* Permit max(MB_LEN_MAX, 23) calls to USTPUTC. */ + CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7, + out); + fieldsplitting = synstack->syntax == BASESYNTAX && + !synstack->varnest ? 
4 : 0; + ml = getmbc(c, out, fieldsplitting); + if (ml == 1) { + c = pgetc(); + break; + } + out += ml; + if (ml) + continue; switch(synstack->syntax[c]) { case CNL: /* '\n' */ - if (synstack->syntax == BASESYNTAX && - !synstack->varnest) + if (fieldsplitting) goto endword; /* exit outer loop */ USTPUTC(c, out); nlprompt(); @@ -956,26 +1021,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) USTPUTC(CTLESC, out); USTPUTC('\\', out); pungetc(); - } else { - if ( - synstack->dblquote && - c != '\\' && c != '`' && - c != '$' && ( - c != '"' || - (eofmark != NULL && - !synstack->varnest) - ) && ( - c != '}' || - !synstack->varnest - ) - ) { - USTPUTC(CTLESC, out); - USTPUTC('\\', out); - } - USTPUTC(CTLESC, out); - USTPUTC(c, out); - quotef++; + break; } + + if ( + synstack->dblquote && + c != '\\' && c != '`' && + c != '$' && ( + c != '"' || + (eofmark != NULL && + !synstack->varnest) + ) && ( + c != '}' || + !synstack->varnest + ) + ) { + USTPUTC(CTLESC, out); + USTPUTC('\\', out); + } + quotef++; + + ml = getmbc(c, out, 1); + out += ml; + if (ml) + break; + + USTPUTC(CTLESC, out); + USTPUTC(c, out); break; case CSQUOTE: synstack->syntax = SQSYNTAX; @@ -1053,11 +1125,10 @@ toggledq: case CEOF: goto endword; /* exit outer loop */ default: - if (synstack->varnest == 0) + if (fieldsplitting) goto endword; /* exit outer loop */ USTPUTC(c, out); } - c = pgetc_top(synstack); } } endword: @@ -1384,6 +1455,7 @@ parsebackq: { size_t psavelen; size_t savelen; union node *n; + unsigned ml; char *pstr; char *str; @@ -1415,6 +1487,11 @@ parsebackq: { if (pc != '\\' && pc != '`' && pc != '$' && (!synstack->dblquote || pc != '"')) STPUTC('\\', pout); + CHECKSTRSPACE(MB_LEN_MAX, pout); + ml = getmbc(pc, pout, 2); + pout += ml; + if (ml) + continue; break; case PEOF: