diff mbox series

[1/5] git-compat-util: add isblank() and isgraph()

Message ID 20230210075939.44949-2-masahiroy@kernel.org (mailing list archive)
State Superseded
Headers show
Series Clean up wildmatch.c | expand

Commit Message

Masahiro Yamada Feb. 10, 2023, 7:59 a.m. UTC
git-compat-util.h implements most of is*() macros.

Add isblank() and isgraph(), which are useful to clean up wildmatch.c
in a consistent way (in this and later commits).

Use them with care because they are not robust against the pointer
increment, like isblank(*s++).

The same issue already exists for isspace().

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---

 git-compat-util.h |  4 ++++
 wildmatch.c       | 14 ++------------
 2 files changed, 6 insertions(+), 12 deletions(-)

Comments

Ævar Arnfjörð Bjarmason Feb. 10, 2023, 1:16 p.m. UTC | #1
On Fri, Feb 10 2023, Masahiro Yamada wrote:

> git-compat-util.h implements most of is*() macros.
>
> Add isblank() and isgraph(), which are useful to clean up wildmatch.c
> in a consistent way (in this and later commits).

You are on a journey to fix wildmatch.c, so...

> The same issue already exists for isspace().
>
> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
> ---
> [...]
> -#ifdef isblank
> -# define ISBLANK(c) (ISASCII(c) && isblank(c))
> -#else
> -# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
> -#endif
> -
> -#ifdef isgraph
> -# define ISGRAPH(c) (ISASCII(c) && isgraph(c))
> -#else
> -# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
> -#endif
> -
> +#define ISBLANK(c) (ISASCII(c) && isblank(c))
> +#define ISGRAPH(c) (ISASCII(c) && isgraph(c))
>  #define ISPRINT(c) (ISASCII(c) && isprint(c))
>  #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
>  #define ISALNUM(c) (ISASCII(c) && isalnum(c))

You make this change, but not others in tree.

Personally I wouldn't mind seeing this expanded to fix the various other
trivially convertable cases in-tree, e.g.:
	
	$ git -P grep -n "(' '|'\\\t').*\|\|.*(' '|'\\\t')"
	builtin/am.c:602:               if (*sb.buf == '\t' || *sb.buf == ' ')
	compat/regex/regex_internal.h:60:# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
	compat/regex/regex_internal.h:73:   return (c == ' ' || c == '\t');
	config.c:893:   while (c == ' ' || c == '\t')
	fsck.c:837:     while (*p == ' ' || *p == '\t')
	mailinfo.c:749:     (line->buf[0] == ' ' || line->buf[0] == '\t')) {
	sequencer.c:2476:                (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || !*p) &&
	sequencer.c:2536:                  (*bol == ' ' || *bol == '\t')) {
	sequencer.c:2540:                                 (*bol == ' ' || *bol == '\t')) {
	sequencer.c:2594:       if (is_command(TODO_PICK, &bol) && (*bol == ' ' || *bol == '\t'))
	sequencer.c:2597:                (*bol == ' ' || *bol == '\t'))
	t/helper/test-json-writer.c:443:                if (c == '\n' || c == '\r' || c == ' ' || c == '\t')
	t/helper/test-json-writer.c:449:        while (*buf == ' ' || *buf == '\t')
	t/t4256/1/mailinfo.c:737:           (line->buf[0] == ' ' || line->buf[0] == '\t')) {
	t/t4256/1/mailinfo.c.orig:726:      (line->buf[0] == ' ' || line->buf[0] == '\t')) {
	trailer.c:630:          if (c != line && (*c == ' ' || *c == '\t')) {
	wildmatch.c:34:# define ISBLANK(c) ((c) == ' ' || (c) == '\t')

Some of those are false positves, but most are not.
Masahiro Yamada Feb. 10, 2023, 4:56 p.m. UTC | #2
On Fri, Feb 10, 2023 at 10:20 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Feb 10 2023, Masahiro Yamada wrote:
>
> > git-compat-util.h implements most of is*() macros.
> >
> > Add isblank() and isgraph(), which are useful to clean up wildmatch.c
> > in a consistent way (in this and later commits).
>
> You are on a journey to fix wildmatch.c, so...
>
> > The same issue already exists for isspace().
> >
> > Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
> > ---
> > [...]
> > -#ifdef isblank
> > -# define ISBLANK(c) (ISASCII(c) && isblank(c))
> > -#else
> > -# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
> > -#endif
> > -
> > -#ifdef isgraph
> > -# define ISGRAPH(c) (ISASCII(c) && isgraph(c))
> > -#else
> > -# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
> > -#endif
> > -
> > +#define ISBLANK(c) (ISASCII(c) && isblank(c))
> > +#define ISGRAPH(c) (ISASCII(c) && isgraph(c))
> >  #define ISPRINT(c) (ISASCII(c) && isprint(c))
> >  #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
> >  #define ISALNUM(c) (ISASCII(c) && isalnum(c))
>
> You make this change, but not others in tree.
>
> Personally I wouldn't mind seeing this expanded to fix the various other


I wouldn't mind seeing somebody use this for tree-wide cleanups.
I do not see any reason to do that in this patch set, either.





> trivially convertable cases in-tree, e.g.:
>
>         $ git -P grep -n "(' '|'\\\t').*\|\|.*(' '|'\\\t')"
>         builtin/am.c:602:               if (*sb.buf == '\t' || *sb.buf == ' ')
>         compat/regex/regex_internal.h:60:# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
>         compat/regex/regex_internal.h:73:   return (c == ' ' || c == '\t');
>         config.c:893:   while (c == ' ' || c == '\t')
>         fsck.c:837:     while (*p == ' ' || *p == '\t')
>         mailinfo.c:749:     (line->buf[0] == ' ' || line->buf[0] == '\t')) {
>         sequencer.c:2476:                (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || !*p) &&
>         sequencer.c:2536:                  (*bol == ' ' || *bol == '\t')) {
>         sequencer.c:2540:                                 (*bol == ' ' || *bol == '\t')) {
>         sequencer.c:2594:       if (is_command(TODO_PICK, &bol) && (*bol == ' ' || *bol == '\t'))
>         sequencer.c:2597:                (*bol == ' ' || *bol == '\t'))
>         t/helper/test-json-writer.c:443:                if (c == '\n' || c == '\r' || c == ' ' || c == '\t')
>         t/helper/test-json-writer.c:449:        while (*buf == ' ' || *buf == '\t')
>         t/t4256/1/mailinfo.c:737:           (line->buf[0] == ' ' || line->buf[0] == '\t')) {
>         t/t4256/1/mailinfo.c.orig:726:      (line->buf[0] == ' ' || line->buf[0] == '\t')) {
>         trailer.c:630:          if (c != line && (*c == ' ' || *c == '\t')) {
>         wildmatch.c:34:# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
>
> Some of those are false positves, but most are not.



--
Best Regards
Masahiro Yamada
Junio C Hamano Feb. 10, 2023, 7:10 p.m. UTC | #3
Masahiro Yamada <masahiroy@kernel.org> writes:

> Use them with care because they are not robust against the pointer
> increment, like isblank(*s++).
>
> The same issue already exists for isspace().

Does it?

>  #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
>  #define isascii(x) (((x) & ~0x7f) == 0)
>  #define isspace(x) sane_istest(x,GIT_SPACE)

sane_istest() evaluates x and mask only once, and isspace evaluates
x only once, no?

	isspace(*x++)
	->
	sane_istest(*x++,GIT_SPACE)
	->
	((sane_ctype[(unsigned char)(*x++)] & GIT_SPACE) != 0)

> +#define isblank(x) (isspace(x) || (x) == '\t')
> +#define isgraph(x) (isprint(x) && !isspace(x))

Given that all the other tests are implemented with just picking an
integer from the table and checking bit(s), the above two look
somewhat out of place.  The fact, as you noted, that they cannot be
used safely does not help, either.
Masahiro Yamada Feb. 10, 2023, 7:25 p.m. UTC | #4
On Sat, Feb 11, 2023 at 4:11 AM Junio C Hamano <gitster@pobox.com> wrote:
>
> Masahiro Yamada <masahiroy@kernel.org> writes:
>
> > Use them with care because they are not robust against the pointer
> > increment, like isblank(*s++).
> >
> > The same issue already exists for isspace().
>
> Does it?


Sorry, I meant:

 "The same issue already exists for isprint()"




   #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)


is not robust against pointer increment.




>
> >  #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
> >  #define isascii(x) (((x) & ~0x7f) == 0)
> >  #define isspace(x) sane_istest(x,GIT_SPACE)
>
> sane_istest() evaluates x and mask only once, and isspace evaluates
> x only once, no?
>
>         isspace(*x++)
>         ->
>         sane_istest(*x++,GIT_SPACE)
>         ->
>         ((sane_ctype[(unsigned char)(*x++)] & GIT_SPACE) != 0)
>
> > +#define isblank(x) (isspace(x) || (x) == '\t')
> > +#define isgraph(x) (isprint(x) && !isspace(x))
>
> Given that all the other tests are implemented with just picking an
> integer from the table and checking bit(s), the above two look
> somewhat out of place.  The fact, as you noted, that they cannot be
> used safely does not help, either.
>
>


At first, I hesitated to move these to git-compat-util.h
for this reason, but I noticed isprint() was in the same situation.

So, I decided to go.
René Scharfe Feb. 10, 2023, 10:03 p.m. UTC | #5
Am 10.02.23 um 08:59 schrieb Masahiro Yamada:
> git-compat-util.h implements most of is*() macros.
>
> Add isblank() and isgraph(), which are useful to clean up wildmatch.c
> in a consistent way (in this and later commits).
>
> Use them with care because they are not robust against the pointer
> increment, like isblank(*s++).
>
> The same issue already exists for isspace().
>
> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
> ---
>
>  git-compat-util.h |  4 ++++
>  wildmatch.c       | 14 ++------------
>  2 files changed, 6 insertions(+), 12 deletions(-)
>
> diff --git a/git-compat-util.h b/git-compat-util.h
> index 4f0028ce60..90b43b2bc9 100644
> --- a/git-compat-util.h
> +++ b/git-compat-util.h
> @@ -1212,10 +1212,12 @@ extern const unsigned char tolower_trans_tbl[256];
>  /* Sane ctype - no locale, and works with signed chars */
>  #undef isascii
>  #undef isspace
> +#undef isblank
>  #undef isdigit
>  #undef isalpha
>  #undef isalnum
>  #undef isprint
> +#undef isgraph
>  #undef islower
>  #undef isupper
>  #undef tolower
> @@ -1236,10 +1238,12 @@ extern const unsigned char sane_ctype[256];
>  #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
>  #define isascii(x) (((x) & ~0x7f) == 0)
>  #define isspace(x) sane_istest(x,GIT_SPACE)
> +#define isblank(x) (isspace(x) || (x) == '\t')

Our isspace() matches \t, \n, \r and space.  A standard implementation
would also match \v (vertical tab) and \f (form feed).  Anyway, your
isblank() here is the same as isspace() because the check for \t is
redundant...

>  #define isdigit(x) sane_istest(x,GIT_DIGIT)
>  #define isalpha(x) sane_istest(x,GIT_ALPHA)
>  #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
>  #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)
> +#define isgraph(x) (isprint(x) && !isspace(x))
>  #define islower(x) sane_iscase(x, 1)
>  #define isupper(x) sane_iscase(x, 0)
>  #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
> diff --git a/wildmatch.c b/wildmatch.c
> index 7e5a7ea1ea..85c4c7f8a7 100644
> --- a/wildmatch.c
> +++ b/wildmatch.c
> @@ -28,18 +28,8 @@ typedef unsigned char uchar;
>  # define ISASCII(c) isascii(c)
>  #endif
>
> -#ifdef isblank
> -# define ISBLANK(c) (ISASCII(c) && isblank(c))
> -#else
> -# define ISBLANK(c) ((c) == ' ' || (c) == '\t')

... and that's not the case for the original code, which only
matches \t and space.

We already use eight bits for the lookup table values in sane_ctype.
Perhaps it's time to move GIT_GLOB_SPECIAL, GIT_REGEX_SPECIAL,
GIT_PATHSPEC_MAGIC and GIT_PUNCT to their own table to make room for
flags for isprint(), isgraph() and isblank().

> -#endif
> -
> -#ifdef isgraph
> -# define ISGRAPH(c) (ISASCII(c) && isgraph(c))
> -#else
> -# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
> -#endif
> -
> +#define ISBLANK(c) (ISASCII(c) && isblank(c))
> +#define ISGRAPH(c) (ISASCII(c) && isgraph(c))
>  #define ISPRINT(c) (ISASCII(c) && isprint(c))
>  #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
>  #define ISALNUM(c) (ISASCII(c) && isalnum(c))
Masahiro Yamada Feb. 11, 2023, 7:01 a.m. UTC | #6
On Sat, Feb 11, 2023 at 7:03 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 10.02.23 um 08:59 schrieb Masahiro Yamada:
> > git-compat-util.h implements most of is*() macros.
> >
> > Add isblank() and isgraph(), which are useful to clean up wildmatch.c
> > in a consistent way (in this and later commits).
> >
> > Use them with care because they are not robust against the pointer
> > increment, like isblank(*s++).
> >
> > The same issue already exists for isspace().
> >
> > Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
> > ---
> >
> >  git-compat-util.h |  4 ++++
> >  wildmatch.c       | 14 ++------------
> >  2 files changed, 6 insertions(+), 12 deletions(-)
> >
> > diff --git a/git-compat-util.h b/git-compat-util.h
> > index 4f0028ce60..90b43b2bc9 100644
> > --- a/git-compat-util.h
> > +++ b/git-compat-util.h
> > @@ -1212,10 +1212,12 @@ extern const unsigned char tolower_trans_tbl[256];
> >  /* Sane ctype - no locale, and works with signed chars */
> >  #undef isascii
> >  #undef isspace
> > +#undef isblank
> >  #undef isdigit
> >  #undef isalpha
> >  #undef isalnum
> >  #undef isprint
> > +#undef isgraph
> >  #undef islower
> >  #undef isupper
> >  #undef tolower
> > @@ -1236,10 +1238,12 @@ extern const unsigned char sane_ctype[256];
> >  #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
> >  #define isascii(x) (((x) & ~0x7f) == 0)
> >  #define isspace(x) sane_istest(x,GIT_SPACE)
> > +#define isblank(x) (isspace(x) || (x) == '\t')
>
> Our isspace() matches \t, \n, \r and space.  A standard implementation
> would also match \v (vertical tab) and \f (form feed).  Anyway, your
> isblank() here is the same as isspace() because the check for \t is
> redundant...


My bad - I missed 'Z' in cane_ctype[].


>
> >  #define isdigit(x) sane_istest(x,GIT_DIGIT)
> >  #define isalpha(x) sane_istest(x,GIT_ALPHA)
> >  #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
> >  #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)
> > +#define isgraph(x) (isprint(x) && !isspace(x))
> >  #define islower(x) sane_iscase(x, 1)
> >  #define isupper(x) sane_iscase(x, 0)
> >  #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
> > diff --git a/wildmatch.c b/wildmatch.c
> > index 7e5a7ea1ea..85c4c7f8a7 100644
> > --- a/wildmatch.c
> > +++ b/wildmatch.c
> > @@ -28,18 +28,8 @@ typedef unsigned char uchar;
> >  # define ISASCII(c) isascii(c)
> >  #endif
> >
> > -#ifdef isblank
> > -# define ISBLANK(c) (ISASCII(c) && isblank(c))
> > -#else
> > -# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
>
> ... and that's not the case for the original code, which only
> matches \t and space.
>
> We already use eight bits for the lookup table values in sane_ctype.
> Perhaps it's time to move GIT_GLOB_SPECIAL, GIT_REGEX_SPECIAL,
> GIT_PATHSPEC_MAGIC and GIT_PUNCT to their own table to make room for
> flags for isprint(), isgraph() and isblank().



I think that is a good idea.

If we can have more room for isupper() and islower(),
we will be able to delete sane_iscase().


(I was thinking 'unsigned short', but having two tables is OK.)



So, how can I proceed with this patchset?



[A] After somebody refactors ctype.c table,
    I will rebase this series on top of that.

[B] keep isblank() and isgraph() local to wildmatch.c
    until that happens, so I can proceed without
    depending on the ctype.c refactoring.

    Apparently, wildmatch.c is not using a pointer
    increment with isblank() or isgraph().

[C] If 'somebody' in [A] is supposed to me,
    my v2 will include ctype refactoring.



Thought?





> > -#endif
> > -
> > -#ifdef isgraph
> > -# define ISGRAPH(c) (ISASCII(c) && isgraph(c))
> > -#else
> > -# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
> > -#endif
> > -
> > +#define ISBLANK(c) (ISASCII(c) && isblank(c))
> > +#define ISGRAPH(c) (ISASCII(c) && isgraph(c))
> >  #define ISPRINT(c) (ISASCII(c) && isprint(c))
> >  #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
> >  #define ISALNUM(c) (ISASCII(c) && isalnum(c))
>
--
Best Regards
Masahiro Yamada
René Scharfe Feb. 11, 2023, 1:48 p.m. UTC | #7
Am 11.02.23 um 08:01 schrieb Masahiro Yamada:
> On Sat, Feb 11, 2023 at 7:03 AM René Scharfe <l.s.r@web.de> wrote:
>>
>> We already use eight bits for the lookup table values in sane_ctype.
>> Perhaps it's time to move GIT_GLOB_SPECIAL, GIT_REGEX_SPECIAL,
>> GIT_PATHSPEC_MAGIC and GIT_PUNCT to their own table to make room for
>> flags for isprint(), isgraph() and isblank().
>
>
>
> I think that is a good idea.
>
> If we can have more room for isupper() and islower(),
> we will be able to delete sane_iscase().
>
>
> (I was thinking 'unsigned short', but having two tables is OK.)
>
>
>
> So, how can I proceed with this patchset?
>
>
>
> [A] After somebody refactors ctype.c table,
>     I will rebase this series on top of that.
>
> [B] keep isblank() and isgraph() local to wildmatch.c
>     until that happens, so I can proceed without
>     depending on the ctype.c refactoring.
>
>     Apparently, wildmatch.c is not using a pointer
>     increment with isblank() or isgraph().
>
> [C] If 'somebody' in [A] is supposed to me,
>     my v2 will include ctype refactoring.

[D] We need more tests first. :)  I sent patches to test the current
classifiers separately.  A similar test for isblank would have helped
you detect the mismatch quickly.  Full test coverage also gives
confidence when tinkering with the existing classifiers.

1c149ab2dd (ctype: support iscntrl, ispunct, isxdigit and isprint,
2012-10-15) added an implementation of isprint that evaluated its
argument only once, by the way, but the one from 0fcec2ce54
(format-patch: make rfc2047 encoding more strict, 2012-10-18)
replaced it.

Widening the element type of the lookup table would work, but might
impact performance.  I guess it would be slower, but perhaps we'd
have to measure it to be sure.  Splitting the table into unrelated
subsets would avoid that.

Deciding which flags to move requires knowing the full target set,
I think.  The punctuation-related flags looked to me like good
candidates until I saw the original isprint implementation which
uses them and several of the others.  So I'm not so sure anymore,
but here's a patch that moves them out:

---
 ctype.c           | 51 +++++++++++++++++++++++++++++++++++++----------
 git-compat-util.h | 21 ++++++++++---------
 2 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/ctype.c b/ctype.c
index fc0225cebd..c7a0dcba13 100644
--- a/ctype.c
+++ b/ctype.c
@@ -9,26 +9,57 @@ enum {
 	S = GIT_SPACE,
 	A = GIT_ALPHA,
 	D = GIT_DIGIT,
-	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
-	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | */
-	P = GIT_PATHSPEC_MAGIC, /* other non-alnum, except for ] and } */
 	X = GIT_CNTRL,
-	U = GIT_PUNCT,
 	Z = GIT_CNTRL | GIT_SPACE
 };

 const unsigned char sane_ctype[256] = {
 	X, X, X, X, X, X, X, X, X, Z, Z, X, X, Z, X, X,		/*   0.. 15 */
 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,		/*  16.. 31 */
-	S, P, P, P, R, P, P, P, R, R, G, R, P, P, R, P,		/*  32.. 47 */
-	D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, G,		/*  48.. 63 */
-	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
-	A, A, A, A, A, A, A, A, A, A, A, G, G, U, R, P,		/*  80.. 95 */
-	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
-	A, A, A, A, A, A, A, A, A, A, A, R, R, U, P, X,		/* 112..127 */
+	S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, 0,		/*  48.. 63 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, 0, 0, 0, 0, 0,		/*  80.. 95 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, 0, 0, 0, 0, X,		/* 112..127 */
 	/* Nothing in the 128.. range */
 };

+const unsigned char sane_ctype2[256] = {
+	['*'] = GIT_GLOB_SPECIAL,
+	['?'] = GIT_GLOB_SPECIAL,
+	['['] = GIT_GLOB_SPECIAL,
+	['\\'] = GIT_GLOB_SPECIAL,
+	['$'] = GIT_REGEX_SPECIAL,
+	['('] = GIT_REGEX_SPECIAL,
+	[')'] = GIT_REGEX_SPECIAL,
+	['+'] = GIT_REGEX_SPECIAL,
+	['.'] = GIT_REGEX_SPECIAL,
+	['^'] = GIT_REGEX_SPECIAL,
+	['{'] = GIT_REGEX_SPECIAL,
+	['|'] = GIT_REGEX_SPECIAL,
+	['!'] = GIT_PATHSPEC_MAGIC,
+	['"'] = GIT_PATHSPEC_MAGIC,
+	['#'] = GIT_PATHSPEC_MAGIC,
+	['%'] = GIT_PATHSPEC_MAGIC,
+	['&'] = GIT_PATHSPEC_MAGIC,
+	['\''] = GIT_PATHSPEC_MAGIC,
+	[','] = GIT_PATHSPEC_MAGIC,
+	['-'] = GIT_PATHSPEC_MAGIC,
+	['/'] = GIT_PATHSPEC_MAGIC,
+	[':'] = GIT_PATHSPEC_MAGIC,
+	[';'] = GIT_PATHSPEC_MAGIC,
+	['<'] = GIT_PATHSPEC_MAGIC,
+	['='] = GIT_PATHSPEC_MAGIC,
+	['>'] = GIT_PATHSPEC_MAGIC,
+	['@'] = GIT_PATHSPEC_MAGIC,
+	['_'] = GIT_PATHSPEC_MAGIC,
+	['`'] = GIT_PATHSPEC_MAGIC,
+	['~'] = GIT_PATHSPEC_MAGIC,
+	[']'] = GIT_PUNCT,
+	['}'] = GIT_PUNCT,
+};
+
 /* For case-insensitive kwset */
 const unsigned char tolower_trans_tbl[256] = {
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
diff --git a/git-compat-util.h b/git-compat-util.h
index 4f0028ce60..61e71fe702 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -1228,11 +1228,7 @@ extern const unsigned char sane_ctype[256];
 #define GIT_SPACE 0x01
 #define GIT_DIGIT 0x02
 #define GIT_ALPHA 0x04
-#define GIT_GLOB_SPECIAL 0x08
-#define GIT_REGEX_SPECIAL 0x10
-#define GIT_PATHSPEC_MAGIC 0x20
 #define GIT_CNTRL 0x40
-#define GIT_PUNCT 0x80
 #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
 #define isascii(x) (((x) & ~0x7f) == 0)
 #define isspace(x) sane_istest(x,GIT_SPACE)
@@ -1242,15 +1238,22 @@ extern const unsigned char sane_ctype[256];
 #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)
 #define islower(x) sane_iscase(x, 1)
 #define isupper(x) sane_iscase(x, 0)
-#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
-#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
 #define iscntrl(x) (sane_istest(x,GIT_CNTRL))
-#define ispunct(x) sane_istest(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \
-		GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC)
 #define isxdigit(x) (hexval_table[(unsigned char)(x)] != -1)
 #define tolower(x) sane_case((unsigned char)(x), 0x20)
 #define toupper(x) sane_case((unsigned char)(x), 0)
-#define is_pathspec_magic(x) sane_istest(x,GIT_PATHSPEC_MAGIC)
+
+extern const unsigned char sane_ctype2[256];
+#define GIT_GLOB_SPECIAL 0x01
+#define GIT_REGEX_SPECIAL 0x02
+#define GIT_PATHSPEC_MAGIC 0x04
+#define GIT_PUNCT 0x08
+#define sane_istest2(x,mask) ((sane_ctype2[(unsigned char)(x)] & (mask)) != 0)
+#define is_glob_special(x) sane_istest2(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest2(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define ispunct(x) sane_istest2(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \
+		GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC)
+#define is_pathspec_magic(x) sane_istest2(x,GIT_PATHSPEC_MAGIC)

 static inline int sane_case(int x, int high)
 {
--
2.39.1
René Scharfe Feb. 11, 2023, 2:11 p.m. UTC | #8
Am 11.02.23 um 14:48 schrieb René Scharfe:
> Am 11.02.23 um 08:01 schrieb Masahiro Yamada:
>> On Sat, Feb 11, 2023 at 7:03 AM René Scharfe <l.s.r@web.de> wrote:
>>>
>>> We already use eight bits for the lookup table values in sane_ctype.
>>> Perhaps it's time to move GIT_GLOB_SPECIAL, GIT_REGEX_SPECIAL,
>>> GIT_PATHSPEC_MAGIC and GIT_PUNCT to their own table to make room for
>>> flags for isprint(), isgraph() and isblank().
>>
>>
>>
>> I think that is a good idea.
>>
>> If we can have more room for isupper() and islower(),
>> we will be able to delete sane_iscase().
>>
>>
>> (I was thinking 'unsigned short', but having two tables is OK.)
>>
>>
>>
>> So, how can I proceed with this patchset?
>>
>>
>>
>> [A] After somebody refactors ctype.c table,
>>     I will rebase this series on top of that.
>>
>> [B] keep isblank() and isgraph() local to wildmatch.c
>>     until that happens, so I can proceed without
>>     depending on the ctype.c refactoring.
>>
>>     Apparently, wildmatch.c is not using a pointer
>>     increment with isblank() or isgraph().
>>
>> [C] If 'somebody' in [A] is supposed to me,
>>     my v2 will include ctype refactoring.
>
> [D] We need more tests first. :)  I sent patches to test the current
> classifiers separately.  A similar test for isblank would have helped
> you detect the mismatch quickly.  Full test coverage also gives
> confidence when tinkering with the existing classifiers.
>
> 1c149ab2dd (ctype: support iscntrl, ispunct, isxdigit and isprint,
> 2012-10-15) added an implementation of isprint that evaluated its
> argument only once, by the way, but the one from 0fcec2ce54
> (format-patch: make rfc2047 encoding more strict, 2012-10-18)
> replaced it.
>
> Widening the element type of the lookup table would work, but might
> impact performance.  I guess it would be slower, but perhaps we'd
> have to measure it to be sure.  Splitting the table into unrelated
> subsets would avoid that.
>
> Deciding which flags to move requires knowing the full target set,
> I think.  The punctuation-related flags looked to me like good
> candidates until I saw the original isprint implementation which
> uses them and several of the others.  So I'm not so sure anymore,
> but here's a patch that moves them out:

Perhaps:

[E] Implement isblank and isgraph as inline functions and leave the
lookup table integration for later:

   #undef isblank
   #define isblank(x) sane_isblank(x)
   static inline int sane_isblank(int c) {return c == ' ' || c == '\t';}

   #undef isgraph
   #define isgraph(x) sane_isgraph(x)
   static inline int sane_isgraph(int c) {return isprint(c) && c != ' ';}

The Lazy Way™ ;)

René
diff mbox series

Patch

diff --git a/git-compat-util.h b/git-compat-util.h
index 4f0028ce60..90b43b2bc9 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -1212,10 +1212,12 @@  extern const unsigned char tolower_trans_tbl[256];
 /* Sane ctype - no locale, and works with signed chars */
 #undef isascii
 #undef isspace
+#undef isblank
 #undef isdigit
 #undef isalpha
 #undef isalnum
 #undef isprint
+#undef isgraph
 #undef islower
 #undef isupper
 #undef tolower
@@ -1236,10 +1238,12 @@  extern const unsigned char sane_ctype[256];
 #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
 #define isascii(x) (((x) & ~0x7f) == 0)
 #define isspace(x) sane_istest(x,GIT_SPACE)
+#define isblank(x) (isspace(x) || (x) == '\t')
 #define isdigit(x) sane_istest(x,GIT_DIGIT)
 #define isalpha(x) sane_istest(x,GIT_ALPHA)
 #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
 #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)
+#define isgraph(x) (isprint(x) && !isspace(x))
 #define islower(x) sane_iscase(x, 1)
 #define isupper(x) sane_iscase(x, 0)
 #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
diff --git a/wildmatch.c b/wildmatch.c
index 7e5a7ea1ea..85c4c7f8a7 100644
--- a/wildmatch.c
+++ b/wildmatch.c
@@ -28,18 +28,8 @@  typedef unsigned char uchar;
 # define ISASCII(c) isascii(c)
 #endif
 
-#ifdef isblank
-# define ISBLANK(c) (ISASCII(c) && isblank(c))
-#else
-# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
-#endif
-
-#ifdef isgraph
-# define ISGRAPH(c) (ISASCII(c) && isgraph(c))
-#else
-# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
-#endif
-
+#define ISBLANK(c) (ISASCII(c) && isblank(c))
+#define ISGRAPH(c) (ISASCII(c) && isgraph(c))
 #define ISPRINT(c) (ISASCII(c) && isprint(c))
 #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
 #define ISALNUM(c) (ISASCII(c) && isalnum(c))