From 8cd26dbde1bf7845438a45e8cbbe4907abbff7f0 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 7 Apr 2014 23:06:45 -0700 Subject: [PATCH] grep: prefer bool in DFA internals * src/dfa.c (bool_bf): New type. (dfa_state): Use it, as this seems to generate slightly better code with GCC. (struct mb_char_classes, struct dfa, equal, case_fold, dfasyntax) (laststart, parse_bracket_exp, lex, dfaparse, dfaanalyze, dfastate) (match_mb_charset, dfamust): Use bool for boolean. (using_utf8) [!HAVE_LANGINFO_CODESET]: Tune. (dfaanalyze): Prefer & to && and | to || on booleans; it's simpler here. (dfastate): Simplify charclass nonzero testing. Redo has_mbcset test so that the compiler's more likely to optimize it. --- src/dfa.c | 140 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 73 insertions(+), 67 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 7bab84d..76f7e79 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -34,6 +34,13 @@ #include #include +/* The pre-C99 emulation doesn't work for bool bitfields. */ +#if __STDC_VERSION__ < 199901 +typedef unsigned int bool_bf; +#else +typedef bool bool_bf; +#endif + #define STREQ(a, b) (strcmp (a, b) == 0) /* ISASCIIDIGIT differs from isdigit, as follows: @@ -288,8 +295,8 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - bool has_backref; /* True if this state matches a \. */ - bool has_mbcset; /* True if this state matches a MBCSET. */ + bool_bf has_backref : 1; /* True if this state matches a \. */ + bool_bf has_mbcset : 1; /* True if this state matches a MBCSET. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -306,7 +313,7 @@ typedef ptrdiff_t state_num; struct mb_char_classes { ptrdiff_t cset; - int invert; + bool invert; wchar_t *chars; /* Normal characters. */ size_t nchars; wctype_t *ch_classes; /* Character classes. */ @@ -390,7 +397,7 @@ struct dfa matching the given position in a string matching the regexp. Allocated to the maximum possible position index. */ - int searchflag; /* True if we are supposed to build a searching + bool searchflag; /* True if we are supposed to build a searching as opposed to an exact matcher. A searching matcher finds the first and shortest string matching a regexp anywhere in the buffer, @@ -669,7 +676,7 @@ notset (charclass s) s[i] = ~s[i]; } -static int +static bool equal (charclass const s1, charclass const s2) { return memcmp (s1, s2, sizeof (charclass)) == 0; @@ -704,7 +711,7 @@ charclass_index (charclass const s) static reg_syntax_t syntax_bits, syntax_bits_set; /* Flag for case-folding letters into sets. */ -static int case_fold; +static bool case_fold; /* End-of-line byte in data. */ static unsigned char eolbyte; @@ -759,7 +766,7 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) syntax_bits_set = 1; syntax_bits = bits; - case_fold = fold; + case_fold = fold != 0; eolbyte = eol; for (i = 0; i < NOTCHAR; ++i) @@ -812,17 +819,14 @@ setbit_case_fold_c (int b, charclass c) int using_utf8 (void) { +#ifdef HAVE_LANGINFO_CODESET static int utf8 = -1; - if (utf8 == -1) - { -#if defined HAVE_LANGINFO_CODESET - utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8")); + if (utf8 < 0) + utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); + return utf8; #else - utf8 = 0; + return 0; #endif - } - - return utf8; } /* Return true if the current locale is known to be a unibyte locale @@ -873,7 +877,7 @@ using_simple_locale (void) static char const *lexptr; /* Pointer to next input character. */ static size_t lexleft; /* Number of characters remaining. */ static token lasttok; /* Previous token returned; initially END. */ -static int laststart; /* True if we're separated from beginning or (, +static bool laststart; /* True if we're separated from beginning or (, | only by zero-width characters. */ static size_t parens; /* Count of outstanding left parens. */ static int minrep, maxrep; /* Repeat counts for {m,n}. */ @@ -1009,7 +1013,7 @@ find_pred (const char *str) static token parse_bracket_exp (void) { - int invert; + bool invert; int c, c1, c2; charclass ccl; @@ -1057,11 +1061,11 @@ parse_bracket_exp (void) if (c == '^') { FETCH_WC (c, wc, _("unbalanced [")); - invert = 1; + invert = true; known_bracket_exp = using_simple_locale (); } else - invert = 0; + invert = false; colon_warning_state = (c == ':'); do @@ -1279,7 +1283,7 @@ static token lex (void) { unsigned int c, c2; - int backslash = 0; + bool backslash = false; charclass ccl; int i; @@ -1302,7 +1306,7 @@ lex (void) goto normal_char; if (lexleft == 0) dfaerror (_("unfinished \\ escape")); - backslash = 1; + backslash = true; break; case '^': @@ -1340,7 +1344,7 @@ lex (void) case '9': if (backslash && !(syntax_bits & RE_NO_BK_REFS)) { - laststart = 0; + laststart = false; return lasttok = BACKREF; } goto normal_char; @@ -1455,7 +1459,7 @@ lex (void) lexptr = p; lexleft = lim - p; } - laststart = 0; + laststart = false; return lasttok = REPMN; case '|': @@ -1463,21 +1467,21 @@ lex (void) goto normal_char; if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; - laststart = 1; + laststart = true; return lasttok = OR; case '\n': if (syntax_bits & RE_LIMITED_OPS || backslash || !(syntax_bits & RE_NEWLINE_ALT)) goto normal_char; - laststart = 1; + laststart = true; return lasttok = OR; case '(': if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; ++parens; - laststart = 1; + laststart = true; return lasttok = LPAREN; case ')': @@ -1486,7 +1490,7 @@ lex (void) if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; --parens; - laststart = 0; + laststart = false; return lasttok = RPAREN; case '.': @@ -1496,7 +1500,7 @@ lex (void) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ - laststart = 0; + laststart = false; return lasttok = ANYCHAR; } zeroset (ccl); @@ -1505,7 +1509,7 @@ lex (void) clrbit (eolbyte, ccl); if (syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); - laststart = 0; + laststart = false; return lasttok = CSET + charclass_index (ccl); case 's': @@ -1520,7 +1524,7 @@ lex (void) setbit (c2, ccl); if (c == 'S') notset (ccl); - laststart = 0; + laststart = false; return lasttok = CSET + charclass_index (ccl); } @@ -1550,7 +1554,7 @@ lex (void) POP_LEX_STATE (); - laststart = 0; + laststart = false; return lasttok; case 'w': @@ -1563,18 +1567,18 @@ lex (void) setbit (c2, ccl); if (c == 'W') notset (ccl); - laststart = 0; + laststart = false; return lasttok = CSET + charclass_index (ccl); case '[': if (backslash) goto normal_char; - laststart = 0; + laststart = false; return lasttok = parse_bracket_exp (); default: normal_char: - laststart = 0; + laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ if (MB_CUR_MAX > 1) @@ -1977,7 +1981,7 @@ dfaparse (char const *s, size_t len, struct dfa *d) lexptr = s; lexleft = len; lasttok = END; - laststart = 1; + laststart = true; parens = 0; if (MB_CUR_MAX > 1) { @@ -2323,7 +2327,7 @@ state_separate_contexts (position_set const *s) void dfaanalyze (struct dfa *d, int searchflag) { - int *nullable; /* Nullable stack. */ + bool *nullable; /* Nullable stack. */ size_t *nfirstpos; /* Element count stack for firstpos sets. */ position *firstpos; /* Array where firstpos elements are stored. */ size_t *nlastpos; /* Element count stack for lastpos sets. */ @@ -2331,7 +2335,7 @@ dfaanalyze (struct dfa *d, int searchflag) position_set tmp; /* Temporary set for merging sets. */ position_set merged; /* Result of merging sets. */ int separate_contexts; /* Context wanted by some position. */ - int *o_nullable; + bool *o_nullable; size_t *o_nfirst, *o_nlast; position *o_firstpos, *o_lastpos; size_t i, j; @@ -2347,7 +2351,7 @@ dfaanalyze (struct dfa *d, int searchflag) putc ('\n', stderr); #endif - d->searchflag = searchflag; + d->searchflag = searchflag != 0; MALLOC (nullable, d->depth); o_nullable = nullable; @@ -2369,7 +2373,7 @@ dfaanalyze (struct dfa *d, int searchflag) { case EMPTY: /* The empty set is nullable. */ - *nullable++ = 1; + *nullable++ = true; /* The firstpos and lastpos of the empty leaf are both empty. */ *nfirstpos++ = *nlastpos++ = 0; @@ -2391,7 +2395,7 @@ dfaanalyze (struct dfa *d, int searchflag) case QMARK: /* A QMARK or STAR node is automatically nullable. */ if (d->tokens[i] != PLUS) - nullable[-1] = 1; + nullable[-1] = true; break; case CAT: @@ -2429,7 +2433,7 @@ dfaanalyze (struct dfa *d, int searchflag) --nlastpos; /* A CAT node is nullable if both arguments are nullable. */ - nullable[-2] = nullable[-1] && nullable[-2]; + nullable[-2] &= nullable[-1]; --nullable; break; @@ -2443,7 +2447,7 @@ dfaanalyze (struct dfa *d, int searchflag) --nlastpos; /* An OR node is nullable if either argument is nullable. */ - nullable[-2] = nullable[-1] || nullable[-2]; + nullable[-2] |= nullable[-1]; --nullable; break; @@ -2574,11 +2578,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) size_t ngrps = 0; /* Number of groups actually used. */ position pos; /* Current position being considered. */ charclass matches; /* Set of matching characters. */ - int matchesf; /* True if matches is nonempty. */ + unsigned int matchesf; /* Nonzero if matches is nonempty. */ charclass intersect; /* Intersection with some label set. */ - int intersectf; /* True if intersect is nonempty. */ + unsigned int intersectf; /* Nonzero if intersect is nonempty. */ charclass leftovers; /* Stuff in the label that didn't match. */ - int leftoversf; /* True if leftovers is nonempty. */ + unsigned int leftoversf; /* Nonzero if leftovers is nonempty. */ position_set follows; /* Union of the follows of some group. */ position_set tmp; /* Temporary space for merging sets. */ int possible_contexts; /* Contexts that this group can match. */ @@ -2586,7 +2590,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_num state; /* New state. */ state_num state_newline; /* New state on a newline transition. */ state_num state_letter; /* New state on a letter transition. */ - int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */ + bool next_isnt_1st_byte = false; /* Flag if we can't add state0. */ size_t i, j, k; MALLOC (grps, NOTCHAR); @@ -2601,21 +2605,23 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->tokens[pos.index] == ANYCHAR - || d->tokens[pos.index] == MBCSET) - /* MB_CUR_MAX > 1 */ + else { - /* ANYCHAR and MBCSET must match with a single character, so we - must put it to d->states[s].mbps, which contains the positions - which can match with a single character not a byte. */ - if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); - insert (pos, &(d->states[s].mbps)); - d->states[s].has_mbcset |= (d->tokens[pos.index] == MBCSET); + if (d->tokens[pos.index] == MBCSET + || d->tokens[pos.index] == ANYCHAR) + { + /* MB_CUR_MAX > 1 */ + if (d->tokens[pos.index] == MBCSET) + d->states[s].has_mbcset = true; + /* ANYCHAR and MBCSET must match with a single character, so we + must put it to d->states[s].mbps, which contains the positions + which can match with a single character not a byte. */ + if (d->states[s].mbps.nelem == 0) + alloc_position_set (&d->states[s].mbps, 1); + insert (pos, &(d->states[s].mbps)); + } continue; } - else - continue; /* Some characters may need to be eliminated from matches because they fail in the current context. */ @@ -2654,7 +2660,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) matches. */ intersectf = 0; for (k = 0; k < CHARCLASS_INTS; ++k) - (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; + intersectf |= intersect[k] = matches[k] & labels[j][k]; if (!intersectf) continue; @@ -2665,8 +2671,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* Even an optimizing compiler can't know this for sure. */ int match = matches[k], label = labels[j][k]; - (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; - (matches[k] = match & ~label) ? (matchesf = 1) : 0; + leftoversf |= leftovers[k] = ~match & label; + matchesf |= matches[k] = match & ~label; } /* If there were leftovers, create a new group labeled with them. */ @@ -2763,12 +2769,12 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) codepoint of , it must not be but 2nd byte of , so we cannot add state[0]. */ - next_isnt_1st_byte = 0; + next_isnt_1st_byte = false; for (j = 0; j < follows.nelem; ++j) { if (!(d->multibyte_prop[follows.elems[j].index] & 1)) { - next_isnt_1st_byte = 1; + next_isnt_1st_byte = true; break; } } @@ -3052,7 +3058,7 @@ static int match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) { size_t i; - int match; /* Matching succeeded. */ + bool match; /* Matching succeeded. */ int match_len; /* Length of the character (or collating element) with which this operator matches. */ int op_len; /* Length of the operator. */ @@ -4040,14 +4046,14 @@ dfamust (struct dfa *d) char *result; size_t ri; size_t i; - int exact; + bool exact; token t; static must must0; struct dfamust *dm; static char empty_string[] = ""; result = empty_string; - exact = 0; + exact = false; MALLOC (musts, d->tindex + 1); mp = musts; for (i = 0; i <= d->tindex; ++i) @@ -4142,7 +4148,7 @@ dfamust (struct dfa *d) if (strlen (musts[0].in[i]) > strlen (result)) result = musts[0].in[i]; if (STREQ (result, musts[0].is)) - exact = 1; + exact = true; goto done; case CAT: assert (&musts[2] <= mp); -- 1.9.0