From 8cd26dbde1bf7845438a45e8cbbe4907abbff7f0 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Mon, 7 Apr 2014 23:06:45 -0700
Subject: [PATCH] grep: prefer bool in DFA internals
* src/dfa.c (bool_bf): New type.
(dfa_state): Use it, as this seems to generate slightly better
code with GCC.
(struct mb_char_classes, struct dfa, equal, case_fold, dfasyntax)
(laststart, parse_bracket_exp, lex, dfaparse, dfaanalyze, dfastate)
(match_mb_charset, dfamust):
Use bool for boolean.
(using_utf8) [!HAVE_LANGINFO_CODESET]: Tune.
(dfaanalyze): Prefer & to && and | to || on booleans; it's simpler here.
(dfastate): Simplify charclass nonzero testing. Redo has_mbcset
test so that the compiler's more likely to optimize it.
---
src/dfa.c | 140 ++++++++++++++++++++++++++++++++------------------------------
1 file changed, 73 insertions(+), 67 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 7bab84d..76f7e79 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -34,6 +34,13 @@
#include
#include
+/* The pre-C99 emulation doesn't work for bool bitfields. */
+#if __STDC_VERSION__ < 199901
+typedef unsigned int bool_bf;
+#else
+typedef bool bool_bf;
+#endif
+
#define STREQ(a, b) (strcmp (a, b) == 0)
/* ISASCIIDIGIT differs from isdigit, as follows:
@@ -288,8 +295,8 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- bool has_backref; /* True if this state matches a \. */
- bool has_mbcset; /* True if this state matches a MBCSET. */
+ bool_bf has_backref : 1; /* True if this state matches a \. */
+ bool_bf has_mbcset : 1; /* True if this state matches a MBCSET. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -306,7 +313,7 @@ typedef ptrdiff_t state_num;
struct mb_char_classes
{
ptrdiff_t cset;
- int invert;
+ bool invert;
wchar_t *chars; /* Normal characters. */
size_t nchars;
wctype_t *ch_classes; /* Character classes. */
@@ -390,7 +397,7 @@ struct dfa
matching the given position in a string
matching the regexp. Allocated to the
maximum possible position index. */
- int searchflag; /* True if we are supposed to build a searching
+ bool searchflag; /* True if we are supposed to build a searching
as opposed to an exact matcher. A searching
matcher finds the first and shortest string
matching a regexp anywhere in the buffer,
@@ -669,7 +676,7 @@ notset (charclass s)
s[i] = ~s[i];
}
-static int
+static bool
equal (charclass const s1, charclass const s2)
{
return memcmp (s1, s2, sizeof (charclass)) == 0;
@@ -704,7 +711,7 @@ charclass_index (charclass const s)
static reg_syntax_t syntax_bits, syntax_bits_set;
/* Flag for case-folding letters into sets. */
-static int case_fold;
+static bool case_fold;
/* End-of-line byte in data. */
static unsigned char eolbyte;
@@ -759,7 +766,7 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
syntax_bits_set = 1;
syntax_bits = bits;
- case_fold = fold;
+ case_fold = fold != 0;
eolbyte = eol;
for (i = 0; i < NOTCHAR; ++i)
@@ -812,17 +819,14 @@ setbit_case_fold_c (int b, charclass c)
int
using_utf8 (void)
{
+#ifdef HAVE_LANGINFO_CODESET
static int utf8 = -1;
- if (utf8 == -1)
- {
-#if defined HAVE_LANGINFO_CODESET
- utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
+ if (utf8 < 0)
+ utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+ return utf8;
#else
- utf8 = 0;
+ return 0;
#endif
- }
-
- return utf8;
}
/* Return true if the current locale is known to be a unibyte locale
@@ -873,7 +877,7 @@ using_simple_locale (void)
static char const *lexptr; /* Pointer to next input character. */
static size_t lexleft; /* Number of characters remaining. */
static token lasttok; /* Previous token returned; initially END. */
-static int laststart; /* True if we're separated from beginning or (,
+static bool laststart; /* True if we're separated from beginning or (,
| only by zero-width characters. */
static size_t parens; /* Count of outstanding left parens. */
static int minrep, maxrep; /* Repeat counts for {m,n}. */
@@ -1009,7 +1013,7 @@ find_pred (const char *str)
static token
parse_bracket_exp (void)
{
- int invert;
+ bool invert;
int c, c1, c2;
charclass ccl;
@@ -1057,11 +1061,11 @@ parse_bracket_exp (void)
if (c == '^')
{
FETCH_WC (c, wc, _("unbalanced ["));
- invert = 1;
+ invert = true;
known_bracket_exp = using_simple_locale ();
}
else
- invert = 0;
+ invert = false;
colon_warning_state = (c == ':');
do
@@ -1279,7 +1283,7 @@ static token
lex (void)
{
unsigned int c, c2;
- int backslash = 0;
+ bool backslash = false;
charclass ccl;
int i;
@@ -1302,7 +1306,7 @@ lex (void)
goto normal_char;
if (lexleft == 0)
dfaerror (_("unfinished \\ escape"));
- backslash = 1;
+ backslash = true;
break;
case '^':
@@ -1340,7 +1344,7 @@ lex (void)
case '9':
if (backslash && !(syntax_bits & RE_NO_BK_REFS))
{
- laststart = 0;
+ laststart = false;
return lasttok = BACKREF;
}
goto normal_char;
@@ -1455,7 +1459,7 @@ lex (void)
lexptr = p;
lexleft = lim - p;
}
- laststart = 0;
+ laststart = false;
return lasttok = REPMN;
case '|':
@@ -1463,21 +1467,21 @@ lex (void)
goto normal_char;
if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
- laststart = 1;
+ laststart = true;
return lasttok = OR;
case '\n':
if (syntax_bits & RE_LIMITED_OPS
|| backslash || !(syntax_bits & RE_NEWLINE_ALT))
goto normal_char;
- laststart = 1;
+ laststart = true;
return lasttok = OR;
case '(':
if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
++parens;
- laststart = 1;
+ laststart = true;
return lasttok = LPAREN;
case ')':
@@ -1486,7 +1490,7 @@ lex (void)
if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
--parens;
- laststart = 0;
+ laststart = false;
return lasttok = RPAREN;
case '.':
@@ -1496,7 +1500,7 @@ lex (void)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
- laststart = 0;
+ laststart = false;
return lasttok = ANYCHAR;
}
zeroset (ccl);
@@ -1505,7 +1509,7 @@ lex (void)
clrbit (eolbyte, ccl);
if (syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
- laststart = 0;
+ laststart = false;
return lasttok = CSET + charclass_index (ccl);
case 's':
@@ -1520,7 +1524,7 @@ lex (void)
setbit (c2, ccl);
if (c == 'S')
notset (ccl);
- laststart = 0;
+ laststart = false;
return lasttok = CSET + charclass_index (ccl);
}
@@ -1550,7 +1554,7 @@ lex (void)
POP_LEX_STATE ();
- laststart = 0;
+ laststart = false;
return lasttok;
case 'w':
@@ -1563,18 +1567,18 @@ lex (void)
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
- laststart = 0;
+ laststart = false;
return lasttok = CSET + charclass_index (ccl);
case '[':
if (backslash)
goto normal_char;
- laststart = 0;
+ laststart = false;
return lasttok = parse_bracket_exp ();
default:
normal_char:
- laststart = 0;
+ laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
if (MB_CUR_MAX > 1)
@@ -1977,7 +1981,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
lexptr = s;
lexleft = len;
lasttok = END;
- laststart = 1;
+ laststart = true;
parens = 0;
if (MB_CUR_MAX > 1)
{
@@ -2323,7 +2327,7 @@ state_separate_contexts (position_set const *s)
void
dfaanalyze (struct dfa *d, int searchflag)
{
- int *nullable; /* Nullable stack. */
+ bool *nullable; /* Nullable stack. */
size_t *nfirstpos; /* Element count stack for firstpos sets. */
position *firstpos; /* Array where firstpos elements are stored. */
size_t *nlastpos; /* Element count stack for lastpos sets. */
@@ -2331,7 +2335,7 @@ dfaanalyze (struct dfa *d, int searchflag)
position_set tmp; /* Temporary set for merging sets. */
position_set merged; /* Result of merging sets. */
int separate_contexts; /* Context wanted by some position. */
- int *o_nullable;
+ bool *o_nullable;
size_t *o_nfirst, *o_nlast;
position *o_firstpos, *o_lastpos;
size_t i, j;
@@ -2347,7 +2351,7 @@ dfaanalyze (struct dfa *d, int searchflag)
putc ('\n', stderr);
#endif
- d->searchflag = searchflag;
+ d->searchflag = searchflag != 0;
MALLOC (nullable, d->depth);
o_nullable = nullable;
@@ -2369,7 +2373,7 @@ dfaanalyze (struct dfa *d, int searchflag)
{
case EMPTY:
/* The empty set is nullable. */
- *nullable++ = 1;
+ *nullable++ = true;
/* The firstpos and lastpos of the empty leaf are both empty. */
*nfirstpos++ = *nlastpos++ = 0;
@@ -2391,7 +2395,7 @@ dfaanalyze (struct dfa *d, int searchflag)
case QMARK:
/* A QMARK or STAR node is automatically nullable. */
if (d->tokens[i] != PLUS)
- nullable[-1] = 1;
+ nullable[-1] = true;
break;
case CAT:
@@ -2429,7 +2433,7 @@ dfaanalyze (struct dfa *d, int searchflag)
--nlastpos;
/* A CAT node is nullable if both arguments are nullable. */
- nullable[-2] = nullable[-1] && nullable[-2];
+ nullable[-2] &= nullable[-1];
--nullable;
break;
@@ -2443,7 +2447,7 @@ dfaanalyze (struct dfa *d, int searchflag)
--nlastpos;
/* An OR node is nullable if either argument is nullable. */
- nullable[-2] = nullable[-1] || nullable[-2];
+ nullable[-2] |= nullable[-1];
--nullable;
break;
@@ -2574,11 +2578,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
size_t ngrps = 0; /* Number of groups actually used. */
position pos; /* Current position being considered. */
charclass matches; /* Set of matching characters. */
- int matchesf; /* True if matches is nonempty. */
+ unsigned int matchesf; /* Nonzero if matches is nonempty. */
charclass intersect; /* Intersection with some label set. */
- int intersectf; /* True if intersect is nonempty. */
+ unsigned int intersectf; /* Nonzero if intersect is nonempty. */
charclass leftovers; /* Stuff in the label that didn't match. */
- int leftoversf; /* True if leftovers is nonempty. */
+ unsigned int leftoversf; /* Nonzero if leftovers is nonempty. */
position_set follows; /* Union of the follows of some group. */
position_set tmp; /* Temporary space for merging sets. */
int possible_contexts; /* Contexts that this group can match. */
@@ -2586,7 +2590,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_num state; /* New state. */
state_num state_newline; /* New state on a newline transition. */
state_num state_letter; /* New state on a letter transition. */
- int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
+ bool next_isnt_1st_byte = false; /* Flag if we can't add state0. */
size_t i, j, k;
MALLOC (grps, NOTCHAR);
@@ -2601,21 +2605,23 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->tokens[pos.index] == ANYCHAR
- || d->tokens[pos.index] == MBCSET)
- /* MB_CUR_MAX > 1 */
+ else
{
- /* ANYCHAR and MBCSET must match with a single character, so we
- must put it to d->states[s].mbps, which contains the positions
- which can match with a single character not a byte. */
- if (d->states[s].mbps.nelem == 0)
- alloc_position_set (&d->states[s].mbps, 1);
- insert (pos, &(d->states[s].mbps));
- d->states[s].has_mbcset |= (d->tokens[pos.index] == MBCSET);
+ if (d->tokens[pos.index] == MBCSET
+ || d->tokens[pos.index] == ANYCHAR)
+ {
+ /* MB_CUR_MAX > 1 */
+ if (d->tokens[pos.index] == MBCSET)
+ d->states[s].has_mbcset = true;
+ /* ANYCHAR and MBCSET must match with a single character, so we
+ must put it to d->states[s].mbps, which contains the positions
+ which can match with a single character not a byte. */
+ if (d->states[s].mbps.nelem == 0)
+ alloc_position_set (&d->states[s].mbps, 1);
+ insert (pos, &(d->states[s].mbps));
+ }
continue;
}
- else
- continue;
/* Some characters may need to be eliminated from matches because
they fail in the current context. */
@@ -2654,7 +2660,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
matches. */
intersectf = 0;
for (k = 0; k < CHARCLASS_INTS; ++k)
- (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
+ intersectf |= intersect[k] = matches[k] & labels[j][k];
if (!intersectf)
continue;
@@ -2665,8 +2671,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* Even an optimizing compiler can't know this for sure. */
int match = matches[k], label = labels[j][k];
- (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
- (matches[k] = match & ~label) ? (matchesf = 1) : 0;
+ leftoversf |= leftovers[k] = ~match & label;
+ matchesf |= matches[k] = match & ~label;
}
/* If there were leftovers, create a new group labeled with them. */
@@ -2763,12 +2769,12 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
codepoint of , it must not be but 2nd byte of
, so we cannot add state[0]. */
- next_isnt_1st_byte = 0;
+ next_isnt_1st_byte = false;
for (j = 0; j < follows.nelem; ++j)
{
if (!(d->multibyte_prop[follows.elems[j].index] & 1))
{
- next_isnt_1st_byte = 1;
+ next_isnt_1st_byte = true;
break;
}
}
@@ -3052,7 +3058,7 @@ static int
match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
{
size_t i;
- int match; /* Matching succeeded. */
+ bool match; /* Matching succeeded. */
int match_len; /* Length of the character (or collating element)
with which this operator matches. */
int op_len; /* Length of the operator. */
@@ -4040,14 +4046,14 @@ dfamust (struct dfa *d)
char *result;
size_t ri;
size_t i;
- int exact;
+ bool exact;
token t;
static must must0;
struct dfamust *dm;
static char empty_string[] = "";
result = empty_string;
- exact = 0;
+ exact = false;
MALLOC (musts, d->tindex + 1);
mp = musts;
for (i = 0; i <= d->tindex; ++i)
@@ -4142,7 +4148,7 @@ dfamust (struct dfa *d)
if (strlen (musts[0].in[i]) > strlen (result))
result = musts[0].in[i];
if (STREQ (result, musts[0].is))
- exact = 1;
+ exact = true;
goto done;
case CAT:
assert (&musts[2] <= mp);
--
1.9.0