From 3d0c130808c974f1271561c7433b2aa661c49507 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 10 Jul 2016 10:26:39 +0900 Subject: [PATCH] dfa: use algorithm for single byte character to any single byte character in input text always Even in non-UTF8 locales, if a character at current position in input text is single byte, we can use CSET to match ANYCHAR. * src/dfa.c (charclass_index_anychar): New var. Cache index of CSET for ANYCHAR. (lex): Make CSET for ANYCHAR. (state_index): Simplify. (dfastate): Consider CSET for ANYCHAR. (transit_state_singlebyte, transit_state): Remove handling for eolbyte, as we assume that eolbyte does not appear at current position. (dfaexec_main): use algorithm for single byte character to any single byte character in input text always. --- src/dfa.c | 115 ++++++++++++++++++++++++------------------------------------ 1 files changed, 46 insertions(+), 69 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 3bd0c68..e932b2c 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -79,6 +79,8 @@ enum CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS }; +static size_t charclass_index_anychar = -1; + /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ typedef charclass_word charclass[CHARCLASS_WORDS]; @@ -1429,21 +1431,25 @@ lex (void) case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (charclass_index_anychar == (size_t) -1) { - /* In multibyte environment period must match with a single - character not a byte. So we use ANYCHAR. */ - laststart = false; - return lasttok = ANYCHAR; + zeroset (ccl); + notset (ccl); + if (!(syntax_bits & RE_DOT_NEWLINE)) + clrbit ('\n', ccl); + if (syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', ccl); + if (dfa->multibyte) + { + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (mbrtowc_cache[c2] == WEOF) + clrbit (c2, ccl); + } + charclass_index_anychar = charclass_index (ccl); } - zeroset (ccl); - notset (ccl); - if (!(syntax_bits & RE_DOT_NEWLINE)) - clrbit ('\n', ccl); - if (syntax_bits & RE_DOT_NOT_NULL) - clrbit ('\0', ccl); laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok = + dfa->multibyte ? ANYCHAR : CSET + charclass_index_anychar; case 's': case 'S': @@ -2125,7 +2131,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) d->states[i].constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + else if (d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = 0; if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_NEWLINE)) @@ -2588,14 +2594,16 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) - /* ANYCHAR must match with a single character, so we must put - it to D->states[s].mbps which contains the positions which - can match with a single character not a byte. If all - positions which has ANYCHAR does not depend on context of - next character, we put the follows instead of it to - D->states[s].mbps to optimize. */ + else if (d->tokens[pos.index] == ANYCHAR) { + copyset (d->charclasses[charclass_index_anychar], matches); + + /* ANYCHAR must match with a single character, so we must put + it to D->states[s].mbps which contains the positions which + can match with a single character not a byte. If all + positions which has ANYCHAR does not depend on context of + next character, we put the follows instead of it to + D->states[s].mbps to optimize. */ if (d->states[s].curr_dependent) { if (d->states[s].mbps.nelem == 0) @@ -2606,9 +2614,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) d->states[s].context, CTX_ANY)) { if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); + alloc_position_set (&d->states[s].mbps, + d->follows[pos.index].nelem); for (j = 0; j < d->follows[pos.index].nelem; ++j) - insert (d->follows[pos.index].elems[j], &(d->states[s].mbps)); + insert (d->follows[pos.index].elems[j], + &(d->states[s].mbps)); } } else @@ -2984,16 +2994,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) { state_num *t; - if (**pp == eolbyte) - { - /* S is always an initial state in transit_state, so the - transition table for the state must have been built already. */ - assert (d->trans[s] || d->fails[s]); - - ++*pp; - return d->newlines[s]; - } - if (d->trans[s]) t = d->trans[s]; else if (d->fails[s]) @@ -3020,15 +3020,12 @@ static state_num transit_state (struct dfa *d, state_num s, unsigned char const **pp, unsigned char const *end) { - state_num s1; + state_num s1, s2; wint_t wc; int separate_contexts; - state_num state, state_newline; size_t i, j; int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE; - bool context_newline = context == CTX_NEWLINE; /* This state has some operators which can match a multibyte character. */ d->mb_follows.nelem = 0; @@ -3051,7 +3048,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, for (i = 0; i < d->states[s1].mbps.nelem; ++i) { if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint, - d->states[s1].context, context)) + d->states[s1].context, CTX_NONE)) continue; for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) @@ -3060,10 +3057,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, } separate_contexts = state_separate_contexts (&d->mb_follows); - if (context_newline && separate_contexts & CTX_NEWLINE) - s = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); realloc_trans_if_necessary (d, s); return s; @@ -3091,17 +3085,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, d->states[s1].mb_trindex = d->mb_trcount++; } - size_t mb_index = d->states[s1].mb_trindex << 1 | (context_newline - ? 1 : 0); - if (d->mb_trans[s] == NULL) { - d->mb_trans[s] = xnmalloc (2 * MAX_TRCOUNT, sizeof *d->mb_trans[s]); - for (i = 0; i < 2 * MAX_TRCOUNT; i++) + d->mb_trans[s] = xnmalloc (MAX_TRCOUNT, sizeof *d->mb_trans[s]); + for (i = 0; i < MAX_TRCOUNT; i++) d->mb_trans[s][i] = -1; } - else if (d->mb_trans[s][mb_index] >= 0) - return d->mb_trans[s][mb_index]; + else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) + return d->mb_trans[s][d->states[s1].mb_trindex]; if (s < 0) copy (&d->states[s1].mbps, &d->mb_follows); @@ -3109,17 +3100,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); separate_contexts = state_separate_contexts (&d->mb_follows); - state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - state_newline = state; - realloc_trans_if_necessary (d, state_newline); + s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + realloc_trans_if_necessary (d, s2); - d->mb_trans[s][mb_index & ~0] = state; - d->mb_trans[s][mb_index | 1] = state_newline; + d->mb_trans[s][d->states[s1].mb_trindex] = s2; - return context_newline ? state_newline : state; + return s2; } /* The initial state may encounter a byte which is not a single byte character @@ -3246,10 +3232,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, } } - if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) - || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + if (d->states[s].mbps.nelem == 0 + || mbrtowc_cache[*p] != WEOF || (char *) p >= end) { /* If an input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3258,8 +3242,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } @@ -3311,10 +3293,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, s1 = s; if (!multibyte || d->states[s].mbps.nelem == 0 - || (*p == eol && !allow_nl) - || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + || mbrtowc_cache[*p] != WEOF || (char *) p >= end) { /* If a input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3323,8 +3302,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } -- 1.7.1