From e2a434467a616d1da72f190176b90ba6d3ea578f Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 3 Nov 2014 08:12:40 +0900 Subject: [PATCH 1/2] dfa: avoid execution for a pattern including an unsupported expression If a pattern includes unsupported expression by DFA matcher, the search with the matcher will fail in most cases. So dfaexec immediately returns for the pattern. src/dfa.c (struct dfa_state): Remove 'has_backref' and 'has_mbcset'. All uses removed. (dfaexec_main): Remove 'backref'. All uses and callers removed. (dfaexec_br): New function, always sets a backreference bit and returns a beginning pointer in an input. When a pattern includes any unsupported expression by DFA matcher, the function is used. (dfabackref): Test wether patterns has any unsupported expression by DFA matcher or not. (dfassbuild): Remove no longer unsed code. (dfacomp): If A pattern has any unsupported expression by DFA matcher, avoid analysis for the pattern. --- src/dfa.c | 68 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 806cb04..49c9505 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -284,8 +284,6 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - bool has_backref; /* This state matches a \. */ - bool has_mbcset; /* This state matches a MBCSET. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -2152,8 +2150,6 @@ state_index (struct dfa *d, position_set const *s, int context) alloc_position_set (&d->states[i].elems, s->nelem); copy (s, &d->states[i].elems); d->states[i].context = context; - d->states[i].has_backref = false; - d->states[i].has_mbcset = false; d->states[i].constraint = 0; d->states[i].first_end = 0; d->states[i].mbps.nelem = 0; @@ -2169,10 +2165,7 @@ state_index (struct dfa *d, position_set const *s, int context) d->states[i].first_end = d->tokens[s->elems[j].index]; } else if (d->tokens[s->elems[j].index] == BACKREF) - { - d->states[i].constraint = NO_CONSTRAINT; - d->states[i].has_backref = true; - } + d->states[i].constraint = NO_CONSTRAINT; ++d->sindex; @@ -2627,9 +2620,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (d->tokens[pos.index] == MBCSET || d->tokens[pos.index] == ANYCHAR) { - /* MB_CUR_MAX > 1 */ - if (d->tokens[pos.index] == MBCSET) - d->states[s].has_mbcset = true; /* ANYCHAR and MBCSET must match with a single character, so we must put it to d->states[s].mbps, which contains the positions which can match with a single character not a byte. */ @@ -3304,8 +3294,8 @@ skip_remains_mb (struct dfa *d, unsigned char const *p, encoding-error bytes. Otherwise, the input consists of single-byte characters. */ static inline char * -dfaexec_main (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref, bool multibyte) +dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, + size_t *count, bool multibyte) { state_num s, s1; /* Current state. */ unsigned char const *p, *mbp; /* Current input character. */ @@ -3395,16 +3385,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, Use a macro to avoid the risk that they diverge. */ #define State_transition() \ do { \ - /* Falling back to the glibc matcher in this case gives \ - better performance (up to 25% better on [a-z], for \ - example) and enables support for collating symbols and \ - equivalence classes. */ \ - if (d->states[s].has_mbcset && backref) \ - { \ - *backref = 1; \ - goto done; \ - } \ - \ /* Can match with a multibyte character (and multi-character \ collating element). Transition table might be updated. */ \ s = transit_state (d, s, &p, (unsigned char *) end); \ @@ -3478,11 +3458,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, if (d->fails[s]) { if (d->success[s] & sbit[*p]) - { - if (backref) - *backref = d->states[s].has_backref; - goto done; - } + goto done; s1 = s; if (multibyte) @@ -3512,14 +3488,22 @@ static char * dfaexec_mb (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { - return dfaexec_main (d, begin, end, allow_nl, count, backref, true); + return dfaexec_main (d, begin, end, allow_nl, count, true); } static char * dfaexec_sb (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { - return dfaexec_main (d, begin, end, allow_nl, count, backref, false); + return dfaexec_main (d, begin, end, allow_nl, count, false); +} + +static char * +dfaexec_br (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + *backref = 1; + return (char *) begin; } /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte), @@ -3585,6 +3569,19 @@ dfainit (struct dfa *d) d->fast = !d->multibyte; } +bool +dfabackref (struct dfa *d) +{ + size_t i; + for (i = 0; i < d->tindex; i++) + if (d->tokens[i] == BACKREF || d->tokens[i] == MBCSET) + { + d->dfaexec = dfaexec_br; + return true; + } + return false; +} + static void dfaoptimize (struct dfa *d) { @@ -3683,10 +3680,8 @@ dfassbuild (struct dfa *d) if (d->multibyte) { /* These constraints aren't supported in a multibyte locale. - Ignore them in the superset DFA, and treat them as - backreferences in the main DFA. */ + Ignore them in the superset DFA. */ sup->tokens[j++] = EMPTY; - d->tokens[i] = BACKREF; break; } default: @@ -3717,8 +3712,11 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfaparse (s, len, d); dfamust (d); dfassbuild (d); - dfaoptimize (d); - dfaanalyze (d, searchflag); + if (!dfabackref (d)) + { + dfaoptimize (d); + dfaanalyze (d, searchflag); + } if (d->superset) { d->fast = true; -- 2.2.0