From e2a434467a616d1da72f190176b90ba6d3ea578f Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka
Date: Mon, 3 Nov 2014 08:12:40 +0900
Subject: [PATCH 1/2] dfa: avoid execution for a pattern including an
unsupported expression
If a pattern includes unsupported expression by DFA matcher, the
search with the matcher will fail in most cases. So dfaexec immediately
returns for the pattern.
src/dfa.c (struct dfa_state): Remove 'has_backref' and 'has_mbcset'.
All uses removed.
(dfaexec_main): Remove 'backref'.
All uses and callers removed.
(dfaexec_br): New function, always sets a backreference bit and returns
a beginning pointer in an input. When a pattern includes any
unsupported expression by DFA matcher, the function is used.
(dfabackref): Test wether patterns has any unsupported expression by DFA
matcher or not.
(dfassbuild): Remove no longer unsed code.
(dfacomp): If A pattern has any unsupported expression by DFA matcher,
avoid analysis for the pattern.
---
src/dfa.c | 68 +++++++++++++++++++++++++++++++--------------------------------
1 file changed, 33 insertions(+), 35 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 806cb04..49c9505 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -284,8 +284,6 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- bool has_backref; /* This state matches a \. */
- bool has_mbcset; /* This state matches a MBCSET. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -2152,8 +2150,6 @@ state_index (struct dfa *d, position_set const *s, int context)
alloc_position_set (&d->states[i].elems, s->nelem);
copy (s, &d->states[i].elems);
d->states[i].context = context;
- d->states[i].has_backref = false;
- d->states[i].has_mbcset = false;
d->states[i].constraint = 0;
d->states[i].first_end = 0;
d->states[i].mbps.nelem = 0;
@@ -2169,10 +2165,7 @@ state_index (struct dfa *d, position_set const *s, int context)
d->states[i].first_end = d->tokens[s->elems[j].index];
}
else if (d->tokens[s->elems[j].index] == BACKREF)
- {
- d->states[i].constraint = NO_CONSTRAINT;
- d->states[i].has_backref = true;
- }
+ d->states[i].constraint = NO_CONSTRAINT;
++d->sindex;
@@ -2627,9 +2620,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (d->tokens[pos.index] == MBCSET
|| d->tokens[pos.index] == ANYCHAR)
{
- /* MB_CUR_MAX > 1 */
- if (d->tokens[pos.index] == MBCSET)
- d->states[s].has_mbcset = true;
/* ANYCHAR and MBCSET must match with a single character, so we
must put it to d->states[s].mbps, which contains the positions
which can match with a single character not a byte. */
@@ -3304,8 +3294,8 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
encoding-error bytes. Otherwise, the input consists of single-byte
characters. */
static inline char *
-dfaexec_main (struct dfa *d, char const *begin, char *end,
- int allow_nl, size_t *count, int *backref, bool multibyte)
+dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl,
+ size_t *count, bool multibyte)
{
state_num s, s1; /* Current state. */
unsigned char const *p, *mbp; /* Current input character. */
@@ -3395,16 +3385,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
Use a macro to avoid the risk that they diverge. */
#define State_transition() \
do { \
- /* Falling back to the glibc matcher in this case gives \
- better performance (up to 25% better on [a-z], for \
- example) and enables support for collating symbols and \
- equivalence classes. */ \
- if (d->states[s].has_mbcset && backref) \
- { \
- *backref = 1; \
- goto done; \
- } \
- \
/* Can match with a multibyte character (and multi-character \
collating element). Transition table might be updated. */ \
s = transit_state (d, s, &p, (unsigned char *) end); \
@@ -3478,11 +3458,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
if (d->fails[s])
{
if (d->success[s] & sbit[*p])
- {
- if (backref)
- *backref = d->states[s].has_backref;
- goto done;
- }
+ goto done;
s1 = s;
if (multibyte)
@@ -3512,14 +3488,22 @@ static char *
dfaexec_mb (struct dfa *d, char const *begin, char *end,
int allow_nl, size_t *count, int *backref)
{
- return dfaexec_main (d, begin, end, allow_nl, count, backref, true);
+ return dfaexec_main (d, begin, end, allow_nl, count, true);
}
static char *
dfaexec_sb (struct dfa *d, char const *begin, char *end,
int allow_nl, size_t *count, int *backref)
{
- return dfaexec_main (d, begin, end, allow_nl, count, backref, false);
+ return dfaexec_main (d, begin, end, allow_nl, count, false);
+}
+
+static char *
+dfaexec_br (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ *backref = 1;
+ return (char *) begin;
}
/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte),
@@ -3585,6 +3569,19 @@ dfainit (struct dfa *d)
d->fast = !d->multibyte;
}
+bool
+dfabackref (struct dfa *d)
+{
+ size_t i;
+ for (i = 0; i < d->tindex; i++)
+ if (d->tokens[i] == BACKREF || d->tokens[i] == MBCSET)
+ {
+ d->dfaexec = dfaexec_br;
+ return true;
+ }
+ return false;
+}
+
static void
dfaoptimize (struct dfa *d)
{
@@ -3683,10 +3680,8 @@ dfassbuild (struct dfa *d)
if (d->multibyte)
{
/* These constraints aren't supported in a multibyte locale.
- Ignore them in the superset DFA, and treat them as
- backreferences in the main DFA. */
+ Ignore them in the superset DFA. */
sup->tokens[j++] = EMPTY;
- d->tokens[i] = BACKREF;
break;
}
default:
@@ -3717,8 +3712,11 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
dfaparse (s, len, d);
dfamust (d);
dfassbuild (d);
- dfaoptimize (d);
- dfaanalyze (d, searchflag);
+ if (!dfabackref (d))
+ {
+ dfaoptimize (d);
+ dfaanalyze (d, searchflag);
+ }
if (d->superset)
{
d->fast = true;
--
2.2.0