From 6918bc2493ff7ae21c50a3b4d358ae3028374a09 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Tue, 13 May 2014 10:30:21 +0900 Subject: [PATCH] dfa: speed-up at initial state DFA state is always 0 until have found potential match. So we improve matching there by continuing to use the transition table. * src/dfa.c (skip_remains_mb): New function. (dfaexec): Speed-up at initial state. --- src/dfa.c | 54 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 8ff29d0..1d73105 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3250,6 +3250,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, return s1; } +/* The initial state may encounter a byte which is not a single byte character + nor the first byte of a multibyte character. But it is incorrect for the + initial state to accept such a byte. For example, in Shift JIS the regular + expression "\\" accepts the codepoint 0x5c, but should not accept the second + byte of the codepoint 0x815c. Then the initial state must skip the bytes + that are not a single byte character nor the first byte of a multibyte + character. */ +static unsigned char const * +skip_remains_mb (struct dfa *d, unsigned char const *p, + unsigned char const *mbp, char const *end) +{ + wint_t wc; + while (mbp < p) + mbp += mbs_to_wchar (&wc, (char const *) mbp, + end - (char const *) mbp, d); + return mbp; +} + /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to @@ -3303,27 +3321,18 @@ dfaexec (struct dfa *d, char const *begin, char *end, if (s == 0) { - /* The initial state may encounter a byte which is not - a single byte character nor the first byte of a - multibyte character. But it is incorrect for the - initial state to accept such a byte. For example, - in Shift JIS the regular expression "\\" accepts - the codepoint 0x5c, but should not accept the second - byte of the codepoint 0x815c. Then the initial - state must skip the bytes that are not a single - byte character nor the first byte of a multibyte - character. */ - wint_t wc; - while (mbp < p) - mbp += mbs_to_wchar (&wc, (char const *) mbp, - end - (char const *) mbp, d); - p = mbp; - - if ((char *) p > end) + if (d->states[s].mbps.nelem == 0) { - p = NULL; - goto done; + do + { + while (t[*p] == 0) + p++; + p = mbp = skip_remains_mb (d, p, mbp, end); + } + while (t[*p] == 0); } + else + p = mbp = skip_remains_mb (d, p, mbp, end); } if (d->states[s].mbps.nelem == 0) @@ -3351,6 +3360,13 @@ dfaexec (struct dfa *d, char const *begin, char *end, } else { + if (s == 0 && (t = trans[s]) != NULL) + { + while (t[*p] == 0) + p++; + s = t[*p++]; + } + while ((t = trans[s]) != NULL) { s1 = t[*p++]; -- 1.9.3