From 0ac87ef3d35e1bd14dc0cf1d3c4a62e335c88f5e Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 15 Dec 2014 23:40:17 +0900 Subject: [PATCH] dfa: improvement for checking of multibyte character boundary When found single bytes that cannot occur inside a multibyte character we can skip check for multibyte character boundary before the character. The improvement speeds up about 40% for input string which doesn't match even the first part of a pattern. * src/dfa.c (always_character_boundary): Add a new variable. It caches whether each byte can occur inside a multibyte character or not. (dfaalwayscb): Add a new function. (dfacomp): Use it. (skip_remains_mb): If an input character is single bytes that cannot occur inside a multibyte character, skip check for multibyte character boundary until there. --- src/dfa.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 806cb04..2406387 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -451,6 +451,29 @@ struct dfa static void dfamust (struct dfa *dfa); static void regexp (void); +/* True if each byte can not occur inside a multibyte character */ +static bool always_character_boundary[NOTCHAR]; + +static void +dfaalwayscb (void) +{ + int i; + if (using_utf8 ()) + { + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + always_character_boundary[uc] = !((uc & 0xc0) ^ 0x80); + } + } + else + { + unsigned char const ucs[] = { '\0', '\n', '\r', '.', '/' }; + for (i = 0; i < sizeof ucs / sizeof ucs[0]; ++i) + always_character_boundary[ucs[i]] = true; + } +} + static void dfambcache (struct dfa *d) { @@ -3271,18 +3294,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, character. Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or - exceeds P. If WCP is non-NULL, set *WCP to the final wide character - processed, or if no wide character is processed, set it to WEOF. - Both P and MBP must be no larger than END. */ + exceeds P. If WCP is non-NULL and the result is greater than p, set + *WCP to the final wide character processed, or if no wide character + is processed, set it to WEOF. Both P and MBP must be no larger than + END. + + If next character is character boundary, it always reaches P. So if + WCP is NULL, can omit check step by step and immediately return P. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, unsigned char const *mbp, char const *end, wint_t *wcp) { wint_t wc = WEOF; + if (always_character_boundary[*p]) + return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); - if (wcp != NULL) + if (wcp != NULL && p < mbp) *wcp = wc; return mbp; } @@ -3713,6 +3742,7 @@ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { dfainit (d); + dfaalwayscb (); dfambcache (d); dfaparse (s, len, d); dfamust (d); -- 2.2.0