From 37f96cdd238a2e4a8ef5442d26eb4f3d97062446 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 6 Apr 2014 13:42:08 -0700 Subject: [PATCH 1/2] grep: optimization with the superset of DFA The superset of a DFA is like the DFA, except that for speed ANYCHAR, MBCSET and BACKREF are replaced by (CSET full bits) STAR, and mb_cur_max is 1. For example, for 'a\(b\)c\1': original: a b CAT c CAT BACKREF CAT superset: a b CAT c CAT CSET STAR CAT (The CSET has all bits set.) If a string matches a DFA, it matches the DFA's superset. Using the superset to filter can dramatically improve performance, over 200x in some cases. See . * src/dfa.c (struct dfa): New member 'superset'. (dfahint, dfasuperset): New functions. (dfacomp): Create and analyze the superset. (dfafree): Free only non-NULL items. (dfaalloc): Initialize superset member. (dfaoptimize): If succeed in optimization for UTF-8 locale, don't use the superset. * src/dfa.h (dfahint): New decl. * src/dfasearch.c (EGexecute): Use dfahint. --- src/dfa.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++----- src/dfa.h | 3 ++ src/dfasearch.c | 63 +++++++++++++++++++++------ 3 files changed, 173 insertions(+), 25 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index ef5c8a9..d88d077 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -373,6 +373,9 @@ struct dfa size_t nmbcsets; size_t mbcsets_alloc; + /* Fields filled by the superset. */ + struct dfa *superset; /* Hint of the dfa. */ + /* Fields filled by the state builder. */ dfa_state *states; /* States of the dfa. */ state_num sindex; /* Index for adding new states. */ @@ -3488,6 +3491,21 @@ dfaexec (struct dfa *d, char const *begin, char *end, } } +size_t +dfahint (struct dfa *d, char const *begin, char *end, size_t *count) +{ + char const *match; + + if (d->superset == NULL) + return (size_t) -2; + + match = dfaexec (d->superset, begin, end, 1, count, NULL); + if (match == NULL) + return (size_t) -1; + + return match - begin; +} + static void free_mbdata (struct dfa *d) { @@ -3579,6 +3597,81 @@ dfaoptimize (struct dfa *d) d->mb_cur_max = 1; } +static void +dfasuperset (struct dfa *d) +{ + size_t i, j; + charclass ccl; + bool have_achar = false; + bool have_nchar = false; + + dfa = d->superset = dfaalloc (); + *d->superset = *d; + d->superset->mb_cur_max = 1; + d->superset->multibyte_prop = NULL; + d->superset->mbcsets = NULL; + d->superset->superset = NULL; + d->superset->states = NULL; + d->superset->follows = NULL; + d->superset->trans = NULL; + d->superset->realtrans = NULL; + d->superset->fails = NULL; + d->superset->success = NULL; + d->superset->newlines = NULL; + d->superset->musts = NULL; + + MALLOC (d->superset->charclasses, d->superset->calloc); + for (i = 0; i < d->cindex; i++) + copyset (d->charclasses[i], d->superset->charclasses[i]); + + d->superset->talloc = d->tindex * 2; + MALLOC (d->superset->tokens, d->superset->talloc); + + for (i = j = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case ANYCHAR: + case MBCSET: + case BACKREF: + zeroset (ccl); + notset (ccl); + d->superset->tokens[j++] = CSET + charclass_index (ccl); + d->superset->tokens[j++] = STAR; + if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR + || d->tokens[i + 1] == PLUS) + i++; + have_achar = true; + break; + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (MB_CUR_MAX > 1) + { + /* Ignore these constraints. */ + d->superset->tokens[j++] = EMPTY; + break; + } + default: + d->superset->tokens[j++] = d->tokens[i]; + if ((d->tokens[i] >= 0 && d->tokens[i] < NOTCHAR) + || d->tokens[i] >= CSET) + have_nchar = true; + break; + } + } + + if ((d->mb_cur_max == 1 && !have_achar) || !have_nchar) + { + dfafree (d->superset); + d->superset = NULL; + return; + } + + d->superset->tindex = j; +} + /* Parse and analyze a single string of the given length. */ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) @@ -3588,7 +3681,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfaparse (s, len, d); dfamust (d); dfaoptimize (d); + dfasuperset (d); dfaanalyze (d, searchflag); + if (d->superset != NULL) + dfaanalyze (d->superset, searchflag); } /* Free the storage held by the components of a dfa. */ @@ -3604,30 +3700,42 @@ dfafree (struct dfa *d) if (d->mb_cur_max > 1) free_mbdata (d); - for (i = 0; i < d->sindex; ++i) + if (d->states != NULL) { - free (d->states[i].elems.elems); - free (d->states[i].mbps.elems); + for (i = 0; i < d->sindex; ++i) + { + free (d->states[i].elems.elems); + free (d->states[i].mbps.elems); + } + free (d->states); } - free (d->states); - for (i = 0; i < d->tindex; ++i) - free (d->follows[i].elems); - free (d->follows); - for (i = 0; i < d->tralloc; ++i) + if (d->follows != NULL) { - free (d->trans[i]); - free (d->fails[i]); + for (i = 0; i < d->tindex; ++i) + free (d->follows[i].elems); + free (d->follows); } + if (d->trans != NULL) + for (i = 0; i < d->tralloc; ++i) + { + free (d->trans[i]); + free (d->fails[i]); + } + free (d->realtrans); free (d->fails); free (d->newlines); free (d->success); + for (dm = d->musts; dm; dm = ndm) { ndm = dm->next; free (dm->must); free (dm); } + + if (d->superset != NULL) + dfafree (d->superset); } /* Having found the postfix representation of the regular expression, @@ -4135,7 +4243,9 @@ done: struct dfa * dfaalloc (void) { - return xmalloc (sizeof (struct dfa)); + struct dfa *d = xmalloc (sizeof (struct dfa)); + d->superset = NULL; + return d; } struct dfamust *_GL_ATTRIBUTE_PURE diff --git a/src/dfa.h b/src/dfa.h index 24fbcbe..ad97498 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -67,6 +67,9 @@ extern void dfacomp (char const *, size_t, struct dfa *, int); to decide whether to fall back on a backtracking matcher. */ extern char *dfaexec (struct dfa *d, char const *begin, char *end, int newline, size_t *count, int *backref); +extern size_t dfahint (struct dfa *d, char const *begin, char *end, + size_t *count); + /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); diff --git a/src/dfasearch.c b/src/dfasearch.c index ca00505..383c2ad 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -236,7 +236,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, match = beg; while (beg > buf && beg[-1] != eol) --beg; - char const *dfa_start = beg; if (kwsm.index < kwset_exact_matches) { if (mb_start < beg) @@ -248,29 +247,65 @@ EGexecute (char const *buf, size_t size, size_t *match_size, /* The matched line starts in the middle of a multibyte character. Perform the DFA search starting from the beginning of the next character. */ - dfa_start = mb_start; + if (dfaexec (dfa, mb_start, (char *) end, 0, NULL, + &backref) == NULL) + continue; + } + else + { + if (dfahint (dfa, beg, (char *) end, NULL) == + (size_t) -1) + continue; + if (dfaexec (dfa, beg, (char *) end, 0, NULL, + &backref) == NULL) + continue; } - if (dfaexec (dfa, dfa_start, (char *) end, 0, NULL, - &backref) == NULL) - continue; } else { /* No good fixed strings; start with DFA. */ - char const *next_beg = dfaexec (dfa, beg, (char *) buflim, - 0, NULL, &backref); - /* If there's no match, or if we've matched the sentinel, - we're done. */ - if (next_beg == NULL || next_beg == buflim) - break; + size_t offset, count; + char const *next_beg; + count = 0; + offset = dfahint (dfa, beg, (char *) buflim, &count); + if (offset == (size_t) -1) + goto failure; + if (offset == (size_t) -2) + { + /* No use hint. */ + next_beg = dfaexec (dfa, beg, (char *) buflim, 0, + NULL, &backref); + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == buflim) + goto failure; + } + else + next_beg = beg + offset; /* Narrow down to the line we've found. */ beg = next_beg; - if ((end = memchr(beg, eol, buflim - beg)) != NULL) + while (beg > buf && beg[-1] != eol) + --beg; + if (count > 0) + { + /* dfahint() may match in multiple lines. If that is + the case, try to match in one line. */ + end = beg; + continue; + } + if ((end = memchr(next_beg, eol, buflim - beg)) != NULL) end++; else end = buflim; - while (beg > buf && beg[-1] != eol) - --beg; + if (offset != (size_t) -2) + { + next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL, + &backref); + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == end) + continue; + } } /* Successful, no backreferences encountered! */ if (!backref) -- 1.9.0