From 37f96cdd238a2e4a8ef5442d26eb4f3d97062446 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka
Date: Sun, 6 Apr 2014 13:42:08 -0700
Subject: [PATCH 1/2] grep: optimization with the superset of DFA
The superset of a DFA is like the DFA, except that for speed
ANYCHAR, MBCSET and BACKREF are replaced by (CSET full bits) STAR,
and mb_cur_max is 1. For example, for 'a\(b\)c\1':
original: a b CAT c CAT BACKREF CAT
superset: a b CAT c CAT CSET STAR CAT (The CSET has all bits set.)
If a string matches a DFA, it matches the DFA's superset.
Using the superset to filter can dramatically improve performance,
over 200x in some cases. See .
* src/dfa.c (struct dfa): New member 'superset'.
(dfahint, dfasuperset): New functions.
(dfacomp): Create and analyze the superset.
(dfafree): Free only non-NULL items.
(dfaalloc): Initialize superset member.
(dfaoptimize): If succeed in optimization for UTF-8 locale, don't use
the superset.
* src/dfa.h (dfahint): New decl.
* src/dfasearch.c (EGexecute): Use dfahint.
---
src/dfa.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
src/dfa.h | 3 ++
src/dfasearch.c | 63 +++++++++++++++++++++------
3 files changed, 173 insertions(+), 25 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index ef5c8a9..d88d077 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -373,6 +373,9 @@ struct dfa
size_t nmbcsets;
size_t mbcsets_alloc;
+ /* Fields filled by the superset. */
+ struct dfa *superset; /* Hint of the dfa. */
+
/* Fields filled by the state builder. */
dfa_state *states; /* States of the dfa. */
state_num sindex; /* Index for adding new states. */
@@ -3488,6 +3491,21 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
}
+size_t
+dfahint (struct dfa *d, char const *begin, char *end, size_t *count)
+{
+ char const *match;
+
+ if (d->superset == NULL)
+ return (size_t) -2;
+
+ match = dfaexec (d->superset, begin, end, 1, count, NULL);
+ if (match == NULL)
+ return (size_t) -1;
+
+ return match - begin;
+}
+
static void
free_mbdata (struct dfa *d)
{
@@ -3579,6 +3597,81 @@ dfaoptimize (struct dfa *d)
d->mb_cur_max = 1;
}
+static void
+dfasuperset (struct dfa *d)
+{
+ size_t i, j;
+ charclass ccl;
+ bool have_achar = false;
+ bool have_nchar = false;
+
+ dfa = d->superset = dfaalloc ();
+ *d->superset = *d;
+ d->superset->mb_cur_max = 1;
+ d->superset->multibyte_prop = NULL;
+ d->superset->mbcsets = NULL;
+ d->superset->superset = NULL;
+ d->superset->states = NULL;
+ d->superset->follows = NULL;
+ d->superset->trans = NULL;
+ d->superset->realtrans = NULL;
+ d->superset->fails = NULL;
+ d->superset->success = NULL;
+ d->superset->newlines = NULL;
+ d->superset->musts = NULL;
+
+ MALLOC (d->superset->charclasses, d->superset->calloc);
+ for (i = 0; i < d->cindex; i++)
+ copyset (d->charclasses[i], d->superset->charclasses[i]);
+
+ d->superset->talloc = d->tindex * 2;
+ MALLOC (d->superset->tokens, d->superset->talloc);
+
+ for (i = j = 0; i < d->tindex; i++)
+ {
+ switch (d->tokens[i])
+ {
+ case ANYCHAR:
+ case MBCSET:
+ case BACKREF:
+ zeroset (ccl);
+ notset (ccl);
+ d->superset->tokens[j++] = CSET + charclass_index (ccl);
+ d->superset->tokens[j++] = STAR;
+ if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
+ || d->tokens[i + 1] == PLUS)
+ i++;
+ have_achar = true;
+ break;
+ case BEGWORD:
+ case ENDWORD:
+ case LIMWORD:
+ case NOTLIMWORD:
+ if (MB_CUR_MAX > 1)
+ {
+ /* Ignore these constraints. */
+ d->superset->tokens[j++] = EMPTY;
+ break;
+ }
+ default:
+ d->superset->tokens[j++] = d->tokens[i];
+ if ((d->tokens[i] >= 0 && d->tokens[i] < NOTCHAR)
+ || d->tokens[i] >= CSET)
+ have_nchar = true;
+ break;
+ }
+ }
+
+ if ((d->mb_cur_max == 1 && !have_achar) || !have_nchar)
+ {
+ dfafree (d->superset);
+ d->superset = NULL;
+ return;
+ }
+
+ d->superset->tindex = j;
+}
+
/* Parse and analyze a single string of the given length. */
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
@@ -3588,7 +3681,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);
+ dfasuperset (d);
dfaanalyze (d, searchflag);
+ if (d->superset != NULL)
+ dfaanalyze (d->superset, searchflag);
}
/* Free the storage held by the components of a dfa. */
@@ -3604,30 +3700,42 @@ dfafree (struct dfa *d)
if (d->mb_cur_max > 1)
free_mbdata (d);
- for (i = 0; i < d->sindex; ++i)
+ if (d->states != NULL)
{
- free (d->states[i].elems.elems);
- free (d->states[i].mbps.elems);
+ for (i = 0; i < d->sindex; ++i)
+ {
+ free (d->states[i].elems.elems);
+ free (d->states[i].mbps.elems);
+ }
+ free (d->states);
}
- free (d->states);
- for (i = 0; i < d->tindex; ++i)
- free (d->follows[i].elems);
- free (d->follows);
- for (i = 0; i < d->tralloc; ++i)
+ if (d->follows != NULL)
{
- free (d->trans[i]);
- free (d->fails[i]);
+ for (i = 0; i < d->tindex; ++i)
+ free (d->follows[i].elems);
+ free (d->follows);
}
+ if (d->trans != NULL)
+ for (i = 0; i < d->tralloc; ++i)
+ {
+ free (d->trans[i]);
+ free (d->fails[i]);
+ }
+
free (d->realtrans);
free (d->fails);
free (d->newlines);
free (d->success);
+
for (dm = d->musts; dm; dm = ndm)
{
ndm = dm->next;
free (dm->must);
free (dm);
}
+
+ if (d->superset != NULL)
+ dfafree (d->superset);
}
/* Having found the postfix representation of the regular expression,
@@ -4135,7 +4243,9 @@ done:
struct dfa *
dfaalloc (void)
{
- return xmalloc (sizeof (struct dfa));
+ struct dfa *d = xmalloc (sizeof (struct dfa));
+ d->superset = NULL;
+ return d;
}
struct dfamust *_GL_ATTRIBUTE_PURE
diff --git a/src/dfa.h b/src/dfa.h
index 24fbcbe..ad97498 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -67,6 +67,9 @@ extern void dfacomp (char const *, size_t, struct dfa *, int);
to decide whether to fall back on a backtracking matcher. */
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
int newline, size_t *count, int *backref);
+extern size_t dfahint (struct dfa *d, char const *begin, char *end,
+ size_t *count);
+
/* Free the storage held by the components of a struct dfa. */
extern void dfafree (struct dfa *);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index ca00505..383c2ad 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -236,7 +236,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
match = beg;
while (beg > buf && beg[-1] != eol)
--beg;
- char const *dfa_start = beg;
if (kwsm.index < kwset_exact_matches)
{
if (mb_start < beg)
@@ -248,29 +247,65 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
beginning of the next character. */
- dfa_start = mb_start;
+ if (dfaexec (dfa, mb_start, (char *) end, 0, NULL,
+ &backref) == NULL)
+ continue;
+ }
+ else
+ {
+ if (dfahint (dfa, beg, (char *) end, NULL) ==
+ (size_t) -1)
+ continue;
+ if (dfaexec (dfa, beg, (char *) end, 0, NULL,
+ &backref) == NULL)
+ continue;
}
- if (dfaexec (dfa, dfa_start, (char *) end, 0, NULL,
- &backref) == NULL)
- continue;
}
else
{
/* No good fixed strings; start with DFA. */
- char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
- 0, NULL, &backref);
- /* If there's no match, or if we've matched the sentinel,
- we're done. */
- if (next_beg == NULL || next_beg == buflim)
- break;
+ size_t offset, count;
+ char const *next_beg;
+ count = 0;
+ offset = dfahint (dfa, beg, (char *) buflim, &count);
+ if (offset == (size_t) -1)
+ goto failure;
+ if (offset == (size_t) -2)
+ {
+ /* No use hint. */
+ next_beg = dfaexec (dfa, beg, (char *) buflim, 0,
+ NULL, &backref);
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == buflim)
+ goto failure;
+ }
+ else
+ next_beg = beg + offset;
/* Narrow down to the line we've found. */
beg = next_beg;
- if ((end = memchr(beg, eol, buflim - beg)) != NULL)
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ if (count > 0)
+ {
+ /* dfahint() may match in multiple lines. If that is
+ the case, try to match in one line. */
+ end = beg;
+ continue;
+ }
+ if ((end = memchr(next_beg, eol, buflim - beg)) != NULL)
end++;
else
end = buflim;
- while (beg > buf && beg[-1] != eol)
- --beg;
+ if (offset != (size_t) -2)
+ {
+ next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL,
+ &backref);
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == end)
+ continue;
+ }
}
/* Successful, no backreferences encountered! */
if (!backref)
--
1.9.0