From f41b1bcd57eee550af49bdbe3a7a243a218faf44 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Fri, 6 Jun 2014 19:08:08 +0900 Subject: [PATCH] dfa: build struct dfamust on demand If we won't use KWset, do not build a "struct dfamust". Now it is built only when needed. * src/dfa.c (struct dfa) [musts]: Remove member. (dfacomp): Don't build dfamust here. (dfamustfree): New function to free a struct dfamust. (dfamust): Make it a global function, and make it return a pointer to a malloc'd struct dfamust. (dfamusts): Remove it. * src/dfa.h (struct dfamust) [next]: Remove member. In the implementation preceding this patch, there was never more than one of these in a given "struct dfa". (dfamustfree, dfamust): Add prototypes. (dfamusts): Remove prototype. * src/dfasearch.c (kwsmusts): Adapt to use the new interface. Update the comments to reflect reality. This addresses http://bugs.gnu.org/17715 --- src/dfa.c | 60 ++++++++++++++++++++----------------------------------- src/dfa.h | 8 +++++--- src/dfasearch.c | 62 +++++++++++++++++++++++++++------------------------------ 3 files changed, 56 insertions(+), 74 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index c7b659e..9f024bb 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -429,9 +429,6 @@ struct dfa as a sentinel at the end of the buffer. */ state_num initstate_letter; /* Initial state for letter context. */ state_num initstate_others; /* Initial state for other contexts. */ - struct dfamust *musts; /* List of strings, at least one of which - is known to appear in any r.e. matching - the dfa. */ position_set mb_follows; /* Follow set added by ANYCHAR and/or MBCSET on demand. */ int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or @@ -448,7 +445,6 @@ struct dfa #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \ SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr) -static void dfamust (struct dfa *dfa); static void regexp (void); static void @@ -3647,7 +3643,6 @@ dfassbuild (struct dfa *d) sup->fails = NULL; sup->success = NULL; sup->newlines = NULL; - sup->musts = NULL; sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); if (d->cindex) @@ -3714,7 +3709,6 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfainit (d); dfambcache (d); dfaparse (s, len, d); - dfamust (d); dfassbuild (d); dfaoptimize (d); dfaanalyze (d, searchflag); @@ -3730,7 +3724,6 @@ void dfafree (struct dfa *d) { size_t i; - struct dfamust *dm, *ndm; free (d->charclasses); free (d->tokens); @@ -3766,13 +3759,6 @@ dfafree (struct dfa *d) free (d->success); } - for (dm = d->musts; dm; dm = ndm) - { - ndm = dm->next; - free (dm->must); - free (dm); - } - if (d->superset) dfafree (d->superset); } @@ -3793,9 +3779,7 @@ dfafree (struct dfa *d) sequences that must constitute the match ("is") When we get to the root of the tree, we use one of the longest of its - calculated "in" sequences as our answer. The sequence we find is returned in - d->must (where "d" is the single argument passed to "dfamust"); - the length of the sequence is returned in d->mustn. + calculated "in" sequences as our answer. The sequences calculated for the various types of node (in pseudo ANSI c) are shown below. "p" is the operand of unary operators (and the left-hand @@ -4019,8 +4003,8 @@ freemust (must *mp) free (mp); } -static void -dfamust (struct dfa *d) +struct dfamust * +dfamust (struct dfa const *d) { must *mp = NULL; char const *result = ""; @@ -4029,7 +4013,6 @@ dfamust (struct dfa *d) bool exact = false; bool begline = false; bool endline = false; - struct dfamust *dm; for (ri = 0; ri < d->tindex; ++ri) { @@ -4190,30 +4173,28 @@ dfamust (struct dfa *d) t = j; while (++j < NOTCHAR) if (tstbit (j, *ccl) - && ! (case_fold && !d->multibyte + && ! (case_fold && MB_CUR_MAX == 1 && toupper (j) == toupper (t))) break; if (j < NOTCHAR) break; } mp->is[0] = mp->left[0] = mp->right[0] - = case_fold && !d->multibyte ? toupper (t) : t; + = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t; mp->is[1] = mp->left[1] = mp->right[1] = '\0'; mp->in = enlist (mp->in, mp->is, 1); break; } } done: - if (*result) - { - dm = xmalloc (sizeof *dm); - dm->exact = exact; - dm->begline = begline; - dm->endline = endline; - dm->must = xstrdup (result); - dm->next = d->musts; - d->musts = dm; - } + if (!*result) + return NULL; + + struct dfamust *dm = xmalloc (sizeof *dm); + dm->exact = exact; + dm->begline = begline; + dm->endline = endline; + dm->must = xstrdup (result); while (mp) { @@ -4221,18 +4202,21 @@ done: freemust (mp); mp = prev; } + + return dm; } -struct dfa * -dfaalloc (void) +void +dfamustfree (struct dfamust *dm) { - return xmalloc (sizeof (struct dfa)); + free (dm->must); + free (dm); } -struct dfamust *_GL_ATTRIBUTE_PURE -dfamusts (struct dfa const *d) +struct dfa * +dfaalloc (void) { - return d->musts; + return xmalloc (sizeof (struct dfa)); } /* vim:set shiftwidth=2: */ diff --git a/src/dfa.h b/src/dfa.h index 9f95415..e4beb7e 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -30,7 +30,6 @@ struct dfamust bool begline; bool endline; char *must; - struct dfamust *next; }; /* The dfa structure. It is completely opaque. */ @@ -43,8 +42,11 @@ struct dfa; calling dfafree() on it. */ extern struct dfa *dfaalloc (void); -/* Return the dfamusts associated with a dfa. */ -extern struct dfamust *dfamusts (struct dfa const *); +/* Build and return the struct dfamust from the given struct dfa. */ +extern struct dfamust *dfamust (struct dfa const *); + +/* Free the storage held by the components of a struct dfamust. */ +extern void dfamustfree (struct dfamust *); /* dfasyntax() takes three arguments; the first sets the syntax bits described earlier in this file, the second sets the case-folding flag, and the diff --git a/src/dfasearch.c b/src/dfasearch.c index c8fc91b..8b8af06 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -86,41 +86,37 @@ dfawarn (char const *mesg) static void kwsmusts (void) { - struct dfamust const *dm = dfamusts (dfa); - if (dm) + struct dfamust *dm = dfamust (dfa); + if (!dm) + return; + kwsinit (&kwset); + if (dm->exact) { - kwsinit (&kwset); - /* First, we compile in the substrings known to be exact - matches. The kwset matcher will return the index - of the matching string that it chooses. */ - for (; dm; dm = dm->next) - { - if (!dm->exact) - continue; - ++kwset_exact_matches; - size_t old_len = strlen (dm->must); - size_t new_len = old_len + dm->begline + dm->endline; - char *must = xmalloc (new_len); - char *mp = must; - *mp = eolbyte; - mp += dm->begline; - begline |= dm->begline; - memcpy (mp, dm->must, old_len); - if (dm->endline) - mp[old_len] = eolbyte; - kwsincr (kwset, must, new_len); - free (must); - } - /* Now, we compile the substrings that will require - the use of the regexp matcher. */ - for (dm = dfamusts (dfa); dm; dm = dm->next) - { - if (dm->exact) - continue; - kwsincr (kwset, dm->must, strlen (dm->must)); - } - kwsprep (kwset); + /* Prepare a substring whose presence implies a match. + The kwset matcher will return the index of the matching + string that it chooses. */ + ++kwset_exact_matches; + size_t old_len = strlen (dm->must); + size_t new_len = old_len + dm->begline + dm->endline; + char *must = xmalloc (new_len); + char *mp = must; + *mp = eolbyte; + mp += dm->begline; + begline |= dm->begline; + memcpy (mp, dm->must, old_len); + if (dm->endline) + mp[old_len] = eolbyte; + kwsincr (kwset, must, new_len); + free (must); + } + else + { + /* Otherwise, filtering with this substring should help reduce the + search space, but we'll still have to use the regexp matcher. */ + kwsincr (kwset, dm->must, strlen (dm->must)); } + kwsprep (kwset); + dfamustfree (dm); } void -- 2.3.7