bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 1/4] dfa: change meaning of a state context


From: Paolo Bonzini
Subject: [PATCH 1/4] dfa: change meaning of a state context
Date: Sun, 5 Feb 2012 18:00:43 +0100

* src/dfa.c (MATCHES_NEWLINE_CONTEXT, MATCHES_LETTER_CONTEXT): New.
(state_separate_contexts): Remove second argument.
(state_index): Do not mask away CTX_NONE.
(dfaanalyze): Adjust call to state_index and state_separate_contexts.
(dfastate): Adjust calls to state_index and state_separate_contexts.
---
 src/dfa.c |  101 ++++++++++++++++++++++++++++++++-----------------------------
 1 files changed, 53 insertions(+), 48 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index b46cd35..dc487ff 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -93,11 +93,15 @@ static inline unsigned char to_uchar (char ch) { return ch; 
}
 
 /* Contexts tell us whether a character is a newline or a word constituent.
    Word-constituent characters are those that satisfy iswalnum(), plus '_'.
+   Each character has a single CTX_* value; bitmasks of CTX_* values denote
+   a particular character class.
 
-   A state also stores a context value, which is nonzero if its
-   predecessors always matches a newline or a word constituent.
-   The definition of a state's context is a bit unclear, but will be
-   modified soon anyway.  */
+   A state also stores a context value, which is a bitmask of CTX_* values.
+   A state's context represents a set of characters that the state's
+   predecessors must match.  For example, a state whose context does not
+   include CTX_LETTER will never have transitions where the previous
+   character is a word constituent.  A state whose context is CTX_ANY
+   might have transitions from any character.  */
 
 #define CTX_NONE       1
 #define CTX_LETTER     2
@@ -121,15 +125,15 @@ static inline unsigned char to_uchar (char ch) { return 
ch; }
    bit 0 - neither previous nor current is word-constituent
 
    The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
-   succeeds in a particular context.  Prev is the context value for
-   the previous character, curr is the context value for the lookahead
-   character. */
+   succeeds in a particular context.  Prev is a bitmask of possible
+   context values for the previous character, curr is the bitmask of
+   possible context values for the lookahead character. */
 #define MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
   ((constraint) & \
-   1 << (((prev & CTX_NEWLINE) ? 2 : 0) + ((curr & CTX_NEWLINE) ? 1 : 0) + 4))
+   1 << (((prev & ~CTX_NEWLINE) ? 0 : 2) + ((curr & ~CTX_NEWLINE) ? 0 : 1) + 
4))
 #define MATCHES_LETTER_CONTEXT(constraint, prev, curr) \
   ((constraint) & \
-   1 << (((prev & CTX_LETTER) ? 2 : 0) + ((curr & CTX_LETTER) ? 1 : 0)))
+   1 << (((prev & ~CTX_LETTER) ? 0 : 2) + ((curr & ~CTX_LETTER) ? 0 : 1)))
 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
   (MATCHES_NEWLINE_CONTEXT(constraint, prev, curr)                  \
    && MATCHES_LETTER_CONTEXT(constraint, prev, curr))
@@ -1976,7 +1980,6 @@ state_index (struct dfa *d, position_set const *s, int 
context)
   int constraint;
   int i, j;
 
-  context &= ~CTX_NONE;
   for (i = 0; i < s->nelem; ++i)
     hash ^= s->elems[i].index + s->elems[i].constraint;
 
@@ -2102,12 +2105,12 @@ epsclosure (position_set *s, struct dfa const *d)
    character included in C.  */
 
 static int
-charclass_context(charclass c)
+charclass_context (charclass c)
 {
   int context = 0;
   unsigned int j;
 
-  if (tstbit(eolbyte, c))
+  if (tstbit (eolbyte, c))
     context |= CTX_NEWLINE;
 
   for (j = 0; j < CHARCLASS_INTS; ++j)
@@ -2121,29 +2124,27 @@ charclass_context(charclass c)
   return context;
 }
 
-/* Returns the subset of POSSIBLE_CONTEXTS on which the position set S
-   depends.  Each context in the set of returned contexts (let's call it
-   SC) may have a different follow set than other contexts in SC, and
-   also different from the follow set of the complement set.  However,
-   all contexts in the complement set will have the same follow set.  */
+/* Returns the contexts on which the position set S depends.  Each context
+   in the set of returned contexts (let's call it SC) may have a different
+   follow set than other contexts in SC, and also different from the
+   follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
+   in the complement set will have the same follow set.  */
 
 static int
-state_separate_contexts(position_set *s, int possible_contexts)
+state_separate_contexts(position_set *s)
 {
-  int separate_context = 0;
+  int separate_contexts = 0;
   unsigned int j;
 
   for (j = 0; j < s->nelem; ++j)
     {
-      if ((possible_contexts & CTX_NEWLINE)
-          && PREV_NEWLINE_DEPENDENT(s->elems[j].constraint))
-        separate_context |= CTX_NEWLINE;
-      if ((possible_contexts & CTX_LETTER)
-          && PREV_LETTER_DEPENDENT(s->elems[j].constraint))
-        separate_context |= CTX_LETTER;
+      if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
+        separate_contexts |= CTX_NEWLINE;
+      if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
+        separate_contexts |= CTX_LETTER;
     }
 
-  return separate_context;
+  return separate_contexts;
 }
 
 
@@ -2404,8 +2405,11 @@ dfaanalyze (struct dfa *d, int searchflag)
   d->sindex = 0;
   MALLOC(d->states, d->salloc);
 
-  separate_contexts = state_separate_contexts(&merged, CTX_NEWLINE);
-  state_index(d, &merged, separate_contexts);
+  separate_contexts = state_separate_contexts (&merged);
+  state_index(d, &merged,
+              (separate_contexts & CTX_NEWLINE
+               ? CTX_NEWLINE
+               : separate_contexts ^ CTX_ANY));
 
   free(o_nullable);
   free(o_nfirst);
@@ -2502,20 +2506,18 @@ dfastate (int s, struct dfa *d, int trans[])
       if (pos.constraint != 0xFF)
         {
           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
-                                        d->states[s].context & CTX_NEWLINE,
-                                        CTX_NEWLINE))
+                                        d->states[s].context, CTX_NEWLINE))
             clrbit(eolbyte, matches);
           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
-                                        d->states[s].context & CTX_NEWLINE, 0))
+                                        d->states[s].context, ~CTX_NEWLINE))
             for (j = 0; j < CHARCLASS_INTS; ++j)
               matches[j] &= newline[j];
           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
-                                       d->states[s].context & CTX_LETTER,
-                                       CTX_LETTER))
+                                       d->states[s].context, CTX_LETTER))
             for (j = 0; j < CHARCLASS_INTS; ++j)
               matches[j] &= ~letters[j];
           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
-                                       d->states[s].context & CTX_LETTER, 0))
+                                       d->states[s].context, ~CTX_LETTER))
             for (j = 0; j < CHARCLASS_INTS; ++j)
               matches[j] &= letters[j];
 
@@ -2532,7 +2534,7 @@ dfastate (int s, struct dfa *d, int trans[])
              group's label doesn't contain that character, go on to the
              next group. */
           if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
-              && !tstbit(d->tokens[pos.index], labels[j]))
+              && !tstbit (d->tokens[pos.index], labels[j]))
             continue;
 
           /* Check if this group's label has a nonempty intersection with
@@ -2598,15 +2600,15 @@ dfastate (int s, struct dfa *d, int trans[])
   if (d->searchflag)
     {
       /* Find the state(s) corresponding to the positions of state 0. */
-      copy(&d->states[0].elems, &follows);
-      separate_contexts = state_separate_contexts(&follows, CTX_ANY);
-      state = state_index(d, &follows, 0);
+      copy (&d->states[0].elems, &follows);
+      separate_contexts = state_separate_contexts (&follows);
+      state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
       if (separate_contexts & CTX_NEWLINE)
-        state_newline = state_index(d, &follows, CTX_NEWLINE);
+        state_newline = state_index (d, &follows, CTX_NEWLINE);
       else
         state_newline = state;
       if (separate_contexts & CTX_LETTER)
-        state_letter = state_index(d, &follows, CTX_LETTER);
+        state_letter = state_index (d, &follows, CTX_LETTER);
       else
         state_letter = state;
 
@@ -2668,17 +2670,20 @@ dfastate (int s, struct dfa *d, int trans[])
           insert(d->states[0].elems.elems[j], &follows);
 
       /* Find out if the new state will want any context information. */
-      possible_contexts = charclass_context(labels[i]);
-      separate_contexts = state_separate_contexts(&follows, possible_contexts);
+      possible_contexts = charclass_context (labels[i]);
+      separate_contexts = state_separate_contexts (&follows);
 
       /* Find the state(s) corresponding to the union of the follows. */
-      state = state_index(d, &follows, 0);
-      if (separate_contexts & CTX_NEWLINE)
-        state_newline = state_index(d, &follows, CTX_NEWLINE);
+      if ((separate_contexts & possible_contexts) != possible_contexts)
+        state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
+      else
+        state = -1;
+      if (separate_contexts & possible_contexts & CTX_NEWLINE)
+        state_newline = state_index (d, &follows, CTX_NEWLINE);
       else
         state_newline = state;
-      if (separate_contexts & CTX_LETTER)
-        state_letter = state_index(d, &follows, CTX_LETTER);
+      if (separate_contexts & possible_contexts & CTX_LETTER)
+        state_letter = state_index (d, &follows, CTX_LETTER);
       else
         state_letter = state;
 
@@ -2975,7 +2980,7 @@ match_mb_charset (struct dfa *d, int s, position pos, int 
idx)
 
   /* Match in range 0-255?  */
   if (wc < NOTCHAR && work_mbc->cset != -1
-      && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
+      && tstbit ((unsigned char)wc, d->charclasses[work_mbc->cset]))
     goto charset_matched;
 
   /* match with a character class?  */
-- 
1.7.7.6





reply via email to

[Prev in Thread] Current Thread [Next in Thread]