bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 4/4] dfa: optimize wide characters in a bracket expression


From: Paolo Bonzini
Subject: [PATCH 4/4] dfa: optimize wide characters in a bracket expression
Date: Tue, 7 Jun 2011 13:03:40 +0200

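* src/dfa.c (addtok): Compile the wide characters of a non-inverted
MBCSET to an alternation of addtok_wc tokens.  Handle the case when
nothing else remains in the MBCSET afterwards.
(addtok_wc): Add forward declaration.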
---
 src/dfa.c |   36 ++++++++++++++++++++++++++++++++++--
 1 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 8fc6ed0..aecaad9 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1449,6 +1449,8 @@ addtok_mb (token t, int mbprop)
     dfa->depth = depth;
 }
 
+static void addtok_wc (wint_t wc);
+
 /* Add the given token to the parse tree, maintaining the depth count and
    updating the maximum depth if necessary. */
 static void
@@ -1457,8 +1459,24 @@ addtok (token t)
 #if MBS_SUPPORT
   if (MB_CUR_MAX > 1 && t == MBCSET)
     {
+      bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
 
+      /* Extract wide characters into alternations if possible (for
+         better performance).  This does not require UTF-8.  */
+      if (!work_mbc->invert)
+        {
+          int i;
+          for (i = 0; i < work_mbc->nchars; i++)
+            {
+              addtok_wc (work_mbc->chars[i]);
+              if (need_or)
+                addtok (OR);
+              need_or = true;
+            }
+          work_mbc->nchars = 0;
+        }
+
       /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET.  */
       if (work_mbc->invert
           || (!using_utf8() && work_mbc->cset != -1)
@@ -1467,9 +1485,23 @@ addtok (token t)
           || work_mbc->nranges != 0
           || work_mbc->nequivs != 0
           || work_mbc->ncoll_elems != 0)
-        addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+        {
+          addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+          if (need_or)
+            addtok (OR);
+        }
       else
-        addtok (CSET + work_mbc->cset);
+        {
+          /* Characters have been handled above, so it is possible
+             that the mbcset is empty now.  Do nothing in that case.  */
+          if (work_mbc->cset != -1)
+            {
+              assert (using_utf8 ());
+              addtok (CSET + work_mbc->cset);
+              if (need_or)
+                addtok (OR);
+            }
+        }
     }
   else
 #endif
-- 
1.7.4.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]