[PATCH 4/9] dfa: speed up handling of brackets

bug-grep
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 4/9] dfa: speed up handling of brackets

From:	Paolo Bonzini
Subject:	[PATCH 4/9] dfa: speed up handling of brackets
Date:	Sun, 14 Mar 2010 16:35:09 +0100
This patch does two things.  The first is to merge the parsing of bracket
expressions for the single-byte and multibyte cases into one routine.  The
second is to build on that change and use a bitset to test for single-byte
characters in the charset.  Splitting the two into separate patches would
be very hard.

Testcase:
   yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
     time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]

Before: 59ms (best of three runs); after: 51ms (best of three runs).
Nice, but mostly providing infrastructure for the next patch.

* src/dfa.c (setbit_case_fold): Try applying towlower/towupper.
(looking_at): Remove.
(FETCH_WC): New.
(fetch_wc): Merge into FETCH_WC [MBS_SUPPORT].
(FETCH) [MBS_SUPPORT]: Call FETCH_WC.
(prednames, find_pred, is_blank and other predicates): Move above.
(parse_bracket_exp): New name of parse_bracket_exp_mb, rewritten to
include single-byte character set parsing of brackets.
(lex): Adjust for fetch_wc->FETCH_WC change, remove single-byte
character set parsing of brackets.
(match_mb_charset): Test against work_mbc->cset.
* src/dfa.h (struct mb_char_classes): Add cset.
---
 .x-sc_space_tab |    1 -
 src/dfa.c       |  628 ++++++++++++++++++++++++++++---------------------------
 src/dfa.h       |    1 +
 3 files changed, 318 insertions(+), 312 deletions(-)

diff --git a/.x-sc_space_tab b/.x-sc_space_tab
index be9bbd8..efa4369 100644
--- a/.x-sc_space_tab
+++ b/.x-sc_space_tab
@@ -1,2 +1 @@
 \.diff$
-^src/dfa\.c$
diff --git a/src/dfa.c b/src/dfa.c
index 3191c6f..ed4e1ae 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -236,17 +236,40 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
   eolbyte = eol;
 }
 
-/* Like setbit, but if case is folded, set both cases of a letter.  */
+/* Like setbit, but if case is folded, set both cases of a letter.
+   For MB_CUR_MAX > 1, one or both of the two cases may not be set,
+   so the resulting charset may only be used as an optimization.  */
 static void
 setbit_case_fold (unsigned b, charclass c)
 {
-  setbit (b, c);
   if (case_fold)
     {
-      if (ISUPPER (b))
-       setbit (tolower (b), c);
-      else if (ISLOWER (b))
-       setbit (toupper (b), c);
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1)
+        {
+          wint_t b1 = iswupper(b) ? towlower(b) : b;
+          wint_t b2 = iswlower(b) ? towupper(b) : b;
+          if (wctob ((unsigned char)b1) == b1)
+            setbit (b1, c);
+          if (b2 != b1 && wctob ((unsigned char)b2) == b2)
+            setbit (b2, c);
+        }
+      else
+        {
+#endif
+          unsigned char b1 = ISUPPER(b) ? tolower(b) : b;
+          unsigned char b2 = ISLOWER(b) ? toupper(b) : b;
+         setbit (b1, c);
+          if (b2 != b1)
+            setbit (b2, c);
+        }
+    }
+  else
+    {
+#ifdef MBS_SUPPORT
+      if (wctob ((unsigned char)b) == b)
+#endif
+        setbit (b, c);
     }
 }
 
@@ -293,57 +316,57 @@ static unsigned char const *buf_end;      /* reference to end in dfaexec().  */
 
 #ifdef MBS_SUPPORT
 /* Note that characters become unsigned here. */
-# define FETCH(c, eoferr)                      \
+# define FETCH_WC(c, wc, eoferr)               \
   do {                                         \
     if (! lexleft)                             \
-     {                                         \
-       if (eoferr != 0)                        \
+      {                                                \
+        if (eoferr != 0)                       \
          dfaerror (eoferr);                    \
-       else                                    \
+        else                                   \
          return lasttok = END;                 \
       }                                                \
-    (c) = (unsigned char) *lexptr++;           \
-    --lexleft;                                 \
+    else                                       \
+      {                                                \
+        cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); \
+        if (cur_mb_len <= 0)                   \
+          {                                    \
+            cur_mb_len = 1;                    \
+            --lexleft;                         \
+            wc = c = (unsigned char) *lexptr++;        \
+          }                                    \
+        else                                   \
+          {                                    \
+            lexptr += cur_mb_len;              \
+            lexleft -= cur_mb_len;             \
+            (c) = wctob(wc);                   \
+          }                                    \
+      }                                                \
   } while(0)
 
-/* This function fetch a wide character, and update cur_mb_len,
-   used only if the current locale is a multibyte environment.  */
-static wint_t
-fetch_wc (char const *eoferr)
-{
-  wchar_t wc;
-  if (! lexleft)
-    {
-      if (eoferr != 0)
-       dfaerror (eoferr);
-      else
-       return WEOF;
-    }
+# define FETCH(c, eoferr)                      \
+  do {                                         \
+    wint_t _wc;                                        \
+    FETCH_WC(c, _wc, eoferr);                  \
+  } while(0)
 
-  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
-  if (cur_mb_len <= 0)
-   {
-      cur_mb_len = 1;
-      wc = (unsigned char) *lexptr;
-    }
-  lexptr += cur_mb_len;
-  lexleft -= cur_mb_len;
-  return wc;
-}
 #else
 /* Note that characters become unsigned here. */
-# define FETCH(c, eoferr)            \
-  do {                               \
-    if (! lexleft)                   \
+# define FETCH(c, eoferr)            \
+  do {                               \
+    if (! lexleft)                   \
       {                                      \
        if (eoferr != 0)              \
          dfaerror (eoferr);          \
-       else                          \
+       else                          \
          return lasttok = END;       \
       }                                      \
     (c) = (unsigned char) *lexptr++;  \
-    --lexleft;                       \
+    --lexleft;                       \
   } while(0)
+
+# define FETCH_WC(c, unused, eoferr)           \
+  FETCH(c, eoferr)
+
 #endif /* MBS_SUPPORT */
 
 static int
@@ -353,13 +376,76 @@ in_coll_range (char ch, char from, char to)
   return strcoll (&c[0], &c[2]) <= 0 && 0 <= strcoll (&c[2], &c[4]);
 }
 
-#ifdef MBS_SUPPORT
+#ifdef __STDC__
+#define FUNC(F, P) static int F(int c) { return P(c); }
+#else
+#define FUNC(F, P) static int F(c) int c; { return P(c); }
+#endif
+
+FUNC(is_alpha, ISALPHA)
+FUNC(is_upper, ISUPPER)
+FUNC(is_lower, ISLOWER)
+FUNC(is_digit, ISDIGIT)
+FUNC(is_xdigit, ISXDIGIT)
+FUNC(is_space, ISSPACE)
+FUNC(is_punct, ISPUNCT)
+FUNC(is_alnum, ISALNUM)
+FUNC(is_print, ISPRINT)
+FUNC(is_graph, ISGRAPH)
+FUNC(is_cntrl, ISCNTRL)
+
+static int
+is_blank (int c)
+{
+   return (c == ' ' || c == '\t');
+}
+
+typedef int predicate (int);
+
+/* The following list maps the names of the Posix named character classes
+   to predicate functions that determine whether a given character is in
+   the class.  The leading [ has already been eaten by the lexical analyzer. */
+static struct {
+  const char *name;
+  predicate *pred;
+} const prednames[] = {
+  { "alpha", is_alpha },
+  { "upper", is_upper },
+  { "lower", is_lower },
+  { "digit", is_digit },
+  { "xdigit", is_xdigit },
+  { "space", is_space },
+  { "punct", is_punct },
+  { "alnum", is_alnum },
+  { "print", is_print },
+  { "graph", is_graph },
+  { "cntrl", is_cntrl },
+  { "blank", is_blank },
+  { 0, 0 }
+};
+
+static predicate *
+find_pred (const char *str)
+{
+  int i;
+  for (i = 0; prednames[i].name; ++i)
+    if (!strcmp(str, prednames[i].name))
+      break;
+
+  return prednames[i].pred;
+}
+
 /* Multibyte character handling sub-routine for lex.
    This function  parse a bracket expression and build a struct
    mb_char_classes.  */
 static token
-parse_bracket_exp_mb (void)
+parse_bracket_exp (void)
 {
+  int invert;
+  int c, c1, c2;
+  charclass ccl;
+
+#ifdef MBS_SUPPORT
   wint_t wc, wc1, wc2;
 
   /* Work area to build a mb_char_classes.  */
@@ -367,63 +453,68 @@ parse_bracket_exp_mb (void)
   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
     equivs_al, coll_elems_al;
 
-  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
-                      dfa->mbcsets_alloc, dfa->nmbcsets + 1);
-  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
-     We will update dfa->multibyte_prop[] in addtok(), because we can't
-     decide the index in dfa->tokens[].  */
-
-  /* Initialize work are */
-  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
-
   chars_al = 1;
   range_sts_al = range_ends_al = 0;
   ch_classes_al = equivs_al = coll_elems_al = 0;
+  if (MB_CUR_MAX > 1)
+    {
+      REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
+                           dfa->mbcsets_alloc, dfa->nmbcsets + 1);
+
+      /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
+         We will update dfa->multibyte_prop[] in addtok(), because we can't
+         decide the index in dfa->tokens[].  */
+
+      /* Initialize work area.  */
+      work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
+      work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
+      work_mbc->nequivs = work_mbc->ncoll_elems = 0;
+      work_mbc->chars = NULL;
+      work_mbc->ch_classes = NULL;
+      work_mbc->range_sts = work_mbc->range_ends = NULL;
+      work_mbc->equivs = work_mbc->coll_elems = NULL;
+    }
+  else
+    work_mbc = NULL;
+#endif
 
-  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
-  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
-  work_mbc->chars = NULL;
-  work_mbc->ch_classes = NULL;
-  work_mbc->range_sts = work_mbc->range_ends = NULL;
-  work_mbc->equivs = work_mbc->coll_elems = NULL;
-
-  wc = fetch_wc(_("unbalanced ["));
-  if (wc == L'^')
+  memset (ccl, 0, sizeof(ccl));
+  FETCH_WC (c, wc, _("unbalanced ["));
+  if (c == '^')
     {
-      wc = fetch_wc(_("unbalanced ["));
-      work_mbc->invert = 1;
+      FETCH_WC (c, wc, _("unbalanced ["));
+      invert = 1;
     }
   else
-    work_mbc->invert = 0;
+    invert = 0;
+
   do
     {
-      wc1 = WEOF; /* mark wc1 is not initialized".  */
+      c1 = EOF; /* mark c1 is not initialized".  */
 
       /* Note that if we're looking at some other [:...:] construct,
         we just treat it as a bunch of ordinary characters.  We can do
         this because we assume regex has checked for syntax errors before
         dfa is ever called. */
-      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
+      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
        {
 #define BRACKET_BUFFER_SIZE 128
          char str[BRACKET_BUFFER_SIZE];
-         wc1 = wc;
-         wc = fetch_wc(_("unbalanced ["));
+         FETCH_WC (c1, wc1, _("unbalanced ["));
 
          /* If pattern contains `[[:', `[[.', or `[[='.  */
-         if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
+         if (c1 == ':'
+#ifdef MBS_SUPPORT
+              /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1.  */
+             || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
+#endif
+             )
            {
-             unsigned char c;
-             unsigned char delim = (unsigned char)wc;
              int len = 0;
              for (;;)
                {
-                 if (! lexleft)
-                   dfaerror (_("unbalanced ["));
-                 c = (unsigned char) *lexptr++;
-                 --lexleft;
-
-                 if ((c == delim && *lexptr == ']') || lexleft == 0)
+                 FETCH (c, _("unbalanced ["));
+                 if ((c == c1 && *lexptr == ']') || lexleft == 0)
                    break;
                  if (len < BRACKET_BUFFER_SIZE)
                    str[len++] = c;
@@ -433,18 +524,9 @@ parse_bracket_exp_mb (void)
                }
              str[len] = '\0';
 
-             if (lexleft == 0)
-               {
-                 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
-                                      work_mbc->nchars + 2);
-                 work_mbc->chars[work_mbc->nchars++] = L'[';
-                 work_mbc->chars[work_mbc->nchars++] = delim;
-                 break;
-               }
-
-             if (--lexleft, *lexptr++ != ']')
-               dfaerror (_("unbalanced ["));
-             if (delim == ':')
+              /* Fetch bracket.  */
+             FETCH (c, _("unbalanced ["));
+             if (c1 == ':')
                /* build character class.  */
                {
                  char const *class
@@ -452,24 +534,39 @@ parse_bracket_exp_mb (void)
                                     || !strcmp (str, "lower"))
                                       ? "alpha"
                                       : str);
-                 /* Query the character class as wctype_t.  */
-                 wctype_t wt = wctype (class);
-
-                 if (ch_classes_al == 0)
-                   MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
-                 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
-                                      ch_classes_al,
-                                      work_mbc->nch_classes + 1);
-                 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
-
-               }
-             else if (delim == '=' || delim == '.')
+#ifdef MBS_SUPPORT
+                  if (MB_CUR_MAX > 1)
+                    {
+                     /* Store the character class as wctype_t.  */
+                      wctype_t wt = wctype (class);
+
+                      if (ch_classes_al == 0)
+                        MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
+                      REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
+                                           ch_classes_al,
+                                           work_mbc->nch_classes + 1);
+                      work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
+                    }
+#endif
+
+                  {
+                    predicate *pred = find_pred (class);
+                    if (!pred)
+                      dfaerror(_("invalid character class"));
+                    for (c2 = 0; c2 < NOTCHAR; ++c2)
+                      if ((*pred)(c2))
+                        setbit_case_fold (c2, ccl);
+                  }
+                }
+
+#ifdef MBS_SUPPORT
+             else if (c1 == '=' || c1 == '.')
                {
                  char *elem;
                  MALLOC(elem, char, len + 1);
                  strncpy(elem, str, len + 1);
 
-                 if (delim == '=')
+                 if (c1 == '=')
                    /* build equivalent class.  */
                    {
                      if (equivs_al == 0)
@@ -480,7 +577,7 @@ parse_bracket_exp_mb (void)
                      work_mbc->equivs[work_mbc->nequivs++] = elem;
                    }
 
-                 if (delim == '.')
+                 if (c1 == '.')
                    /* build collating element.  */
                    {
                      if (coll_elems_al == 0)
@@ -490,157 +587,157 @@ parse_bracket_exp_mb (void)
                                           work_mbc->ncoll_elems + 1);
                      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
                    }
-               }
-             wc1 = wc = WEOF;
-           }
-         else
-           /* We treat '[' as a normal character here.  */
-           {
-             wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
+               }
+#endif
+
+              /* Fetch new lookahead character.  */
+             FETCH_WC (c1, wc1, _("unbalanced ["));
+              continue;
            }
+
+          /* We treat '[' as a normal character here.  c/c1/wc/wc1
+             are already set up.  */
        }
-      else
-       {
-         if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-           wc = fetch_wc(("unbalanced ["));
-       }
 
-      if (wc1 == WEOF)
-       wc1 = fetch_wc(_("unbalanced ["));
+      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+        FETCH_WC(c, wc, _("unbalanced ["));
+
+      if (c1 == EOF)
+       FETCH_WC(c1, wc1, _("unbalanced ["));
 
-      if (wc1 == L'-')
+      if (c1 == '-')
        /* build range characters.  */
        {
-         wc2 = fetch_wc(_("unbalanced ["));
-         if (wc2 == L']')
+         FETCH_WC(c2, wc2, _("unbalanced ["));
+         if (c2 == ']')
            {
              /* In the case [x-], the - is an ordinary hyphen,
                 which is left in c1, the lookahead character. */
              lexptr -= cur_mb_len;
              lexleft += cur_mb_len;
-             wc2 = wc;
-           }
-         else
-           {
-             if (wc2 == L'\\'
-                 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-               wc2 = fetch_wc(_("unbalanced ["));
-             wc1 = fetch_wc(_("unbalanced ["));
-           }
+            }
+        }
 
-         if (range_sts_al == 0)
-           {
-             MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
-             MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
-           }
-         REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
-                              range_sts_al, work_mbc->nranges + 1);
-         REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
-                              range_ends_al, work_mbc->nranges + 1);
-         work_mbc->range_sts[work_mbc->nranges] = 
-            case_fold ? towlower(wc) : (wchar_t)wc;
-         work_mbc->range_ends[work_mbc->nranges++] = 
-            case_fold ? towlower(wc2) : (wchar_t)wc2;
+      if (c1 == '-' && c2 != ']')
+        {
+          if (c2 == '\\'
+              && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+            FETCH_WC(c2, wc2, _("unbalanced ["));
 
-#ifndef GREP
-         if (case_fold)
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1)
             {
+             if (range_sts_al == 0)
+                {
+                  MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
+                  MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
+                }
               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
                                    range_sts_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
                                    range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+              work_mbc->range_sts[work_mbc->nranges] = 
+                case_fold ? towlower(wc) : (wchar_t)wc;
+              work_mbc->range_ends[work_mbc->nranges++] = 
+                case_fold ? towlower(wc2) : (wchar_t)wc2;
+
+#ifndef GREP
+              if (case_fold)
+                {
+                  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                                       range_sts_al, work_mbc->nranges + 1);
+                  work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
+                  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                                       range_ends_al, work_mbc->nranges + 1);
+                  work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+                }
+#endif
             }
+          else
 #endif
+            {
+              c1 = c;
+              if (case_fold)
+                {
+                  c1 = tolower (c1);
+                  c2 = tolower (c2);
+                }
+              if (!hard_LC_COLLATE)
+                for (c = c1; c <= c2; c++)
+                  setbit_case_fold (c, ccl);
+              else
+                for (c = 0; c < NOTCHAR; ++c)
+                  if (!(case_fold && ISUPPER (c))
+                      && in_coll_range (c, c1, c2))
+                    setbit_case_fold (c, ccl);
+            }
+
+          FETCH_WC(c1, wc1, _("unbalanced ["));
+         continue;
        }
-      else if (wc != WEOF)
-       /* build normal characters.  */
-       {
-         REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
-                              work_mbc->nchars + 1);
-         work_mbc->chars[work_mbc->nchars++] =
-               (wchar_t) (case_fold ? towlower(wc) : wc);
-#ifndef GREP
-         if (case_fold)
+
+      setbit_case_fold (c, ccl);
+#ifdef MBS_SUPPORT
+      /* Build normal characters.  */
+      if (MB_CUR_MAX > 1)
+        {
+          if (case_fold && iswalpha(wc))
+            {
+              wc = towlower(wc);
+              c = wctob(wc);
+              if (c == EOF || (wint_t)c == (wint_t)wc)
+                {
+                  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+                                       work_mbc->nchars + 1);
+                  work_mbc->chars[work_mbc->nchars++] = wc;
+                }
+#ifdef GREP
+             continue;
+#else
+              wc = towupper(wc);
+              c = wctob(wc);
+#endif
+            }
+
+          if (c == EOF || (wint_t)c == (wint_t)wc)
             {
               REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = towupper(wc);
+              work_mbc->chars[work_mbc->nchars++] = wc;
             }
 #endif
         }
     }
-  while ((wc = wc1) != L']');
-  return MBCSET;
-}
-#endif /* MBS_SUPPORT */
+  while ((wc = wc1, (c = c1) != L']'));
 
-#ifdef __STDC__
-#define FUNC(F, P) static int F(int c) { return P(c); }
-#else
-#define FUNC(F, P) static int F(c) int c; { return P(c); }
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      static charclass zeroclass;
+      work_mbc->invert = invert;
+      work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
+      return MBCSET;
+    }
 #endif
 
-FUNC(is_alpha, ISALPHA)
-FUNC(is_upper, ISUPPER)
-FUNC(is_lower, ISLOWER)
-FUNC(is_digit, ISDIGIT)
-FUNC(is_xdigit, ISXDIGIT)
-FUNC(is_space, ISSPACE)
-FUNC(is_punct, ISPUNCT)
-FUNC(is_alnum, ISALNUM)
-FUNC(is_print, ISPRINT)
-FUNC(is_graph, ISGRAPH)
-FUNC(is_cntrl, ISCNTRL)
+  if (invert)
+    {
+      notset(ccl);
+      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
+        clrbit(eolbyte, ccl);
+    }
 
-static int
-is_blank (int c)
-{
-   return (c == ' ' || c == '\t');
+  return CSET + charclass_index(ccl);
 }
 
-/* The following list maps the names of the Posix named character classes
-   to predicate functions that determine whether a given character is in
-   the class.  The leading [ has already been eaten by the lexical analyzer. */
-static struct {
-  const char *name;
-  int (*pred) (int);
-} const prednames[] = {
-  { ":alpha:]", is_alpha },
-  { ":upper:]", is_upper },
-  { ":lower:]", is_lower },
-  { ":digit:]", is_digit },
-  { ":xdigit:]", is_xdigit },
-  { ":space:]", is_space },
-  { ":punct:]", is_punct },
-  { ":alnum:]", is_alnum },
-  { ":print:]", is_print },
-  { ":graph:]", is_graph },
-  { ":cntrl:]", is_cntrl },
-  { ":blank:]", is_blank },
-  { 0, 0 }
-};
-
 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
 
-static int
-looking_at (char const *s)
-{
-  size_t len;
-
-  len = strlen(s);
-  if (lexleft < len)
-    return 0;
-  return strncmp(s, lexptr, len) == 0;
-}
-
 static token
 lex (void)
 {
-  unsigned c, c1, c2;
-  int backslash = 0, invert;
+  unsigned c, c2;
+  int backslash = 0;
   charclass ccl;
   int i;
 
@@ -655,10 +752,7 @@ lex (void)
 #ifdef MBS_SUPPORT
       if (MB_CUR_MAX > 1)
         {
-          wint_t wi = fetch_wc (NULL);
-          if (wi == WEOF)
-            return lasttok = EOF;
-          wctok = wi, c = wctob (wi);
+          FETCH_WC (c, wctok, NULL);
           if ((int)c == EOF)
             goto normal_char;
         }
@@ -939,100 +1033,7 @@ lex (void)
          if (backslash)
            goto normal_char;
          laststart = 0;
-#ifdef MBS_SUPPORT
-         if (MB_CUR_MAX > 1)
-           {
-             /* In multibyte environment a bracket expression may contain
-                multibyte characters, which must be treated as characters
-                (not bytes).  So we parse it by parse_bracket_exp_mb().  */
-             return lasttok = parse_bracket_exp_mb();
-           }
-#endif
-         zeroset(ccl);
-         FETCH(c, _("unbalanced ["));
-         if (c == '^')
-           {
-             FETCH(c, _("unbalanced ["));
-             invert = 1;
-           }
-         else
-           invert = 0;
-         do
-           {
-             /* Nobody ever said this had to be fast. :-)
-                Note that if we're looking at some other [:...:]
-                construct, we just treat it as a bunch of ordinary
-                characters.  We can do this because we assume
-                regex has checked for syntax errors before
-                dfa is ever called. */
-             if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
-               for (c1 = 0; prednames[c1].name; ++c1)
-                 if (looking_at(prednames[c1].name))
-                   {
-                     int (*pred) (int) = prednames[c1].pred;
-
-                     for (c2 = 0; c2 < NOTCHAR; ++c2)
-                       if ((*pred)(c2))
-                         setbit_case_fold (c2, ccl);
-                     lexptr += strlen(prednames[c1].name);
-                     lexleft -= strlen(prednames[c1].name);
-                     FETCH(c1, _("unbalanced ["));
-                     goto skip;
-                   }
-             if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-               FETCH(c, _("unbalanced ["));
-             FETCH(c1, _("unbalanced ["));
-             if (c1 == '-')
-               {
-                 FETCH(c2, _("unbalanced ["));
-                 if (c2 == ']')
-                   {
-                     /* In the case [x-], the - is an ordinary hyphen,
-                        which is left in c1, the lookahead character. */
-                     --lexptr;
-                     ++lexleft;
-                   }
-                 else
-                   {
-                     if (c2 == '\\'
-                         && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-                       FETCH(c2, _("unbalanced ["));
-
-                      c1 = c;
-                     if (!hard_LC_COLLATE)
-                       for (c = c1; c <= c2; c++)
-                         setbit_case_fold (c, ccl);
-                     else
-                        {
-                          if (case_fold)
-                            {
-                              c1 = tolower (c1);
-                              c2 = tolower (c2);
-                            }
-                          for (c = 0; c < NOTCHAR; ++c)
-                            if (!(case_fold && ISUPPER (c))
-                                && in_coll_range (c, c1, c2))
-                              setbit_case_fold (c, ccl);
-                        }
-
-                     FETCH(c1, _("unbalanced ["));
-                     continue;
-                   }
-               }
-
-             setbit_case_fold (c, ccl);
-
-           skip:
-             ;
-           }
-         while ((c = c1) != ']');
-         if (invert)
-           {
-             notset(ccl);
-             if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
-               clrbit(eolbyte, ccl);
-           }
-         return lasttok = CSET + charclass_index(ccl);
+         return lasttok = parse_bracket_exp();
 
        default:
        normal_char:
@@ -2473,6 +2474,11 @@ match_mb_charset (struct dfa *d, int s, position pos, int idx)
   match = !work_mbc->invert;
   match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
 
+  /* Match in range 0-255?  */
+  if (wc < NOTCHAR && work_mbc->cset != -1
+      && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
+    goto charset_matched;
+
   /* match with a character class?  */
   for (i = 0; i<work_mbc->nch_classes; i++)
     {
diff --git a/src/dfa.h b/src/dfa.h
index 4ca55f0..b8eb0c2 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -243,6 +243,7 @@ struct dfamust
    e.g. [a-c], [[:alpha:]], etc.  */
 struct mb_char_classes
 {
+  int cset;
   int invert;
   wchar_t *chars;              /* Normal characters.  */
   int nchars;
-- 
1.6.6.1
[Prev in Thread]
Current Thread
[Next in Thread]
Re: [PATCH 1/9] tests: add more UTF-8 test cases, (continued)
- [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, Jim Meyering, 2010/03/15
    - Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, Paolo Bonzini, 2010/03/15
    - Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, Jim Meyering, 2010/03/15
    - Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, Paolo Bonzini, 2010/03/15
    - Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, Jim Meyering, 2010/03/15
- [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Jim Meyering, 2010/03/16
    - Re: [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Paolo Bonzini, 2010/03/17
- [PATCH 4/9] dfa: speed up handling of brackets, Paolo Bonzini <=
  - Re: [PATCH 4/9] dfa: speed up handling of brackets, Jim Meyering, 2010/03/17
    - Re: [PATCH 4/9] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/17
    - Re: [PATCH 4/9] dfa: speed up handling of brackets, Jim Meyering, 2010/03/17
- [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Jim Meyering, 2010/03/17
    - Re: [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini, 2010/03/17
- [PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set, Jim Meyering, 2010/03/15
- [PATCH 6/9] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 6/9] dfa: cache MB_CUR_MAX for dfaexec, Jim Meyering, 2010/03/17
Prev by Date: [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing
Next by Date: [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets
Previous by thread: Re: [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing
Next by thread: Re: [PATCH 4/9] dfa: speed up handling of brackets
Index(es):
- Date
- Thread