emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/src/regex.c,v


From: Miles Bader
Subject: [Emacs-diffs] Changes to emacs/src/regex.c,v
Date: Fri, 01 Feb 2008 16:02:10 +0000

CVSROOT:        /cvsroot/emacs
Module name:    emacs
Changes by:     Miles Bader <miles>     08/02/01 16:01:31

Index: src/regex.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/regex.c,v
retrieving revision 1.226
retrieving revision 1.227
diff -u -b -r1.226 -r1.227
--- src/regex.c 12 Sep 2007 07:21:32 -0000      1.226
+++ src/regex.c 1 Feb 2008 16:00:49 -0000       1.227
@@ -124,7 +124,7 @@
 # define SYNTAX_ENTRY_VIA_PROPERTY
 
 # include "syntax.h"
-# include "charset.h"
+# include "character.h"
 # include "category.h"
 
 # ifdef malloc
@@ -145,28 +145,51 @@
 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP 
(re_match_object)))
 
 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
-# define RE_STRING_CHAR(p, s) \
+# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
+# define RE_STRING_CHAR(p, s, multibyte) \
   (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
-# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
+# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
   (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
 
-/* Set C a (possibly multibyte) character before P.  P points into a
-   string which is the virtual concatenation of STR1 (which ends at
-   END1) or STR2 (which ends at END2).  */
+# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]
+
+# define RE_CHAR_TO_UNIBYTE(c)                 \
+  (ASCII_CHAR_P (c) ? (c)                      \
+   : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c)      \
+   : multibyte_char_to_unibyte_safe (c))
+
+/* Set C a (possibly converted to multibyte) character before P.  P
+   points into a string which is the virtual concatenation of STR1
+   (which ends at END1) or STR2 (which ends at END2).  */
 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)               \
   do {                                                                 \
-    if (multibyte)                                                     \
+    if (target_multibyte)                                                   \
        {                                                               \
         re_char *dtemp = (p) == (str2) ? (end1) : (p);                 \
         re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
-        re_char *d0 = dtemp;                                           \
-        PREV_CHAR_BOUNDARY (d0, dlimit);                               \
-        c = STRING_CHAR (d0, dtemp - d0);                              \
+       while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));                   \
+       c = STRING_CHAR (dtemp, (p) - dtemp);                                \
        }                                                               \
      else                                                              \
+      {                                                                        
     \
        (c = ((p) == (str2) ? (end1) : (p))[-1]);                       \
+       (c) = RE_CHAR_TO_MULTIBYTE (c);                                      \
+      }                                                                        
     \
   } while (0)
 
+/* Set C a (possibly converted to multibyte) character at P, and set
+   LEN to the byte length of that character.  */
+# define GET_CHAR_AFTER(c, p, len)             \
+  do {                                         \
+    if (target_multibyte)                      \
+      (c) = STRING_CHAR_AND_LENGTH (p, 0, len);        \
+    else                                       \
+      {                                                \
+       (c) = *p;                               \
+       len = 1;                                \
+       (c) = RE_CHAR_TO_MULTIBYTE (c);         \
+      }                                                \
+   } while (0)
 
 #else  /* not emacs */
 
@@ -278,6 +301,7 @@
 # define CHARSET_LEADING_CODE_BASE(c) 0
 # define MAX_MULTIBYTE_LENGTH 1
 # define RE_MULTIBYTE_P(x) 0
+# define RE_TARGET_MULTIBYTE_P(x) 0
 # define WORD_BOUNDARY_P(c1, c2) (0)
 # define CHAR_HEAD_P(p) (1)
 # define SINGLE_BYTE_CHAR_P(c) (1)
@@ -285,13 +309,21 @@
 # define MULTIBYTE_FORM_LENGTH(p, s) (1)
 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
 # define STRING_CHAR(p, s) (*(p))
-# define RE_STRING_CHAR STRING_CHAR
+# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
 # define CHAR_STRING(c, s) (*(s) = (c), 1)
 # define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
-# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
+# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) 
STRING_CHAR_AND_LENGTH ((p), (s), (len))
+# define RE_CHAR_TO_MULTIBYTE(c) (c)
+# define RE_CHAR_TO_UNIBYTE(c) (c)
 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
   (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
+# define GET_CHAR_AFTER(c, p, len)     \
+  (c = *p, len = 1)
 # define MAKE_CHAR(charset, c1, c2) (c1)
+# define BYTE8_TO_CHAR(c) (c)
+# define CHAR_BYTE8_P(c) (0)
+# define CHAR_LEADING_CODE(c) (c)
+
 #endif /* not emacs */
 
 #ifndef RE_TRANSLATE
@@ -497,7 +529,7 @@
 #  ifdef __GNUC__
 #   define alloca __builtin_alloca
 #  else /* not __GNUC__ */
-#   if HAVE_ALLOCA_H
+#   ifdef HAVE_ALLOCA_H
 #    include <alloca.h>
 #   endif /* HAVE_ALLOCA_H */
 #  endif /* not __GNUC__ */
@@ -1731,7 +1763,7 @@
   do {                                                                 \
     int len;                                                           \
     if (p == pend) return REG_EEND;                                    \
-    c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len);                  \
+    c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte);       \
     p += len;                                                          \
   } while (0)
 
@@ -1942,10 +1974,10 @@
 
 #define EXTEND_RANGE_TABLE(work_area, n)                               \
   do {                                                                 \
-    if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
+    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
       {                                                                        
\
-        extend_range_table_work_area (work_area);                      \
-        if ((work_area)->table == 0)                                   \
+        extend_range_table_work_area (&work_area);                     \
+        if ((work_area).table == 0)                                    \
           return (REG_ESPACE);                                         \
       }                                                                        
\
   } while (0)
@@ -1962,15 +1994,12 @@
 #define BIT_UPPER      0x10
 #define BIT_MULTIBYTE  0x20
 
-/* Set a range START..END to WORK_AREA.
-   The range is passed through TRANSLATE, so START and END
-   should be untranslated.  */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end)               \
+/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)   \
   do {                                                                 \
-    int tem;                                                           \
-    tem = set_image_of_range (&work_area, start, end, translate);      \
-    if (tem > 0)                                                       \
-      FREE_STACK_RETURN (tem);                                         \
+    EXTEND_RANGE_TABLE ((work_area), 2);                               \
+    (work_area).table[(work_area).used++] = (range_start);             \
+    (work_area).table[(work_area).used++] = (range_end);               \
   } while (0)
 
 /* Free allocated memory for WORK_AREA.  */
@@ -1990,6 +2019,113 @@
 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
 
 
+#ifdef emacs
+
+/* Store characters in the range FROM to TO in the bitmap at B (for
+   ASCII and unibyte characters) and WORK_AREA (for multibyte
+   characters) while translating them and paying attention to the
+   continuity of translated characters.
+
+   Implementation note: It is better to implement these fairly big
+   macros by a function, but it's not that easy because macros called
+   in this macro assume various local variables already declared.  */
+
+/* Both FROM and TO are ASCII characters.  */
+
+#define SETUP_ASCII_RANGE(work_area, FROM, TO)                 \
+  do {                                                         \
+    int C0, C1;                                                        \
+                                                               \
+    for (C0 = (FROM); C0 <= (TO); C0++)                                \
+      {                                                                \
+       C1 = TRANSLATE (C0);                                    \
+       if (! ASCII_CHAR_P (C1))                                \
+         {                                                     \
+           SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);    \
+           if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0)             \
+             C1 = C0;                                          \
+         }                                                     \
+       SET_LIST_BIT (C1);                                      \
+      }                                                                \
+  } while (0)
+
+
+/* Both FROM and TO are unibyte characters (0x80..0xFF).  */
+
+#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                              \
+  do {                                                                        \
+    int C0, C1, C2, I;                                                        \
+    int USED = RANGE_TABLE_WORK_USED (work_area);                             \
+                                                                              \
+    for (C0 = (FROM); C0 <= (TO); C0++)                                        
       \
+      {                                                                        
       \
+       C1 = RE_CHAR_TO_MULTIBYTE (C0);                                        \
+       if (CHAR_BYTE8_P (C1))                                                 \
+         SET_LIST_BIT (C0);                                                   \
+       else                                                                   \
+         {                                                                    \
+           C2 = TRANSLATE (C1);                                               \
+           if (C2 == C1                                                       \
+               || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)                         \
+             C1 = C0;                                                         \
+           SET_LIST_BIT (C1);                                                 \
+           for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
+             {                                                                \
+               int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
+               int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
+                                                                              \
+               if (C2 >= from - 1 && C2 <= to + 1)                            \
+                 {                                                            \
+                   if (C2 == from - 1)                                        \
+                     RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
+                   else if (C2 == to + 1)                                     \
+                     RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
+                   break;                                                     \
+                 }                                                            \
+             }                                                                \
+           if (I < USED)                                                      \
+             SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);                 \
+         }                                                                    \
+      }                                                                        
       \
+  } while (0)
+
+
+/* Both FROM and TO are mulitbyte characters.  */
+
+#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                        \
+  do {                                                                    \
+    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);          \
+                                                                          \
+    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));                \
+    for (C0 = (FROM); C0 <= (TO); C0++)                                        
   \
+      {                                                                        
   \
+       C1 = TRANSLATE (C0);                                               \
+       if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0                            \
+           || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))          \
+         SET_LIST_BIT (C2);                                               \
+       if (C1 >= (FROM) && C1 <= (TO))                                    \
+         continue;                                                        \
+       for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
+         {                                                                \
+           int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
+           int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
+                                                                          \
+           if (C1 >= from - 1 && C1 <= to + 1)                            \
+             {                                                            \
+               if (C1 == from - 1)                                        \
+                 RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
+               else if (C1 == to + 1)                                     \
+                 RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
+               break;                                                     \
+             }                                                            \
+         }                                                                \
+       if (I < USED)                                                      \
+         SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);                 \
+      }                                                                        
   \
+  } while (0)
+
+#endif /* emacs */
+
 /* Get the next unsigned number in the uncompiled pattern.  */
 #define GET_UNSIGNED_NUMBER(num)                                       \
   do {                                                                 \
@@ -2113,6 +2249,7 @@
       = (int *) malloc (work_area->allocated);
 }
 
+#if 0
 #ifdef emacs
 
 /* Carefully find the ranges of codes that are equivalent
@@ -2345,6 +2482,7 @@
 
   return -1;
 }
+#endif /* 0 */
 
 #ifndef MATCH_MAY_ALLOCATE
 
@@ -2483,6 +2621,9 @@
   /* If the object matched can contain multibyte characters.  */
   const boolean multibyte = RE_MULTIBYTE_P (bufp);
 
+  /* If a target of matching can contain multibyte characters.  */
+  const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
+
   /* Nonzero if we have pushed down into a subpattern.  */
   int in_subpattern = 0;
 
@@ -2835,6 +2976,7 @@
              {
                boolean escaped_char = false;
                const unsigned char *p2 = p;
+               re_wchar_t ch, c2;
 
                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
@@ -2861,10 +3003,6 @@
                      break;
                  }
 
-               /* What should we do for the character which is
-                  greater than 0x7F, but not BASE_LEADING_CODE_P?
-                  XXX */
-
                /* See if we're at the beginning of a possible character
                   class.  */
 
@@ -2901,8 +3039,8 @@
                       them).  */
                    if (c == ':' && *p == ']')
                      {
-                       re_wchar_t ch;
                        re_wctype_t cc;
+                       int limit;
 
                        cc = re_wctype (str);
 
@@ -2915,6 +3053,15 @@
 
                         if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
+#ifndef emacs
+                       for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
+                         if (re_iswctype (btowc (ch), cc))
+                           {
+                             c = TRANSLATE (ch);
+                             if (c < (1 << BYTEWIDTH))
+                               SET_LIST_BIT (c);
+                           }
+#else  /* emacs */
                        /* Most character classes in a multibyte match
                           just set a flag.  Exceptions are is_blank,
                           is_digit, is_cntrl, and is_xdigit, since
@@ -2922,18 +3069,25 @@
                           don't need to handle them for multibyte.
                           They are distinguished by a negative wctype.  */
 
-                       if (multibyte)
-                         SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
-                                                        re_wctype_to_bit (cc));
-
-                        for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
+                       for (ch = 0; ch < 256; ++ch)
                          {
-                           int translated = TRANSLATE (ch);
-                           if (translated < (1 << BYTEWIDTH)
-                               && re_iswctype (btowc (ch), cc))
-                             SET_LIST_BIT (translated);
+                           c = RE_CHAR_TO_MULTIBYTE (ch);
+                           if (! CHAR_BYTE8_P (c)
+                               && re_iswctype (c, cc))
+                             {
+                               SET_LIST_BIT (ch);
+                               c1 = TRANSLATE (c);
+                               if (c1 == c)
+                                 continue;
+                               if (ASCII_CHAR_P (c1))
+                                 SET_LIST_BIT (c1);
+                               else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
+                                 SET_LIST_BIT (c1);
                          }
-
+                         }
+                       SET_RANGE_TABLE_WORK_AREA_BIT
+                         (range_table_work, re_wctype_to_bit (cc));
+#endif /* emacs */
                        /* In most cases the matching rule for char classes
                           only uses the syntax table for multibyte chars,
                           so that the content of the syntax-table it is not
@@ -2966,64 +3120,63 @@
 
                    /* Fetch the character which ends the range. */
                    PATFETCH (c1);
-
-                   if (SINGLE_BYTE_CHAR_P (c))
-                     {
-                       if (! SINGLE_BYTE_CHAR_P (c1))
-                         {
-                           /* Handle a range starting with a
-                              character of less than 256, and ending
-                              with a character of not less than 256.
-                              Split that into two ranges, the low one
-                              ending at 0377, and the high one
-                              starting at the smallest character in
-                              the charset of C1 and ending at C1.  */
-                           int charset = CHAR_CHARSET (c1);
-                           re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
-
-                           SET_RANGE_TABLE_WORK_AREA (range_table_work,
-                                                      c2, c1);
-                           c1 = 0377;
-                         }
-                     }
-                   else if (!SAME_CHARSET_P (c, c1))
-                     FREE_STACK_RETURN (REG_ERANGEX);
+#ifdef emacs
+                   if (CHAR_BYTE8_P (c1)
+                       && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
+                     /* Treat the range from a multibyte character to
+                        raw-byte character as empty.  */
+                     c = c1 + 1;
+#endif /* emacs */
                  }
                else
                  /* Range from C to C. */
                  c1 = c;
 
-               /* Set the range ... */
-               if (SINGLE_BYTE_CHAR_P (c))
-                 /* ... into bitmap.  */
-                 {
-                   re_wchar_t this_char;
-                   re_wchar_t range_start = c, range_end = c1;
-
-                   /* If the start is after the end, the range is empty.  */
-                   if (range_start > range_end)
+               if (c > c1)
                      {
                        if (syntax & RE_NO_EMPTY_RANGES)
-                         FREE_STACK_RETURN (REG_ERANGE);
+                     FREE_STACK_RETURN (REG_ERANGEX);
                        /* Else, repeat the loop.  */
                      }
                    else
                      {
-                       for (this_char = range_start; this_char <= range_end;
-                            this_char++)
+#ifndef emacs
+                   /* Set the range into bitmap */
+                   for (; c <= c1; c++)
                          {
-                           int translated = TRANSLATE (this_char);
-                           if (translated < (1 << BYTEWIDTH))
-                             SET_LIST_BIT (translated);
-                           else
-                             SET_RANGE_TABLE_WORK_AREA
-                               (range_table_work, translated, translated);
+                       ch = TRANSLATE (c);
+                       if (ch < (1 << BYTEWIDTH))
+                         SET_LIST_BIT (ch);
                          }
+#else  /* emacs */
+                   if (c < 128)
+                     {
+                       ch = MIN (127, c1);
+                       SETUP_ASCII_RANGE (range_table_work, c, ch);
+                       c = ch + 1;
+                       if (CHAR_BYTE8_P (c1))
+                         c = BYTE8_TO_CHAR (128);
+                     }
+                   if (c <= c1)
+                     {
+                       if (CHAR_BYTE8_P (c))
+                         {
+                           c = CHAR_TO_BYTE8 (c);
+                           c1 = CHAR_TO_BYTE8 (c1);
+                           for (; c <= c1; c++)
+                             SET_LIST_BIT (c);
                      }
+                       else if (multibyte)
+                         {
+                           SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
                  }
                else
-                 /* ... into range table.  */
-                 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
+                         {
+                           SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
+                         }
+                     }
+#endif /* emacs */
+                 }
              }
 
            /* Discard any (non)matching list bytes that are all 0 at the
@@ -3634,12 +3787,25 @@
          {
            int len;
 
-           c = TRANSLATE (c);
            if (multibyte)
+             {
+               c = TRANSLATE (c);
              len = CHAR_STRING (c, b);
-           else
-             *b = c, len = 1;
            b += len;
+             }
+           else
+             {
+               c1 = RE_CHAR_TO_MULTIBYTE (c);
+               if (! CHAR_BYTE8_P (c1))
+                 {
+                   re_wchar_t c2 = TRANSLATE (c1);
+
+                   if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
+                     c = c1;
+                 }                   
+               *b++ = c;
+               len = 1;
+             }
            (*pending_exact) += len;
          }
 
@@ -3909,14 +4075,21 @@
        case exactn:
          if (fastmap)
            {
-             int c = RE_STRING_CHAR (p + 1, pend - p);
-             /* When fast-scanning, the fastmap can be indexed either with
-                a char (smaller than 256) or with the first byte of
-                a char's byte sequence.  So we have to conservatively add
-                both to the table.  */
-             if (SINGLE_BYTE_CHAR_P (c))
-               fastmap[c] = 1;
+             /* If multibyte is nonzero, the first byte of each
+                character is an ASCII or a leading code.  Otherwise,
+                each byte is a character.  Thus, this works in both
+                cases. */
              fastmap[p[1]] = 1;
+             if (! multibyte)
+               {
+                 /* For the case of matching this unibyte regex
+                    against multibyte, we must set a leading code of
+                    the corresponding multibyte character.  */
+                 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
+
+                 if (! CHAR_BYTE8_P (c))
+                   fastmap[CHAR_LEADING_CODE (c)] = 1;
+               }
            }
          break;
 
@@ -3929,18 +4102,14 @@
 
 
        case charset_not:
-         /* Chars beyond end of bitmap are possible matches.
-            All the single-byte codes can occur in multibyte buffers.
-            So any that are not listed in the charset
-            are possible matches, even in multibyte buffers.  */
          if (!fastmap) break;
-         /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
-            because it will automatically be set when needed by virtue of
-            being larger than the highest char of its charset (0xbf) but
-            smaller than (1<<BYTEWIDTH).  */
+         {
+           /* Chars beyond end of bitmap are possible matches.  */
          for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
               j < (1 << BYTEWIDTH); j++)
            fastmap[j] = 1;
+         }
+
          /* Fallthrough */
        case charset:
          if (!fastmap) break;
@@ -3948,27 +4117,23 @@
          for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
               j >= 0; j--)
            if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-             {
                fastmap[j] = 1;
-#ifdef emacs
-               if (j >= 0x80 && j < 0xa0)
-                 fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
-#endif
-             }
 
-         if ((not && multibyte)
-             /* Any character set can possibly contain a character
+#ifdef emacs
+         if (/* Any leading code can possibly start a character
                 which doesn't match the specified set of characters.  */
-             || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+             not
+             || 
+             /* If we can match a character class, we can match any
+                multibyte characters.  */
+             (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
                  && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
-           /* If we can match a character class, we can match
-              any character set.  */
+
            {
-           set_fastmap_for_multibyte_characters:
              if (match_any_multibyte_characters == false)
                {
-                 for (j = 0x80; j < 0xA0; j++) /* XXX */
-                   if (BASE_LEADING_CODE_P (j))
+                 for (j = MIN_MULTIBYTE_LEADING_CODE;
+                      j <= MAX_MULTIBYTE_LEADING_CODE; j++)
                      fastmap[j] = 1;
                  match_any_multibyte_characters = true;
                }
@@ -3977,9 +4142,10 @@
          else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
                   && match_any_multibyte_characters == false)
            {
-             /* Set fastmap[I] 1 where I is a base leading code of each
-                multibyte character in the range table. */
+             /* Set fastmap[I] to 1 where I is a leading code of each
+                multibyte characer in the range table. */
              int c, count;
+             unsigned char lc1, lc2;
 
              /* Make P points the range table.  `+ 2' is to skip flag
                 bits for a character class.  */
@@ -3987,14 +4153,19 @@
 
              /* Extract the number of ranges in range table into COUNT.  */
              EXTRACT_NUMBER_AND_INCR (count, p);
-             for (; count > 0; count--, p += 2 * 3) /* XXX */
+             for (; count > 0; count--, p += 3)
                {
-                 /* Extract the start of each range.  */
+                 /* Extract the start and end of each range.  */
                  EXTRACT_CHARACTER (c, p);
-                 j = CHAR_CHARSET (c);
-                 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
+                 lc1 = CHAR_LEADING_CODE (c);
+                 p += 3;
+                 EXTRACT_CHARACTER (c, p);
+                 lc2 = CHAR_LEADING_CODE (c);
+                 for (j = lc1; j <= lc2; j++)
+                   fastmap[j] = 1;
                }
            }
+#endif
          break;
 
        case syntaxspec:
@@ -4017,14 +4188,19 @@
          if (!fastmap) break;
          not = (re_opcode_t)p[-1] == notcategoryspec;
          k = *p++;
-         for (j = 0; j < (1 << BYTEWIDTH); j++)
+         for (j = (1 << BYTEWIDTH); j >= 0; j--)
            if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
              fastmap[j] = 1;
 
-         if (multibyte)
-           /* Any character set can possibly contain a character
-              whose category is K (or not).  */
-           goto set_fastmap_for_multibyte_characters;
+         /* Any leading code can possibly start a character which
+            has or doesn't has the specified category.  */
+         if (match_any_multibyte_characters == false)
+           {
+             for (j = MIN_MULTIBYTE_LEADING_CODE;
+                  j <= MAX_MULTIBYTE_LEADING_CODE; j++)
+               fastmap[j] = 1;
+             match_any_multibyte_characters = true;
+           }
          break;
 
       /* All cases after this match the empty string.  These end with
@@ -4274,9 +4450,8 @@
   int total_size = size1 + size2;
   int endpos = startpos + range;
   boolean anchored_start;
-
-  /* Nonzero if we have to concern multibyte character.  */
-  const boolean multibyte = RE_MULTIBYTE_P (bufp);
+  /* Nonzero if we are searching multibyte string.  */
+  const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
 
   /* Check for out-of-range STARTPOS.  */
   if (startpos < 0 || startpos > total_size)
@@ -4372,59 +4547,51 @@
 
                        buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
                                                         buf_charlen);
-
                        buf_ch = RE_TRANSLATE (translate, buf_ch);
-                       if (buf_ch >= 0400
-                           || fastmap[buf_ch])
+                       if (fastmap[CHAR_LEADING_CODE (buf_ch)])
                          break;
 
                        range -= buf_charlen;
                        d += buf_charlen;
                      }
                  else
+                   while (range > lim)
                    {
-                     /* Convert *d to integer to shut up GCC's
-                        whining about comparison that is always
-                        true.  */
-                     int di = *d;
+                       register re_wchar_t ch, translated;
 
-                     while (range > lim
-                            && !fastmap[RE_TRANSLATE (translate, di)])
-                       {
-                         di = *(++d);
+                       buf_ch = *d;
+                       ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
+                       translated = RE_TRANSLATE (translate, ch);
+                       if (translated != ch
+                           && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
+                         buf_ch = ch;
+                       if (fastmap[buf_ch])
+                         break;
+                       d++;
                          range--;
                        }
                    }
-               }
              else
-               do
                  {
-                   re_char *d_start = d;
+                 if (multibyte)
+                   while (range > lim)
+                     {
+                       int buf_charlen;
+
+                       buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
+                                                        buf_charlen);
+                       if (fastmap[CHAR_LEADING_CODE (buf_ch)])
+                         break;
+                       range -= buf_charlen;
+                       d += buf_charlen;
+                     }
+                 else
                    while (range > lim && !fastmap[*d])
                      {
                        d++;
                        range--;
                      }
-#ifdef emacs
-                   if (multibyte && range > lim)
-                     {
-                       /* Check that we are at the beginning of a char.  */
-                       int at_boundary;
-                       AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
-                       if (at_boundary)
-                         break;
-                       else
-                         { /* We have matched an internal byte of a char
-                              rather than the leading byte, so it's a false
-                              positive: we should keep scanning.  */
-                           d++; range--;
-                         }
                      }
-                   else
-#endif
-                     break;
-                 } while (1);
-
              startpos += irange - range;
            }
          else                          /* Searching backwards.  */
@@ -4432,14 +4599,28 @@
              int room = (startpos >= size1
                          ? size2 + size1 - startpos
                          : size1 - startpos);
-             buf_ch = RE_STRING_CHAR (d, room);
+             if (multibyte)
+               {
+                 buf_ch = STRING_CHAR (d, room);
              buf_ch = TRANSLATE (buf_ch);
+                 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
+                   goto advance;
+               }
+             else
+               {
+                 register re_wchar_t ch, translated;
 
-             if (! (buf_ch >= 0400
-                    || fastmap[buf_ch]))
+                 buf_ch = *d;
+                 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
+                 translated = TRANSLATE (ch);
+                 if (translated != ch
+                     && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
+                   buf_ch = ch;
+                 if (! fastmap[TRANSLATE (buf_ch)])
                goto advance;
            }
        }
+       }
 
       /* If can't match the null string, and that's all we have left, fail.  */
       if (range >= 0 && startpos == total_size && fastmap
@@ -4711,11 +4892,11 @@
       {
        register re_wchar_t c
          = (re_opcode_t) *p2 == endline ? '\n'
-         : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
+         : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte);
 
        if ((re_opcode_t) *p1 == exactn)
          {
-           if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
+           if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte))
              {
                DEBUG_PRINT3 ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
                return 1;
@@ -4729,7 +4910,7 @@
 
            /* Test if C is listed in charset (or charset_not)
               at `p1'.  */
-           if (SINGLE_BYTE_CHAR_P (c))
+           if (! multibyte || IS_REAL_ASCII (c))
              {
                if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
                    && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
@@ -4772,9 +4953,10 @@
             size of bitmap table of P1 is extracted by
             using macro `CHARSET_BITMAP_SIZE'.
 
-            Since we know that all the character listed in
-            P2 is ASCII, it is enough to test only bitmap
-            table of P1.  */
+            In a multibyte case, we know that all the character
+            listed in P2 is ASCII.  In a unibyte case, P1 has only a
+            bitmap table.  So, in both cases, it is enough to test
+            only the bitmap table of P1.  */
 
          if ((re_opcode_t) *p1 == charset)
            {
@@ -4932,6 +5114,7 @@
 }
 WEAK_ALIAS (__re_match_2, re_match_2)
 
+
 /* This is a separate function so that we can force an alloca cleanup
    afterwards.  */
 static int
@@ -4971,9 +5154,12 @@
   /* We use this to map every character in the string.  */
   RE_TRANSLATE_TYPE translate = bufp->translate;
 
-  /* Nonzero if we have to concern multibyte character.  */
+  /* Nonzero if BUFP is setup from a multibyte regex.  */
   const boolean multibyte = RE_MULTIBYTE_P (bufp);
 
+  /* Nonzero if STRING1/STRING2 are multibyte.  */
+  const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
+
   /* Failure point stack.  Each place that can handle a failure further
      down the line pushes a failure point on this stack.  It consists of
      regstart, and regend for all registers corresponding to
@@ -5325,22 +5511,51 @@
          /* Remember the start point to rollback upon failure.  */
          dfail = d;
 
+#ifndef emacs
          /* This is written out as an if-else so we don't waste time
             testing `translate' inside the loop.  */
          if (RE_TRANSLATE_P (translate))
+           do
            {
-             if (multibyte)
+               PREFETCH ();
+               if (RE_TRANSLATE (translate, *d) != *p++)
+                 {
+                   d = dfail;
+                   goto fail;
+                 }
+               d++;
+             }
+           while (--mcnt);
+         else
+           do
+             {
+               PREFETCH ();
+               if (*d++ != *p++)
+                 {
+                   d = dfail;
+                   goto fail;
+                 }
+             }
+           while (--mcnt);
+#else  /* emacs */
+         /* The cost of testing `translate' is comparatively small.  */
+         if (target_multibyte)
                do
                  {
                    int pat_charlen, buf_charlen;
-                   unsigned int pat_ch, buf_ch;
+               int pat_ch, buf_ch;
 
                    PREFETCH ();
+               if (multibyte)
                    pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+               else
+                 {
+                   pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
+                   pat_charlen = 1;
+                 }
                    buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
 
-                   if (RE_TRANSLATE (translate, buf_ch)
-                       != pat_ch)
+               if (TRANSLATE (buf_ch) != pat_ch)
                      {
                        d = dfail;
                        goto fail;
@@ -5354,34 +5569,41 @@
              else
                do
                  {
-                   /* Avoid compiler whining about comparison being
-                      always true.  */
-                   int di;
+               int pat_charlen, buf_charlen;
+               int pat_ch, buf_ch;
 
                    PREFETCH ();
-                   di = *d;
-                   if (RE_TRANSLATE (translate, di) != *p++)
+               if (multibyte)
                      {
-                       d = dfail;
-                       goto fail;
-                     }
-                   d++;
-                 }
-               while (--mcnt);
+                   pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+                   if (CHAR_BYTE8_P (pat_ch))
+                     pat_ch = CHAR_TO_BYTE8 (pat_ch);
+                   else
+                     pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
            }
          else
            {
-             do
+                   pat_ch = *p;
+                   pat_charlen = 1;
+                 }
+               buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
+               if (! CHAR_BYTE8_P (buf_ch))
                {
-                 PREFETCH ();
-                 if (*d++ != *p++)
+                   buf_ch = TRANSLATE (buf_ch);
+                   buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
+                   if (buf_ch < 0)
+                     buf_ch = *d;
+                 }
+               if (buf_ch != pat_ch)
                    {
                      d = dfail;
                      goto fail;
                    }
+               p += pat_charlen;
+               d++;
                }
              while (--mcnt);
-           }
+#endif
          break;
 
 
@@ -5394,7 +5616,8 @@
            DEBUG_PRINT1 ("EXECUTING anychar.\n");
 
            PREFETCH ();
-           buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+           buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen,
+                                               target_multibyte);
            buf_ch = TRANSLATE (buf_ch);
 
            if ((!(bufp->syntax & RE_DOT_NEWLINE)
@@ -5438,10 +5661,30 @@
              }
 
            PREFETCH ();
-           c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
-           c = TRANSLATE (c); /* The character to match.  */
+           c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte);
+           if (target_multibyte)
+             {
+               int c1;
+
+               c = TRANSLATE (c);
+               c1 = RE_CHAR_TO_UNIBYTE (c);
+               if (c1 >= 0)
+                 c = c1;
+             }
+           else
+             {
+               int c1 = RE_CHAR_TO_MULTIBYTE (c);
+
+               if (! CHAR_BYTE8_P (c1))
+                 {
+                   c1 = TRANSLATE (c1);
+                   c1 = RE_CHAR_TO_UNIBYTE (c1);
+                   if (c1 >= 0)
+                     c = c1;
+                 }
+             }
 
-           if (SINGLE_BYTE_CHAR_P (c))
+           if (c < (1 << BYTEWIDTH))
              {                 /* Lookup bitmap.  */
                /* Cast to `unsigned' instead of `unsigned char' in
                   case the bit list is a full 32 bytes long.  */
@@ -5581,7 +5824,7 @@
                /* Compare that many; failure if mismatch, else move
                   past them.  */
                if (RE_TRANSLATE_P (translate)
-                   ? bcmp_translate (d, d2, mcnt, translate, multibyte)
+                   ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
                    : memcmp (d, d2, mcnt))
                  {
                    d = dfail;
@@ -5604,7 +5847,7 @@
            }
          else
            {
-             unsigned char c;
+             unsigned c;
              GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
              if (c == '\n')
                break;
@@ -5871,6 +6114,7 @@
                 is the character at D, and S2 is the syntax of C2.  */
              re_wchar_t c1, c2;
              int s1, s2;
+             int dummy;
 #ifdef emacs
              int offset = PTR_TO_OFFSET (d - 1);
              int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
@@ -5882,7 +6126,7 @@
              UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
 #endif
              PREFETCH_NOLIMIT ();
-             c2 = RE_STRING_CHAR (d, dend - d);
+             GET_CHAR_AFTER (c2, d, dummy);
              s2 = SYNTAX (c2);
 
              if (/* Case 2: Only one of S1 and S2 is Sword.  */
@@ -5911,13 +6155,14 @@
                 is the character at D, and S2 is the syntax of C2.  */
              re_wchar_t c1, c2;
              int s1, s2;
+             int dummy;
 #ifdef emacs
              int offset = PTR_TO_OFFSET (d);
              int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
              UPDATE_SYNTAX_TABLE (charpos);
 #endif
              PREFETCH ();
-             c2 = RE_STRING_CHAR (d, dend - d);
+             GET_CHAR_AFTER (c2, d, dummy);
              s2 = SYNTAX (c2);
 
              /* Case 2: S2 is not Sword. */
@@ -5955,6 +6200,7 @@
                 is the character at D, and S2 is the syntax of C2.  */
              re_wchar_t c1, c2;
              int s1, s2;
+             int dummy;
 #ifdef emacs
              int offset = PTR_TO_OFFSET (d) - 1;
              int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
@@ -5971,9 +6217,9 @@
              if (!AT_STRINGS_END (d))
                {
                  PREFETCH_NOLIMIT ();
-                 c2 = RE_STRING_CHAR (d, dend - d);
+                 GET_CHAR_AFTER (c2, d, dummy);
 #ifdef emacs
-                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+                 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
 #endif
                  s2 = SYNTAX (c2);
 
@@ -6005,7 +6251,7 @@
              UPDATE_SYNTAX_TABLE (charpos);
 #endif
              PREFETCH ();
-             c2 = RE_STRING_CHAR (d, dend - d);
+             c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
              s2 = SYNTAX (c2);
 
              /* Case 2: S2 is neither Sword nor Ssymbol. */
@@ -6058,7 +6304,7 @@
              if (!AT_STRINGS_END (d))
                {
                  PREFETCH_NOLIMIT ();
-                 c2 = RE_STRING_CHAR (d, dend - d);
+                 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
 #ifdef emacs
                  UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
 #endif
@@ -6088,8 +6334,7 @@
            int len;
            re_wchar_t c;
 
-           c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
-
+           GET_CHAR_AFTER (c, d, len);
            if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
              goto fail;
            d += len;
@@ -6125,8 +6370,7 @@
            int len;
            re_wchar_t c;
 
-           c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
-
+           GET_CHAR_AFTER (c, d, len);
            if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
              goto fail;
            d += len;
@@ -6201,11 +6445,11 @@
    bytes; nonzero otherwise.  */
 
 static int
-bcmp_translate (s1, s2, len, translate, multibyte)
+bcmp_translate (s1, s2, len, translate, target_multibyte)
      re_char *s1, *s2;
      register int len;
      RE_TRANSLATE_TYPE translate;
-     const int multibyte;
+     const int target_multibyte;
 {
   register re_char *p1 = s1, *p2 = s2;
   re_char *p1_end = s1 + len;
@@ -6218,8 +6462,8 @@
       int p1_charlen, p2_charlen;
       re_wchar_t p1_ch, p2_ch;
 
-      p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
-      p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+      GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
+      GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
 
       if (RE_TRANSLATE (translate, p1_ch)
          != RE_TRANSLATE (translate, p2_ch))




reply via email to

[Prev in Thread] Current Thread [Next in Thread]