emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Regexp matching errors


From: Stefan Monnier
Subject: Regexp matching errors
Date: Fri, 22 Sep 2006 13:34:01 -0400
User-agent: Gnus/5.11 (Gnus v5.11) Emacs/22.0.50 (gnu/linux)

The handling of the fastmap optimization in regex.c had some bugs w.r.t.
eight-bit-* chars in multibyte buffers/strings when case-fold is not in use:

   src/emacs --batch -Q --eval \
             '(let ((case-fold-search nil))
                (message "%s" (list (string-match "\xa1\\|a" "éf\x81g")
                                    (string-match "\x81\\|a" "éf\x81g")
                                    (string-match "[\xa1]" "éf\x81g")
                                    (string-match "[\x81]" "éf\x81g"))))'

returned

   (2 nil 2 nil)

I've installed the patch below to hopefully fix them.
I believe this patch to be perfectly safe, but you never know, so if you
notice anything fishy about regexp-matching, please tell me.


        Stefan



2006-09-22  Stefan Monnier  <address@hidden>

        * regex.c (analyse_first): For eight-bit-control chars, mark both the
        char's value and its leading byte in the fastmap.
        (re_search_2): When fast-scanning without translation, be careful to
        check that we only match the leading byte of a multibyte char.

        * charset.h (PREV_CHAR_BOUNDARY): Make it work from within a char's
        byte sequence.
        (AT_CHAR_BOUNDARY_P): New macro.

Index: src/charset.h
===================================================================
RCS file: /sources/emacs/emacs/src/charset.h,v
retrieving revision 1.83
diff -u -r1.83 charset.h
--- src/charset.h       29 May 2006 06:19:09 -0000      1.83
+++ src/charset.h       22 Sep 2006 17:29:43 -0000
@@ -658,22 +658,34 @@
   } while (0)
 
 
-/* If P is after LIMIT, advance P to the previous character boundary.
-   It assumes that P is already at a character boundary of the sane
-   mulitbyte form whose beginning address is LIMIT.  */
+/* If P is after LIMIT, advance P to the previous character boundary.  */
 
 #define PREV_CHAR_BOUNDARY(p, limit)                                   \
   do {                                                                 \
     if ((p) > (limit))                                                 \
       {                                                                \
        const unsigned char *p0 = (p);                                  \
+       const unsigned char *p_limit = max (limit, p0 - MAX_MULTIBYTE_LENGTH);\
        do {                                                            \
          p0--;                                                         \
-       } while (p0 >= limit && ! CHAR_HEAD_P (*p0));                   \
-       (p) = (BYTES_BY_CHAR_HEAD (*p0) == (p) - p0) ? p0 : (p) - 1;    \
+       } while (p0 >= p_limit && ! CHAR_HEAD_P (*p0));                 \
+       /* If BBCH(*p0) > p-p0, it means we were not on a boundary.  */ \
+       (p) = (BYTES_BY_CHAR_HEAD (*p0) >= (p) - p0) ? p0 : (p) - 1;    \
       }                                                                \
   } while (0)
 
+#define AT_CHAR_BOUNDARY_P(result, p, limit)   \
+  do {                                         \
+    if (CHAR_HEAD_P (*(p)) || (p) <= limit)    \
+      /* Optimization for the common case. */  \
+      (result) = 1;                            \
+    else                                       \
+      {                                                \
+       const unsigned char *p_aux = (p)+1;     \
+       PREV_CHAR_BOUNDARY (p_aux, limit);      \
+       (result) = (p_aux == (p));              \
+      }                                                \
+} while (0)
 
 #ifdef emacs
 
Index: src/regex.c
===================================================================
RCS file: /sources/emacs/emacs/src/regex.c,v
retrieving revision 1.212
diff -u -r1.212 regex.c
--- src/regex.c 16 Sep 2006 15:28:47 -0000      1.212
+++ src/regex.c 22 Sep 2006 17:29:43 -0000
@@ -3877,11 +3877,13 @@
          if (fastmap)
            {
              int c = RE_STRING_CHAR (p + 1, pend - p);
-
+             /* When fast-scanning, the fastmap can be indexed either with
+                a char (smaller than 256) or with the first byte of
+                a char's byte sequence.  So we have to conservatively add
+                both to the table.  */
              if (SINGLE_BYTE_CHAR_P (c))
                fastmap[c] = 1;
-             else
-               fastmap[p[1]] = 1;
+             fastmap[p[1]] = 1;
            }
          break;
 
@@ -3899,6 +3901,10 @@
             So any that are not listed in the charset
             are possible matches, even in multibyte buffers.  */
          if (!fastmap) break;
+         /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
+            because it will automatically be set when needed by virtue of
+            being larger than the highest char of its charset (0xbf) but
+            smaller than (1<<BYTEWIDTH).  */
          for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
               j < (1 << BYTEWIDTH); j++)
            fastmap[j] = 1;
@@ -3909,7 +3915,13 @@
          for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
               j >= 0; j--)
            if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-             fastmap[j] = 1;
+             {
+               fastmap[j] = 1;
+#ifdef emacs
+               if (j >= 0x80 && j < 0xa0)
+                 fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
+#endif
+             }
 
          if ((not && multibyte)
              /* Any character set can possibly contain a character
@@ -4352,11 +4364,33 @@
                    }
                }
              else
-               while (range > lim && !fastmap[*d])
+               do
                  {
-                   d++;
-                   range--;
-                 }
+                   re_char *d_start = d;
+                   while (range > lim && !fastmap[*d])
+                     {
+                       d++;
+                       range--;
+                     }
+#ifdef emacs
+                   if (multibyte && range > lim)
+                     {
+                       /* Check that we are at the beginning of a char.  */
+                       int at_boundary;
+                       AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
+                       if (at_boundary)
+                         break;
+                       else
+                         { /* We have matched an internal byte of a char
+                              rather than the leading byte, so it's a false
+                              positive: we should keep scanning.  */
+                           d++; range--;
+                         }
+                     }
+                   else
+#endif
+                     break;
+                 } while (1);
 
              startpos += irange - range;
            }




reply via email to

[Prev in Thread] Current Thread [Next in Thread]