bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: dfa - gawk matching problem on windows and suggested fix


From: Jim Meyering
Subject: Re: dfa - gawk matching problem on windows and suggested fix
Date: Sun, 02 Oct 2011 21:45:01 +0200

Aharon Robbins wrote:
...

Thanks for forwarding that Arnold,
and for the analysis/patch, Eli.

>> 2011-09-30  Eli Zaretskii  <address@hidden>
>>
>>      * dfa.c (FETCH_WC, FETCH): Produce an unsigned value, rather than
>>      a sign-extended one.  Fixes a bug on MS-Windows with compiling
>>      patterns that include characters with the 8-th bit set.
>>      Reported by David Millis <address@hidden>.
>>
>> --- dfa.c.orig       2011-06-23 12:27:01.000000000 +0300
>> +++ dfa.c    2011-09-30 16:06:25.609375000 +0300
>> @@ -691,19 +691,22 @@ static unsigned char const *buf_end;   /*
>>      else                                    \
>>        {                                             \
>>          wchar_t _wc;                                \
>> +        unsigned char uc;                   \
>>          cur_mb_len = mbrtowc(&_wc, lexptr, lexleft, &mbs); \
>>          if (cur_mb_len <= 0)                        \
>>            {                                 \
>>              cur_mb_len = 1;                 \
>>              --lexleft;                              \
>> -            (wc) = (c) = (unsigned char) *lexptr++; \
>> +            uc = (unsigned char) *lexptr++; \
>> +        (wc) = (c) = uc;                    \
>>            }                                 \
>>          else                                        \
>>            {                                 \
>>              lexptr += cur_mb_len;           \
>>              lexleft -= cur_mb_len;          \
>>              (wc) = _wc;                             \
>> -            (c) = wctob(wc);                        \
>> +            uc = (unsigned) wctob(wc);              \
>> +            (c) = uc;                               \
>>            }                                 \
>>        }                                             \
>>    } while(0)
>> @@ -718,6 +721,7 @@ static unsigned char const *buf_end;     /*
>>  /* Note that characters become unsigned here. */
>>  # define FETCH(c, eoferr)         \
>>    do {                                    \
>> +    unsigned char uc;                     \
>>      if (! lexleft)                \
>>        {                                   \
>>          if ((eoferr) != 0)        \
>> @@ -725,7 +729,8 @@ static unsigned char const *buf_end;     /*
>>          else                              \
>>            return lasttok = END;           \
>>        }                                   \
>> -    (c) = (unsigned char) *lexptr++;  \
>> +    uc = (unsigned char) *lexptr++;   \
>> +    (c) = uc;                             \
>>      --lexleft;                            \
>>    } while(0)

The changes that guard against sign extension seem justified in the
first and third cases.  However, since wctob is already returning
an int, do we really need to worry about sign extension there?

I have adjusted your patch to use the to_uchar function that
we use for precisely this purpose in coreutils.
Not only is it cleaner because it avoids an explicit cast, but it
is also shorter and requires no added temporary variable.

Eli, can you confirm that this also solves the problem
and that the log text is alright with you?  Once you do
that, and I've written a NEWS entry and a test, I'll push it.


>From 64f1937b504321bfffc5154b2ed079f34e4b2502 Mon Sep 17 00:00:00 2001
From: Eli Zaretskii <address@hidden>
Date: Sun, 2 Oct 2011 21:33:53 +0200
Subject: [PATCH] dfa: don't mishandle high-bit bytes in a regexp with
 signed-char

This appears to arise only on systems for which "char" is signed.
* src/dfa.c (FETCH_WC, FETCH): Produce an unsigned value, rather
than a sign-extended one.  Fixes a bug on MS-Windows with compiling
patterns that include characters with the 8-th bit set.
(to_uchar): Define.  From coreutils.
Reported by David Millis <address@hidden>.
See http://thread.gmane.org/gmane.comp.gnu.grep.bugs/3893
---
 src/dfa.c |    9 +++++++--
 1 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 8611435..dc87915 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -86,6 +86,11 @@
 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
 typedef int charclass[CHARCLASS_INTS];

+/* Convert a possibly-signed character to an unsigned character.  This is
+   a bit safer than casting to unsigned char, since it catches some type
+   errors that the cast doesn't.  */
+static inline unsigned char to_uchar (char ch) { return ch; }
+
 /* Sometimes characters can only be matched depending on the surrounding
    context.  Such context decisions depend on what the previous character
    was, and the value of the current (lookahead) character.  Context
@@ -686,7 +691,7 @@ static unsigned char const *buf_end;        /* reference to 
end in dfaexec().  */
           {                                    \
             cur_mb_len = 1;                    \
             --lexleft;                         \
-            (wc) = (c) = (unsigned char) *lexptr++; \
+            (wc) = (c) = to_uchar (*lexptr++);  \
           }                                    \
         else                                   \
           {                                    \
@@ -715,7 +720,7 @@ static unsigned char const *buf_end;        /* reference to 
end in dfaexec().  */
         else                         \
           return lasttok = END;              \
       }                                      \
-    (c) = (unsigned char) *lexptr++;  \
+    (c) = to_uchar (*lexptr++);       \
     --lexleft;                       \
   } while(0)

--
1.7.7.rc0.362.g5a14



reply via email to

[Prev in Thread] Current Thread [Next in Thread]