bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] proper lowercasing of multi-octet-char "keys" string


From: Charles Levert
Subject: [PATCH] proper lowercasing of multi-octet-char "keys" string
Date: Thu, 3 Nov 2005 18:50:09 -0500
User-agent: Mutt/1.4.1i

Ok to commit?



Index: grep/ChangeLog
===================================================================
RCS file: /cvsroot/grep/grep/ChangeLog,v
retrieving revision 1.272
diff -u -r1.272 ChangeLog
--- grep/ChangeLog      27 Sep 2005 14:50:20 -0000      1.272
+++ grep/ChangeLog      3 Nov 2005 23:28:05 -0000
@@ -1,3 +1,11 @@
+2005-11-04  Charles Levert  <address@hidden>
+
+       * src/grep.c (mb_icase_keys): New function to properly lowercase
+         keys if match_icase.  The problem was that some multi-octet
+         characters can get longer or shorter upon this conversion, so that
+         it cannot just naively be done in place on the same memory buffer.
+       * src/grep.c (main): Call mb_icase_keys (and remove in-line code).
+
 2005-09-27  Stepan Kasal  <address@hidden>
 
        * doc/grep.1: Fix a typo.
Index: grep/src/grep.c
===================================================================
RCS file: /cvsroot/grep/grep/src/grep.c,v
retrieving revision 1.113
diff -p -u -r1.113 grep.c
--- grep/src/grep.c     24 Aug 2005 07:28:29 -0000      1.113
+++ grep/src/grep.c     3 Nov 2005 23:28:14 -0000
@@ -1664,6 +1664,66 @@ parse_grep_colors (void)
          program_name, p, q);
 }
 
+#ifdef MBS_SUPPORT
+/* If match_icase is set in a multi-octet locale, replace *KEYS (*LEN
+   octets) by a new NUL-terminated buffer holding the lowercased keys,
+   update *LEN to its length, and free the old buffer.  Else do nothing.  */
+static void
+mb_icase_keys (char **keys, size_t *len)
+{
+  wchar_t wc;
+  mbstate_t sti, stj;
+  size_t i, j, li, lj, mcm, mbclen, wclen;
+  char *ki, *kj;
+
+  if ((mcm = MB_CUR_MAX) == 1 || !match_icase)
+    return;
+
+  li = *len;
+  ki = *keys;
+  /* We use a new buffer because some multi-octet characters change
+     length through a lower-case conversion.  For example:
+       len(U+0049)=1 --> len(U+0131)=2   under tr_TR.UTF-8
+       len(U+0130)=2 --> len(U+0069)=1   under en_US.UTF-8
+       len(U+212A)=3 --> len(U+006B)=1   under en_US.UTF-8  */
+  lj = li + mcm;
+  kj = xmalloc(lj + 1);
+  memset(&sti, 0, sizeof(mbstate_t));
+  memset(&stj, 0, sizeof(mbstate_t));
+  for (i = j = 0; i < li ;)
+    {
+      mbclen = mbrtowc(&wc, ki + i, li - i, &sti);
+      if (lj < j + mcm)
+        kj = xrealloc(kj, (lj += mcm) + 1);
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+        {
+          /* An invalid sequence, or a truncated multi-octet character.
+             Copy one octet as is and restart STI from the initial state.  */
+          kj[j++] = ki[i++];
+          memset(&sti, 0, sizeof(mbstate_t));
+          continue;
+        }
+      /* Doing towupper() before towlower() helps a few hairy cases and is
+         not too costly since this is the PATTERN and is done only once.  */
+      wc = towlower((wint_t)towupper((wint_t)wc));
+      wclen = wcrtomb(kj + j, wc, &stj);
+      if (wclen == (size_t) -1)
+        {
+          /* WC is not encodable: keep the original octets, reset STJ.  */
+          memset(&stj, 0, sizeof(mbstate_t));
+          wclen = mbclen;
+          memcpy(kj + j, ki + i, wclen);
+        }
+      j += wclen;
+      i += mbclen;
+    }
+  kj[j] = '\0';
+  free(ki);
+  *keys = kj;
+  *len = j;
+}
+#endif /* MBS_SUPPORT */
+
 int
 main (int argc, char **argv)
 {
@@ -2100,34 +2160,7 @@ warranty; not even for MERCHANTABILITY o
     abort ();
 
 #ifdef MBS_SUPPORT
-  if (MB_CUR_MAX != 1 && match_icase)
-    {
-      wchar_t wc;
-      mbstate_t cur_state, prev_state;
-      int i, len = strlen(keys);
-
-      memset(&cur_state, 0, sizeof(mbstate_t));
-      for (i = 0; i <= len ;)
-       {
-         size_t mbclen;
-         mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state);
-         if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-           {
-             /* An invalid sequence, or a truncated multibyte character.
-                We treat it as a single byte character.  */
-             mbclen = 1;
-           }
-         else
-           {
-             if (iswupper((wint_t)wc))
-               {
-                 wc = towlower((wint_t)wc);
-                 wcrtomb(keys + i, wc, &cur_state);
-               }
-           }
-         i += mbclen;
-       }
-    }
+  mb_icase_keys (&keys, &keycc);
 #endif /* MBS_SUPPORT */
 
   (*compile)(keys, keycc);




reply via email to

[Prev in Thread] Current Thread [Next in Thread]