[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] proper lowercasing of multi-octet-char "keys" string
From: Charles Levert
Subject: [PATCH] proper lowercasing of multi-octet-char "keys" string
Date: Thu, 3 Nov 2005 18:50:09 -0500
User-agent: Mutt/1.4.1i
Ok to commit?
Index: grep/ChangeLog
===================================================================
RCS file: /cvsroot/grep/grep/ChangeLog,v
retrieving revision 1.272
diff -u -r1.272 ChangeLog
--- grep/ChangeLog 27 Sep 2005 14:50:20 -0000 1.272
+++ grep/ChangeLog 3 Nov 2005 23:28:05 -0000
@@ -1,3 +1,11 @@
+2005-11-04 Charles Levert <address@hidden>
+
+ * src/grep.c (mb_icase_keys): New function to properly lowercase
+ keys if match_icase. The problem was that some multi-octet
+ characters can get longer or shorter upon this conversion, so that
+ it cannot just naively be done in place on the same memory buffer.
+ * src/grep.c (main): Call mb_icase_keys (and remove in-line code).
+
2005-09-27 Stepan Kasal <address@hidden>
* doc/grep.1: Fix a typo.
Index: grep/src/grep.c
===================================================================
RCS file: /cvsroot/grep/grep/src/grep.c,v
retrieving revision 1.113
diff -p -u -r1.113 grep.c
--- grep/src/grep.c 24 Aug 2005 07:28:29 -0000 1.113
+++ grep/src/grep.c 3 Nov 2005 23:28:14 -0000
@@ -1664,6 +1664,66 @@ parse_grep_colors (void)
program_name, p, q);
}
+#ifdef MBS_SUPPORT
+/* Lower-case the search pattern for case-insensitive matching in a
+   multi-octet locale.  When MB_CUR_MAX is 1 or match_icase is unset,
+   *KEYS and *LEN are left untouched.  Otherwise the buffer *KEYS
+   (*LEN octets) is freed and replaced by a newly xmalloc'ed,
+   NUL-terminated, lower-cased copy, and *LEN is set to the length of
+   that copy (not counting the terminating NUL).  */
+static void
+mb_icase_keys (char **keys, size_t *len)
+{
+  wchar_t wc;
+  mbstate_t sti, stj;
+  size_t i, j, li, lj;
+  char *ki, *kj;
+  int mcm;
+
+  if ((mcm = MB_CUR_MAX) == 1 || !match_icase)
+    return;
+
+  li = *len;
+  ki = *keys;
+  /* We use a new buffer because some multi-octet characters change
+     length through a lower-case conversion.  For example:
+       len(U+0049)=1 --> len(U+0131)=2 under tr_TR.UTF-8
+       len(U+0130)=2 --> len(U+0069)=1 under en_US.UTF-8
+       len(U+2126)=3 --> len(U+03C9)=2 under en_US.UTF-8
+       len(U+212A)=3 --> len(U+006B)=1 under en_US.UTF-8
+       len(U+212B)=3 --> len(U+00E5)=2 under en_US.UTF-8  */
+  lj = li + mcm;
+  kj = xmalloc (lj + 1);
+
+  memset (&sti, 0, sizeof sti);
+  memset (&stj, 0, sizeof stj);
+  for (i = j = 0; i < li;)
+    {
+      size_t mbclen, n;
+
+      mbclen = mbrtowc (&wc, ki + i, li - i, &sti);
+      /* Always keep room for one more maximal-length character.  */
+      if (lj < j + mcm)
+        {
+          lj += mcm;
+          kj = xrealloc (kj, lj + 1);
+        }
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+        {
+          /* An invalid sequence, or a truncated multi-octet character.
+             We treat it as a single-octet character.  After such a
+             result the input state is unspecified or mid-character,
+             so restart from the initial state for the next octet.  */
+          kj[j++] = ki[i++];
+          memset (&sti, 0, sizeof sti);
+          continue;
+        }
+
+      /* Doing towupper() before towlower() helps a few hairy cases and is
+         not too costly since this is the PATTERN and is done only once.  */
+      wc = towupper ((wint_t) wc);
+      wc = towlower ((wint_t) wc);
+      n = wcrtomb (kj + j, wc, &stj);
+      if (n == (size_t) -1)
+        {
+          /* The converted character cannot be encoded in this locale.
+             Keep the original octets rather than adding (size_t) -1
+             to J; this assumes one character never exceeds the
+             MB_CUR_MAX octets reserved above.  The output state is
+             unspecified after the failure, so reset it.  The loop
+             leaves N equal to MBCLEN.  */
+          memset (&stj, 0, sizeof stj);
+          for (n = 0; n < mbclen; n++)
+            kj[j + n] = ki[i + n];
+        }
+      j += n;
+      i += mbclen;
+    }
+  kj[j] = '\0';
+
+  free (ki);
+  *keys = kj;
+  *len = j;
+}
+#endif /* MBS_SUPPORT */
+
int
main (int argc, char **argv)
{
@@ -2100,34 +2160,7 @@ warranty; not even for MERCHANTABILITY o
abort ();
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX != 1 && match_icase)
- {
- wchar_t wc;
- mbstate_t cur_state, prev_state;
- int i, len = strlen(keys);
-
- memset(&cur_state, 0, sizeof(mbstate_t));
- for (i = 0; i <= len ;)
- {
- size_t mbclen;
- mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state);
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multibyte character.
- We treat it as a single byte character. */
- mbclen = 1;
- }
- else
- {
- if (iswupper((wint_t)wc))
- {
- wc = towlower((wint_t)wc);
- wcrtomb(keys + i, wc, &cur_state);
- }
- }
- i += mbclen;
- }
- }
+ mb_icase_keys (&keys, &keycc);
#endif /* MBS_SUPPORT */
(*compile)(keys, keycc);
- [PATCH] proper lowercasing of multi-octet-char "keys" string,
Charles Levert <=