[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 2/2] regex: fix ignore-case Turkish bug
From: Paul Eggert
Subject: [PATCH 2/2] regex: fix ignore-case Turkish bug
Date: Wed, 23 Sep 2020 17:05:03 -0700
* lib/regex_internal.c (build_wcs_upper_buffer):
Do not assume that converting single-byte character to upper
yields a single-byte character. This is not true for Turkish,
where towupper (L'i') yields L'İ', which is not single-byte.
* tests/test-regex.c (main): Test for this bug.
---
ChangeLog | 7 +++++++
lib/regex_internal.c | 19 ++++++++++---------
tests/test-regex.c | 41 ++++++++++++++++++++++++++++++++++++-----
3 files changed, 53 insertions(+), 14 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index d15f158ab..5c4d8f849 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
2020-09-23 Paul Eggert <eggert@cs.ucla.edu>
+ regex: fix ignore-case Turkish bug
+ * lib/regex_internal.c (build_wcs_upper_buffer):
+ Do not assume that converting single-byte character to upper
+ yields a single-byte character. This is not true for Turkish,
+ where towupper (L'i') yields L'İ', which is not single-byte.
+ * tests/test-regex.c (main): Test for this bug.
+
regex: port to weird isascii platforms
* lib/regex_internal.h (isascii) [!_LIBC]: Supply glibc version.
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index e1b6b4d5a..ed0a13461 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -300,18 +300,20 @@ build_wcs_upper_buffer (re_string_t *pstr)
while (byte_idx < end_idx)
{
wchar_t wc;
+ unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
- if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
- && mbsinit (&pstr->cur_state))
+ if (isascii (ch) && mbsinit (&pstr->cur_state))
{
- /* In case of a singlebyte character. */
- pstr->mbs[byte_idx]
- = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
/* The next step uses the assumption that wchar_t is encoded
ASCII-safe: all ASCII values can be converted like this. */
- pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
- ++byte_idx;
- continue;
+ wchar_t wcu = __towupper (ch);
+ if (isascii (wcu))
+ {
+ pstr->mbs[byte_idx] = wcu;
+ pstr->wcs[byte_idx] = wcu;
+ byte_idx++;
+ continue;
+ }
}
remain_len = end_idx - byte_idx;
@@ -348,7 +350,6 @@ build_wcs_upper_buffer (re_string_t *pstr)
{
/* It is an invalid character, an incomplete character
at the end of the string, or '\0'. Just use the byte. */
- int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
pstr->mbs[byte_idx] = ch;
/* And also cast it to wide char. */
pstr->wcs[byte_idx++] = (wchar_t) ch;
diff --git a/tests/test-regex.c b/tests/test-regex.c
index d3f429aeb..b4e23c8c8 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -29,6 +29,15 @@
#include "localcharset.h"
+/* Check whether it's really a UTF-8 locale.
+ On mingw, setlocale (LC_ALL, "en_US.UTF-8") succeeds but returns
+ "English_United States.1252", with locale_charset () returning "CP1252". */
+static int
+really_utf8 (void)
+{
+ return strcmp (locale_charset (), "UTF-8") == 0;
+}
+
int
main (void)
{
@@ -75,11 +84,7 @@ main (void)
}
}
- /* Check whether it's really a UTF-8 locale.
- On mingw, the setlocale call succeeds but returns
- "English_United States.1252", with locale_charset() returning
- "CP1252". */
- if (strcmp (locale_charset (), "UTF-8") == 0)
+ if (really_utf8 ())
{
/* This test is from glibc bug 15078.
The test case is from Andreas Schwab in
@@ -119,6 +124,32 @@ main (void)
return 1;
}
+ if (setlocale (LC_ALL, "tr_TR.UTF-8") && really_utf8 ())
+ {
+ re_set_syntax (RE_SYNTAX_GREP | RE_ICASE);
+ if (re_compile_pattern ("i", 1, &regex))
+ result |= 1;
+ else
+ {
+ /* UTF-8 encoding of U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
+ In Turkish, this is the upper-case equivalent of ASCII "i".
+ Older versions of Gnulib failed to match "i" to U+0130 when
+ ignoring case in Turkish <https://bugs.gnu.org/43577>. */
+ static char const data[] = "\xc4\xb0";
+
+ memset (&regs, 0, sizeof regs);
+ if (re_search (&regex, data, sizeof data - 1, 0, sizeof data - 1,
+ &regs))
+ result |= 1;
+ regfree (&regex);
+ free (regs.start);
+ free (regs.end);
+
+ if (! setlocale (LC_ALL, "C"))
+ return 1;
+ }
+ }
+
/* This test is from glibc bug 3957, reported by Andrew Mackey. */
re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE);
memset (&regex, 0, sizeof regex);
--
2.25.4