From df950021df03d33f06e54505e3e68be5989d72a6 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Fri, 7 Mar 2014 18:27:28 -0800
Subject: [PATCH] grep: fix case-fold mismatches between DFA and regex
The DFA code and the regex code didn't use the same semantics for
case-folding. The regex code says that the data char d matches
the pattern char p if uc (d) == uc (p). POSIX is unclear in this
area; the simplest fix for now is to change the DFA code to agree
with the regex code. See .
* src/dfa.c (static_assert): New macro, if not already defined.
(setbit_case_fold_c): Assume MB_CUR_MAX is 1 and that case_fold
is nonzero; all callers changed.
(setbit_case_fold_c, parse_bracket_exp, lex, atom):
Case-fold like the regex code does.
(lonesome_lower): New constant.
(case_folded_counterparts): New function.
(parse_bracket_exp): Prefer plain setbit when case-folding is
not needed.
* src/dfa.h (CASE_FOLDED_BUFSIZE): New constant.
(case_folded_counterparts): New function decl.
* src/main.c (trivial_case_ignore): Case-fold like the regex code does.
(main): Try to improve comment re trivial_case_ignore.
* tests/case-fold-titlecase: Add lots more test cases.
---
src/dfa.c | 143 ++++++++++++++++++++++++---------------
src/dfa.h | 8 +++
src/main.c | 57 ++++++++--------
tests/case-fold-titlecase | 167 +++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 282 insertions(+), 93 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 585a599..5910268 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -34,6 +34,12 @@
#include
#include
+/* Gawk doesn't use Gnulib, so don't assume static_assert is present. */
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+ extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
+
#define STREQ(a, b) (strcmp (a, b) == 0)
/* ISASCIIDIGIT differs from isdigit, as follows:
@@ -710,34 +716,16 @@ setbit_wc (wint_t wc, charclass c)
#endif
}
-/* Set a bit for B in the charclass C, if B is a valid single byte
- character in the current character set. If case is folded, set B's
- lower and upper case variants similarly. If MB_CUR_MAX > 1, the
- resulting charset is used only as an optimization, and the caller
- should set the appropriate field of struct mb_char_classes. */
+/* Set a bit for B and its case variants in the charclass C.
+ MB_CUR_MAX must be 1. */
static void
setbit_case_fold_c (int b, charclass c)
{
- if (MB_CUR_MAX > 1)
- {
- wint_t wc = btowc (b);
- if (wc == WEOF)
- return;
- if (case_fold)
- {
- setbit_wc (towlower (wc), c);
- setbit_wc (towupper (wc), c);
- }
- }
- else
- {
- if (case_fold)
- {
- setbit (tolower (b), c);
- setbit (toupper (b), c);
- }
- }
- setbit (b, c);
+ int ub = toupper (b);
+ int i;
+ for (i = 0; i < NOTCHAR; i++)
+ if (toupper (i) == ub)
+ setbit (i, c);
}
@@ -898,6 +886,50 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE,
+ };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+ == CASE_FOLDED_BUFSIZE),
+ "CASE_FOLDED_BUFSIZE is wrong");
+
+/* Find the characters equal to C after case-folding, other than C
+ itself, and store them into FOLDED. Return the number of characters
+ stored. */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+ int i;
+ int n = 0;
+ wint_t uc = towupper (c);
+ wint_t lc = towlower (uc);
+ if (uc != c)
+ folded[n++] = uc;
+ if (lc != uc && lc != c && towupper (lc) == uc)
+ folded[n++] = lc;
+ for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+ {
+ wint_t li = lonesome_lower[i];
+ if (li != lc && li != uc && li != c && towupper (li) == uc)
+ folded[n++] = li;
+ }
+ return n;
+}
+
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -1058,7 +1090,7 @@ parse_bracket_exp (void)
for (c2 = 0; c2 < NOTCHAR; ++c2)
if (pred->func (c2))
- setbit_case_fold_c (c2, ccl);
+ setbit (c2, ccl);
}
else
known_bracket_exp = false;
@@ -1125,8 +1157,21 @@ parse_bracket_exp (void)
}
}
else if (using_simple_locale ())
- for (; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
+ {
+ for (c1 = c; c1 <= c2; c1++)
+ setbit (c1, ccl);
+ if (case_fold)
+ {
+ int uc = toupper (c);
+ int uc2 = toupper (c2);
+ for (c1 = 0; c1 < NOTCHAR; c1++)
+ {
+ int uc1 = toupper (c1);
+ if (uc <= uc1 && uc1 <= uc2)
+ setbit (c1, ccl);
+ }
+ }
+ }
else
known_bracket_exp = false;
@@ -1145,26 +1190,22 @@ parse_bracket_exp (void)
if (MB_CUR_MAX == 1)
{
- setbit_case_fold_c (c, ccl);
+ if (case_fold)
+ setbit_case_fold_c (c, ccl);
+ else
+ setbit (c, ccl);
continue;
}
if (case_fold)
{
- wint_t folded = towlower (wc);
- if (folded != wc && !setbit_wc (folded, ccl))
- {
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = folded;
- }
- folded = towupper (wc);
- if (folded != wc && !setbit_wc (folded, ccl))
- {
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = folded;
- }
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wc, folded);
+ REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+ work_mbc->nchars + n);
+ for (i = 0; i < n; i++)
+ if (!setbit_wc (folded[i], ccl))
+ work_mbc->chars[work_mbc->nchars++] = folded[i];
}
if (!setbit_wc (wc, ccl))
{
@@ -1510,7 +1551,7 @@ lex (void)
if (MB_CUR_MAX > 1)
return lasttok = WCHAR;
- if (case_fold && (tolower (c) != c || toupper (c) != c))
+ if (case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
@@ -1757,18 +1798,14 @@ atom (void)
if (MBS_SUPPORT && tok == WCHAR)
{
addtok_wc (wctok);
+
if (case_fold)
{
- wint_t folded = towlower (wctok);
- if (folded != wctok)
- {
- addtok_wc (folded);
- addtok (OR);
- }
- folded = towupper (wctok);
- if (folded != wctok)
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wctok, folded);
+ for (i = 0; i < n; i++)
{
- addtok_wc (folded);
+ addtok_wc (folded[i]);
addtok (OR);
}
}
diff --git a/src/dfa.h b/src/dfa.h
index 7e0674f..ad2b854 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -101,3 +101,11 @@ extern void dfawarn (const char *);
extern _Noreturn void dfaerror (const char *);
extern int using_utf8 (void);
+
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is 1 for towupper, 1 for towlower, and 1 for each
+ entry in LONESOME_LOWER; see dfa.c. */
+enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };
+
+int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/main.c b/src/main.c
index 808e47a..4f8f1ef 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1892,11 +1892,12 @@ trivial_case_ignore (size_t len, char const *keys,
return false;
/* Worst case is that each byte B of KEYS is ASCII alphabetic and
- the two other_case(B) characters, C and D, each occupies
- MB_CUR_MAX bytes, so each B maps to [BCD], which requires 2 *
- MB_CUR_MAX + 3 bytes; this is bounded above by the constant
- expression 2 * MB_LEN_MAX + 3. */
- *new_keys = xnmalloc (len + 1, 2 * MB_LEN_MAX + 3);
+ CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each
+ occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which
+ requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is
+ bounded above by the constant expression CASE_FOLDED_BUFSIZE *
+ MB_LEN_MAX + 3. */
+ *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3);
char *p = *new_keys;
mbstate_t mb_state = { 0 };
@@ -1918,9 +1919,9 @@ trivial_case_ignore (size_t len, char const *keys,
keys += n;
len -= n;
- wint_t lc = towlower (wc);
- wint_t uc = towupper (wc);
- if (lc == wc && uc == wc)
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int nfolded = case_folded_counterparts (wc, folded);
+ if (nfolded <= 0)
{
memcpy (p, orig, n);
p += n;
@@ -1933,20 +1934,18 @@ trivial_case_ignore (size_t len, char const *keys,
memcpy (p, orig, n);
p += n;
- if (lc != wc)
- {
- size_t lcbytes = WCRTOMB (p, lc, &mb_state);
- if (lcbytes == (size_t) -1)
- goto skip_case_ignore_optimization;
- p += lcbytes;
- }
- if (uc != wc)
+ int i = 0;
+ do
{
- size_t ucbytes = WCRTOMB (p, uc, &mb_state);
- if (ucbytes == (size_t) -1 || ! mbsinit (&mb_state))
+ size_t nbytes = WCRTOMB (p, folded[i], &mb_state);
+ if (nbytes == (size_t) -1)
goto skip_case_ignore_optimization;
- p += ucbytes;
+ p += nbytes;
}
+ while (++i < nfolded);
+
+ if (! mbsinit (&mb_state))
+ goto skip_case_ignore_optimization;
*p++ = ']';
}
@@ -2351,16 +2350,16 @@ main (int argc, char **argv)
else
usage (EXIT_TROUBLE);
- /* As currently implemented, case-insensitive matching is expensive in
- multi-byte locales because of a few outlier locales in which some
- characters change size when converted to upper or lower case. To
- accommodate those, we revert to searching the input one line at a
- time, rather than using the much more efficient buffer search.
- However, if we have a regular expression, /foo/i, we can convert
- it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
- avoid the expensive read-and-process-a-line-at-a-time requirement.
- Optimize-away the "-i" option, when possible, converting each
- candidate alpha, C, in the regexp to [Cc]. */
+ /* Case-insensitive matching is expensive in multibyte locales
+ because a few characters may change size when converted to upper
+ or lower case. To accommodate those, search the input one line
+ at a time, rather than using the much more efficient buffer search.
+
+ Try to convert a regular expression 'foo' (ignoring case) to an
+ equivalent regular expression '[fF][oO][oO]' (where case matters).
+ Not only does this avoid the expensive requirement to read and
+ process a line at a time, it also allows use of the kwset engine,
+ a win in non-UTF-8 multibyte locales. */
if (match_icase)
{
size_t new_keycc;
diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase
index 0ece5c8..f16022b 100755
--- a/tests/case-fold-titlecase
+++ b/tests/case-fold-titlecase
@@ -1,5 +1,5 @@
#!/bin/sh
-# Check that case folding works even with titlecase characters.
+# Check that case folding works even with titlecase and similarly odd chars.
# Copyright 2014 Free Software Foundation, Inc.
@@ -25,17 +25,162 @@ export LC_ALL
fail=0
-LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ
-Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J
-lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ
-pattern=$(printf "$Lj\n") || framework_failure_
-printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_
+for testcase in \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
+do
+ case $testcase in
+ 0)
+ a='\302\265' # U+00B5
+ b='\316\234' # U+039C
+ c='\316\274' # U+03BC
+ ;;
+ 1)
+ a='\111' # U+0049
+ b='\151' # U+0069
+ c='\304\260' # U+0130
+ ;;
+ 2)
+ a='\111' # U+0049
+ b='\151' # U+0069
+ c='\304\261' # U+0131
+ ;;
+ 3)
+ a='\123' # U+0053
+ b='\163' # U+0073
+ c='\305\277' # U+017F
+ ;;
+ 4)
+ a='\307\204' # U+01C4
+ b='\307\205' # U+01C5
+ c='\307\206' # U+01C6
+ ;;
+ 5)
+ a='\307\207' # U+01C7
+ b='\307\210' # U+01C8
+ c='\307\211' # U+01C9
+ ;;
+ 6)
+ a='\307\212' # U+01CA
+ b='\307\213' # U+01CB
+ c='\307\214' # U+01CC
+ ;;
+ 7)
+ a='\307\261' # U+01F1
+ b='\307\262' # U+01F2
+ c='\307\263' # U+01F3
+ ;;
+ 8)
+ a='\315\205' # U+0345
+ b='\316\231' # U+0399
+ c='\316\271' # U+03B9
+ ;;
+ 9)
+ a='\316\243' # U+03A3
+ b='\317\202' # U+03C2
+ c='\317\203' # U+03C3
+ ;;
+ 10)
+ a='\316\222' # U+0392
+ b='\316\262' # U+03B2
+ c='\317\220' # U+03D0
+ ;;
+ 11)
+ a='\316\230' # U+0398
+ b='\316\270' # U+03B8
+ c='\317\221' # U+03D1
+ ;;
+ 12)
+ a='\316\246' # U+03A6
+ b='\317\206' # U+03C6
+ c='\317\225' # U+03D5
+ ;;
+ 13)
+ a='\316\240' # U+03A0
+ b='\317\200' # U+03C0
+ c='\317\226' # U+03D6
+ ;;
+ 14)
+ a='\316\232' # U+039A
+ b='\316\272' # U+03BA
+ c='\317\260' # U+03F0
+ ;;
+ 15)
+ a='\316\241' # U+03A1
+ b='\317\201' # U+03C1
+ c='\317\261' # U+03F1
+ ;;
+ 16)
+ a='\316\230' # U+0398
+ b='\316\270' # U+03B8
+ c='\317\264' # U+03F4
+ ;;
+ 17)
+ a='\316\225' # U+0395
+ b='\316\265' # U+03B5
+ c='\317\265' # U+03F5
+ ;;
+ 18)
+ a='\341\271\240' # U+1E60
+ b='\341\271\241' # U+1E61
+ c='\341\272\233' # U+1E9B
+ ;;
+ 19)
+ a='\303\237' # U+00DF
+ b='\303\237' # U+00DF
+ c='\341\272\236' # U+1E9E
+ ;;
+ 20)
+ a='\316\231' # U+0399
+ b='\316\271' # U+03B9
+ c='\341\276\276' # U+1FBE
+ ;;
+ 21)
+ a='\316\251' # U+03A9
+ b='\317\211' # U+03C9
+ c='\342\204\246' # U+2126
+ ;;
+ 22)
+ a='\113' # U+004B
+ b='\153' # U+006B
+ c='\342\204\252' # U+212A
+ ;;
+ 23)
+ a='\303\205' # U+00C5
+ b='\303\245' # U+00E5
+ c='\342\204\253' # U+212B
+ ;;
+ 24)
+ a='\316\243' # U+03A3
+ b='\317\203' # U+03C3
+ c='\317\262' # U+03F2
+ ;;
+ esac
-grep -i "$pattern" in >out || fail=1
-compare in out || fail=1
+ printf "$a\\n$b\\n$c\\n" >in || framework_failure_
+ for pattern in "$a" "$b" "$c"; do
+ pat=$(printf "$pattern\\n") || framework_failure_
+ grep -i "\\(\\)\\1$pat" in >out-regex || fail=1
+ grep -i "$pat" in >out-dfa || fail=1
+ compare_ out-regex out-dfa || fail=1
+ done
+done
-pattern="($pattern)\\1"
-grep -Ei "$pattern" in >out || fail=1
-compare in out || fail=1
+# Try a unibyte test with ISO 8859-7, if available.
+if test "$(get-mb-cur-max el_GR.iso88597)" -eq 1; then
+ LC_ALL=el_GR.iso88597
+ export LC_ALL
+
+ a='\323' # SIGMA
+ b='\362' # stigma
+ c='\363' # sigma
+
+ printf "$a\\n$b\\n$c\\n" >in || framework_failure_
+ for pattern in "$a" "$b" "$c"; do
+ pat=$(printf "$pattern\\n") || framework_failure_
+ grep -i "\\(\\)\\1$pat" in >out-regex || fail=1
+ grep -i "$pat" in >out-dfa || fail=1
+ compare_ out-regex out-dfa || fail=1
+ done
+fi
Exit $fail
--
1.8.5.3