From df950021df03d33f06e54505e3e68be5989d72a6 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 7 Mar 2014 18:27:28 -0800 Subject: [PATCH] grep: fix case-fold mismatches between DFA and regex The DFA code and the regex code didn't use the same semantics for case-folding. The regex code says that the data char d matches the pattern char p if uc (d) == uc (p). POSIX is unclear in this area; the simplest fix for now is to change the DFA code to agree with the regex code. See . * src/dfa.c (static_assert): New macro, if not already defined. (setbit_case_fold_c): Assume MB_CUR_MAX is 1 and that case_fold is nonzero; all callers changed. (setbit_case_fold_c, parse_bracket_exp, lex, atom): Case-fold like the regex code does. (lonesome_lower): New constant. (case_folded_counterparts): New function. (parse_bracket_exp): Prefer plain setbit when case-folding is not needed. * src/dfa.h (CASE_FOLDED_BUFSIZE): New constant. (case_folded_counterparts): New function decl. * src/main.c (trivial_case_ignore): Case-fold like the regex code does. (main): Try to improve comment re trivial_case_ignore. * tests/case-fold-titlecase: Add lots more test cases. --- src/dfa.c | 143 ++++++++++++++++++++++++--------------- src/dfa.h | 8 +++ src/main.c | 57 ++++++++-------- tests/case-fold-titlecase | 167 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 282 insertions(+), 93 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 585a599..5910268 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -34,6 +34,12 @@ #include #include +/* Gawk doesn't use Gnulib, so don't assume static_assert is present. */ +#ifndef static_assert +# define static_assert(cond, diagnostic) \ + extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] +#endif + #define STREQ(a, b) (strcmp (a, b) == 0) /* ISASCIIDIGIT differs from isdigit, as follows: @@ -710,34 +716,16 @@ setbit_wc (wint_t wc, charclass c) #endif } -/* Set a bit for B in the charclass C, if B is a valid single byte - character in the current character set. If case is folded, set B's - lower and upper case variants similarly. If MB_CUR_MAX > 1, the - resulting charset is used only as an optimization, and the caller - should set the appropriate field of struct mb_char_classes. */ +/* Set a bit for B and its case variants in the charclass C. + MB_CUR_MAX must be 1. */ static void setbit_case_fold_c (int b, charclass c) { - if (MB_CUR_MAX > 1) - { - wint_t wc = btowc (b); - if (wc == WEOF) - return; - if (case_fold) - { - setbit_wc (towlower (wc), c); - setbit_wc (towupper (wc), c); - } - } - else - { - if (case_fold) - { - setbit (tolower (b), c); - setbit (toupper (b), c); - } - } - setbit (b, c); + int ub = toupper (b); + int i; + for (i = 0; i < NOTCHAR; i++) + if (toupper (i) == ub) + setbit (i, c); } @@ -898,6 +886,50 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE, + }; + +static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2 + == CASE_FOLDED_BUFSIZE), + "CASE_FOLDED_BUFSIZE is wrong"); + +/* Find the characters equal to C after case-folding, other than C + itself, and store them into FOLDED. Return the number of characters + stored. */ +int +case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) +{ + int i; + int n = 0; + wint_t uc = towupper (c); + wint_t lc = towlower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && towupper (lc) == uc) + folded[n++] = lc; + for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && towupper (li) == uc) + folded[n++] = li; + } + return n; +} + typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1058,7 +1090,7 @@ parse_bracket_exp (void) for (c2 = 0; c2 < NOTCHAR; ++c2) if (pred->func (c2)) - setbit_case_fold_c (c2, ccl); + setbit (c2, ccl); } else known_bracket_exp = false; @@ -1125,8 +1157,21 @@ parse_bracket_exp (void) } } else if (using_simple_locale ()) - for (; c <= c2; c++) - setbit_case_fold_c (c, ccl); + { + for (c1 = c; c1 <= c2; c1++) + setbit (c1, ccl); + if (case_fold) + { + int uc = toupper (c); + int uc2 = toupper (c2); + for (c1 = 0; c1 < NOTCHAR; c1++) + { + int uc1 = toupper (c1); + if (uc <= uc1 && uc1 <= uc2) + setbit (c1, ccl); + } + } + } else known_bracket_exp = false; @@ -1145,26 +1190,22 @@ parse_bracket_exp (void) if (MB_CUR_MAX == 1) { - setbit_case_fold_c (c, ccl); + if (case_fold) + setbit_case_fold_c (c, ccl); + else + setbit (c, ccl); continue; } if (case_fold) { - wint_t folded = towlower (wc); - if (folded != wc && !setbit_wc (folded, ccl)) - { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = folded; - } - folded = towupper (wc); - if (folded != wc && !setbit_wc (folded, ccl)) - { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = folded; - } + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wc, folded); + REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, + work_mbc->nchars + n); + for (i = 0; i < n; i++) + if (!setbit_wc (folded[i], ccl)) + work_mbc->chars[work_mbc->nchars++] = folded[i]; } if (!setbit_wc (wc, ccl)) { @@ -1510,7 +1551,7 @@ lex (void) if (MB_CUR_MAX > 1) return lasttok = WCHAR; - if (case_fold && (tolower (c) != c || toupper (c) != c)) + if (case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); @@ -1757,18 +1798,14 @@ atom (void) if (MBS_SUPPORT && tok == WCHAR) { addtok_wc (wctok); + if (case_fold) { - wint_t folded = towlower (wctok); - if (folded != wctok) - { - addtok_wc (folded); - addtok (OR); - } - folded = towupper (wctok); - if (folded != wctok) + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wctok, folded); + for (i = 0; i < n; i++) { - addtok_wc (folded); + addtok_wc (folded[i]); addtok (OR); } } diff --git a/src/dfa.h b/src/dfa.h index 7e0674f..ad2b854 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -101,3 +101,11 @@ extern void dfawarn (const char *); extern _Noreturn void dfaerror (const char *); extern int using_utf8 (void); + +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is 1 for towupper, 1 for towlower, and 1 for each + entry in LONESOME_LOWER; see dfa.c. */ +enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 }; + +int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); diff --git a/src/main.c b/src/main.c index 808e47a..4f8f1ef 100644 --- a/src/main.c +++ b/src/main.c @@ -1892,11 +1892,12 @@ trivial_case_ignore (size_t len, char const *keys, return false; /* Worst case is that each byte B of KEYS is ASCII alphabetic and - the two other_case(B) characters, C and D, each occupies - MB_CUR_MAX bytes, so each B maps to [BCD], which requires 2 * - MB_CUR_MAX + 3 bytes; this is bounded above by the constant - expression 2 * MB_LEN_MAX + 3. */ - *new_keys = xnmalloc (len + 1, 2 * MB_LEN_MAX + 3); + CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each + occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which + requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is + bounded above by the constant expression CASE_FOLDED_BUFSIZE * + MB_LEN_MAX + 3. */ + *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3); char *p = *new_keys; mbstate_t mb_state = { 0 }; @@ -1918,9 +1919,9 @@ trivial_case_ignore (size_t len, char const *keys, keys += n; len -= n; - wint_t lc = towlower (wc); - wint_t uc = towupper (wc); - if (lc == wc && uc == wc) + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int nfolded = case_folded_counterparts (wc, folded); + if (nfolded <= 0) { memcpy (p, orig, n); p += n; @@ -1933,20 +1934,18 @@ trivial_case_ignore (size_t len, char const *keys, memcpy (p, orig, n); p += n; - if (lc != wc) - { - size_t lcbytes = WCRTOMB (p, lc, &mb_state); - if (lcbytes == (size_t) -1) - goto skip_case_ignore_optimization; - p += lcbytes; - } - if (uc != wc) + int i = 0; + do { - size_t ucbytes = WCRTOMB (p, uc, &mb_state); - if (ucbytes == (size_t) -1 || ! mbsinit (&mb_state)) + size_t nbytes = WCRTOMB (p, folded[i], &mb_state); + if (nbytes == (size_t) -1) goto skip_case_ignore_optimization; - p += ucbytes; + p += nbytes; } + while (++i < nfolded); + + if (! mbsinit (&mb_state)) + goto skip_case_ignore_optimization; *p++ = ']'; } @@ -2351,16 +2350,16 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); - /* As currently implemented, case-insensitive matching is expensive in - multi-byte locales because of a few outlier locales in which some - characters change size when converted to upper or lower case. To - accommodate those, we revert to searching the input one line at a - time, rather than using the much more efficient buffer search. - However, if we have a regular expression, /foo/i, we can convert - it to an equivalent case-insensitive /[fF][oO][oO]/, and thus - avoid the expensive read-and-process-a-line-at-a-time requirement. - Optimize-away the "-i" option, when possible, converting each - candidate alpha, C, in the regexp to [Cc]. */ + /* Case-insensitive matching is expensive in multibyte locales + because a few characters may change size when converted to upper + or lower case. To accommodate those, search the input one line + at a time, rather than using the much more efficient buffer search. + + Try to convert a regular expression 'foo' (ignoring case) to an + equivalent regular expression '[fF][oO][oO]' (where case matters). + Not only does this avoid the expensive requirement to read and + process a line at a time, it also allows use of the kwset engine, + a win in non-UTF-8 multibyte locales. */ if (match_icase) { size_t new_keycc; diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase index 0ece5c8..f16022b 100755 --- a/tests/case-fold-titlecase +++ b/tests/case-fold-titlecase @@ -1,5 +1,5 @@ #!/bin/sh -# Check that case folding works even with titlecase characters. +# Check that case folding works even with titlecase and similarly odd chars. # Copyright 2014 Free Software Foundation, Inc. @@ -25,17 +25,162 @@ export LC_ALL fail=0 -LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ -Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J -lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ -pattern=$(printf "$Lj\n") || framework_failure_ -printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_ +for testcase in \ + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 +do + case $testcase in + 0) + a='\302\265' # U+00B5 + b='\316\234' # U+039C + c='\316\274' # U+03BC + ;; + 1) + a='\111' # U+0049 + b='\151' # U+0069 + c='\304\260' # U+0130 + ;; + 2) + a='\111' # U+0049 + b='\151' # U+0069 + c='\304\261' # U+0131 + ;; + 3) + a='\123' # U+0053 + b='\163' # U+0073 + c='\305\277' # U+017F + ;; + 4) + a='\307\204' # U+01C4 + b='\307\205' # U+01C5 + c='\307\206' # U+01C6 + ;; + 5) + a='\307\207' # U+01C7 + b='\307\210' # U+01C8 + c='\307\211' # U+01C9 + ;; + 6) + a='\307\212' # U+01CA + b='\307\213' # U+01CB + c='\307\214' # U+01CC + ;; + 7) + a='\307\261' # U+01F1 + b='\307\262' # U+01F2 + c='\307\263' # U+01F3 + ;; + 8) + a='\315\205' # U+0345 + b='\316\231' # U+0399 + c='\316\271' # U+03B9 + ;; + 9) + a='\316\243' # U+03A3 + b='\317\202' # U+03C2 + c='\317\203' # U+03C3 + ;; + 10) + a='\316\222' # U+0392 + b='\316\262' # U+03B2 + c='\317\220' # U+03D0 + ;; + 11) + a='\316\230' # U+0398 + b='\316\270' # U+03B8 + c='\317\221' # U+03D1 + ;; + 12) + a='\316\246' # U+03A6 + b='\317\206' # U+03C6 + c='\317\225' # U+03D5 + ;; + 13) + a='\316\240' # U+03A0 + b='\317\200' # U+03C0 + c='\317\226' # U+03D6 + ;; + 14) + a='\316\232' # U+039A + b='\316\272' # U+03BA + c='\317\260' # U+03F0 + ;; + 15) + a='\316\241' # U+03A1 + b='\317\201' # U+03C1 + c='\317\261' # U+03F1 + ;; + 16) + a='\316\230' # U+0398 + b='\316\270' # U+03B8 + c='\317\264' # U+03F4 + ;; + 17) + a='\316\225' # U+0395 + b='\316\265' # U+03B5 + c='\317\265' # U+03F5 + ;; + 18) + a='\341\271\240' # U+1E60 + b='\341\271\241' # U+1E61 + c='\341\272\233' # U+1E9B + ;; + 19) + a='\303\237' # U+00DF + b='\303\237' # U+00DF + c='\341\272\236' # U+1E9E + ;; + 20) + a='\316\231' # U+0399 + b='\316\271' # U+03B9 + c='\341\276\276' # U+1FBE + ;; + 21) + a='\316\251' # U+03A9 + b='\317\211' # U+03C9 + c='\342\204\246' # U+2126 + ;; + 22) + a='\113' # U+004B + b='\153' # U+006B + c='\342\204\252' # U+212A + ;; + 23) + a='\303\205' # U+00C5 + b='\303\245' # U+00E5 + c='\342\204\253' # U+212B + ;; + 24) + a='\316\243' # U+03A3 + b='\317\203' # U+03C3 + c='\317\262' # U+03F2 + ;; + esac -grep -i "$pattern" in >out || fail=1 -compare in out || fail=1 + printf "$a\\n$b\\n$c\\n" >in || framework_failure_ + for pattern in "$a" "$b" "$c"; do + pat=$(printf "$pattern\\n") || framework_failure_ + grep -i "\\(\\)\\1$pat" in >out-regex || fail=1 + grep -i "$pat" in >out-dfa || fail=1 + compare_ out-regex out-dfa || fail=1 + done +done -pattern="($pattern)\\1" -grep -Ei "$pattern" in >out || fail=1 -compare in out || fail=1 +# Try a unibyte test with ISO 8859-7, if available. +if test "$(get-mb-cur-max el_GR.iso88597)" -eq 1; then + LC_ALL=el_GR.iso88597 + export LC_ALL + + a='\323' # SIGMA + b='\362' # stigma + c='\363' # sigma + + printf "$a\\n$b\\n$c\\n" >in || framework_failure_ + for pattern in "$a" "$b" "$c"; do + pat=$(printf "$pattern\\n") || framework_failure_ + grep -i "\\(\\)\\1$pat" in >out-regex || fail=1 + grep -i "$pat" in >out-dfa || fail=1 + compare_ out-regex out-dfa || fail=1 + done +fi Exit $fail -- 1.8.5.3