>From 859b9496860e67d01e32a58e9f2a098410775a22 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 28 Jan 2014 13:47:47 -0800 Subject: [PATCH] Simplify handling of letter case. * src/dfa.c (setbit_wc, setbit_case_fold_c, atom): Simplify. (xor_other, xor_wother, to_other, to_wother): New functions. (setbit_case_fold_c, parse_bracket_exp, lex, atom): Use them to invoke tolower and toupper instead of isalpha followed by one or the other, and similarly for towlower, towupper, iswalpha. This should lead to more-efficient handling of caseless letters, and it simplifies the code. --- src/dfa.c | 130 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 81 insertions(+), 49 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index b79c604..af830a8 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -662,6 +662,42 @@ wchar_context (wint_t wc) return CTX_NONE; } +/* The following functions exploit the commutativity and associativity of ^, + and the fact that X ^ X is zero. POSIX requires that C equals + either tolower (C) or toupper (C); if the former, then C ^ tolower (C) + is zero so C ^ xor_other (C) equals toupper (C), and similarly + for the latter. */ + +/* Return the exclusive-OR of C and C's other case, or zero if C is + not a letter that changes case. */ + +static int +xor_other (int c) +{ + return tolower (c) ^ toupper (c); +} + +static wint_t +xor_wother (wint_t c) +{ + return towlower (c) ^ towupper (c); +} + +/* If C is a lowercase letter, return its uppercase version, and vice versa. + Return C if it's not a letter that changes case. */ + +static int +to_other (int c) +{ + return c ^ xor_other (c); +} + +static wint_t +to_wother (wint_t c) +{ + return c ^ xor_wother (c); +} + /* Entry point to set syntax options. */ void dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) @@ -693,39 +729,24 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) this may happen when folding case in weird Turkish locales where dotless i/dotted I are not included in the chosen character set. Return whether a bit was set in the charclass. */ -#if MBS_SUPPORT static bool setbit_wc (wint_t wc, charclass c) { +#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -} - -/* Set a bit in the charclass for the given single byte character, - if it is valid in the current character set. */ -static void -setbit_c (int b, charclass c) -{ - /* Do nothing if b is invalid in this character set. */ - if (MB_CUR_MAX > 1 && btowc (b) == WEOF) - return; - setbit (b, c); -} #else -# define setbit_c setbit -static inline bool -setbit_wc (wint_t wc, charclass c) -{ abort (); /*NOTREACHED*/ return false; -} #endif +} -/* Like setbit_c, but if case is folded, set both cases of a letter. For +/* Like setbit_wc but for a single-byte character B; and if case is + folded, set both cases of a letter. For MB_CUR_MAX > 1, the resulting charset is only used as an optimization, and the caller takes care of setting the appropriate field of struct mb_char_classes. */ @@ -737,16 +758,16 @@ setbit_case_fold_c (int b, charclass c) wint_t wc = btowc (b); if (wc == WEOF) return; - setbit (b, c); - if (case_fold && iswalpha (wc)) - setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c); + if (case_fold) + setbit_wc (to_wother (wc), c); } else { - setbit (b, c); - if (case_fold && isalpha (b)) - setbit_c (isupper (b) ? tolower (b) : toupper (b), c); + if (case_fold) + setbit (to_other (b), c); } + + setbit (b, c); } @@ -1085,23 +1106,30 @@ parse_bracket_exp (void) { /* When case folding map a range, say [m-z] (or even [M-z]) to the pair of ranges, [m-z] [M-Z]. */ + wchar_t lo1 = wc, hi1 = wc2, lo2 = wc, hi2 = wc2; + if (case_fold) + { + lo1 = towlower (lo1); + hi1 = towlower (hi1); + lo2 = towupper (lo2); + hi2 = towupper (hi2); + } + REALLOC_IF_NECESSARY (work_mbc->range_sts, range_sts_al, work_mbc->nranges + 1); REALLOC_IF_NECESSARY (work_mbc->range_ends, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = - case_fold ? towlower (wc) : (wchar_t) wc; - work_mbc->range_ends[work_mbc->nranges++] = - case_fold ? towlower (wc2) : (wchar_t) wc2; + work_mbc->range_sts[work_mbc->nranges] = lo1; + work_mbc->range_ends[work_mbc->nranges++] = hi1; - if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + if (lo1 != lo2 || hi1 != hi2) { REALLOC_IF_NECESSARY (work_mbc->range_sts, range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = towupper (wc); + work_mbc->range_sts[work_mbc->nranges] = lo2; REALLOC_IF_NECESSARY (work_mbc->range_ends, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); + work_mbc->range_ends[work_mbc->nranges++] = hi2; } } else @@ -1129,16 +1157,19 @@ parse_bracket_exp (void) continue; } - if (case_fold && iswalpha (wc)) + if (case_fold) { - wc = towlower (wc); - if (!setbit_wc (wc, ccl)) + wchar_t xor = xor_wother (wc); + if (xor) { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = wc; + wchar_t other = wc ^ xor; + if (!setbit_wc (other, ccl)) + { + REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, + work_mbc->nchars + 1); + work_mbc->chars[work_mbc->nchars++] = other; + } } - wc = towupper (wc); } if (!setbit_wc (wc, ccl)) { @@ -1481,7 +1512,7 @@ lex (void) if (MB_CUR_MAX > 1) return lasttok = WCHAR; - if (case_fold && isalpha (c)) + if (case_fold && tolower (c) != toupper (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); @@ -1725,17 +1756,18 @@ add_utf8_anychar (void) static void atom (void) { - if (0) - { - /* empty */ - } - else if (MBS_SUPPORT && tok == WCHAR) + if (MBS_SUPPORT && tok == WCHAR) { - addtok_wc (case_fold ? towlower (wctok) : wctok); - if (case_fold && iswalpha (wctok)) + wchar_t wc = wctok; + addtok_wc (wc); + if (case_fold) { - addtok_wc (towupper (wctok)); - addtok (OR); + wchar_t xor = xor_wother (wc); + if (xor) + { + addtok_wc (wc ^ xor); + addtok (OR); + } } tok = lex (); -- 1.8.5.3