>From 6afba02d7869d39ed7f61981045ddbdcb2814101 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Tue, 21 Mar 2017 19:05:17 -0700
Subject: [PATCH] dfa: make [0-9] faster in non-C locales

Problem reported by John P. Linderman (Bug#26193).
* lib/dfa.c (parse_bracket_exp): Remove redundant assignment.
If both ends of the range are ASCII digits, do not worry about
multi-character collating sequences and the like. Be consistent
about using isalpha as a precondition for setbit_case_fold_c.
---
 ChangeLog | 9 +++++++++
 lib/dfa.c | 35 ++++++++++++-----------------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 625f007..33bd6a1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-03-21 Paul Eggert
+
+	dfa: make [0-9] faster in non-C locales
+	Problem reported by John P. Linderman (Bug#26193).
+	* lib/dfa.c (parse_bracket_exp): Remove redundant assignment.
+	If both ends of the range are ASCII digits, do not worry about
+	multi-character collating sequences and the like. Be consistent
+	about using isalpha as a precondition for setbit_case_fold_c.
+
 2017-03-19 Bruno Haible
 
 	lock: Fix compilation error with HP-UX IA64 cc.
diff --git a/lib/dfa.c b/lib/dfa.c
index 5bac288..e97dae1c 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -551,8 +551,9 @@ struct dfa
                     bool, size_t *, bool *);
 
   /* The locale is simple, like the C locale. These locales can be
-     processed more efficiently, e.g., the relationship between lower-
-     and upper-case letters is 1-1. */
+     processed more efficiently, as they are single-byte, their native
+     character set is in collating-sequence order, and they do not
+     have multi-character collating elements. */
   bool simple_locale;
 
   /* Other cached information derived from the locale. */
@@ -1012,7 +1013,6 @@ parse_bracket_exp (struct dfa *dfa)
   if (invert)
     {
       c = bracket_fetch_wc (dfa);
-      invert = true;
       known_bracket_exp = dfa->simple_locale;
     }
   wint_t wc = dfa->lex.wctok;
@@ -1143,24 +1143,14 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y. */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->localeinfo.multibyte)
-                    known_bracket_exp = false;
-                  else if (dfa->simple_locale)
+                  if (dfa->simple_locale
+                      || (isasciidigit (c) & isasciidigit (c2)))
                     {
-                      int ci;
-                      for (ci = c; ci <= c2; ci++)
-                        setbit (ci, &ccl);
-                      if (dfa->syntax.case_fold)
-                        {
-                          int uc = toupper (c);
-                          int uc2 = toupper (c2);
-                          for (ci = 0; ci < NOTCHAR; ci++)
-                            {
-                              int uci = toupper (ci);
-                              if (uc <= uci && uci <= uc2)
-                                setbit (ci, &ccl);
-                            }
-                        }
+                      for (int ci = c; ci <= c2; ci++)
+                        if (dfa->syntax.case_fold && isalpha (ci))
+                          setbit_case_fold_c (ci, &ccl);
+                        else
+                          setbit (ci, &ccl);
                     }
                   else
                     known_bracket_exp = false;
@@ -1174,7 +1164,7 @@ parse_bracket_exp (struct dfa *dfa)
 
       if (!dfa->localeinfo.multibyte)
         {
-          if (dfa->syntax.case_fold)
+          if (dfa->syntax.case_fold && isalpha (c))
             setbit_case_fold_c (c, &ccl);
           else
             setbit (c, &ccl);
@@ -1209,7 +1199,7 @@ parse_bracket_exp (struct dfa *dfa)
   if (! known_bracket_exp)
     return BACKREF;
 
-  if (dfa->localeinfo.multibyte)
+  if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
     {
       dfa->lex.brack.invert = invert;
       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
@@ -1218,7 +1208,6 @@ parse_bracket_exp (struct dfa *dfa)
 
   if (invert)
     {
-      assert (!dfa->localeinfo.multibyte);
       notset (&ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit ('\n', &ccl);
-- 
2.7.4