From 3d0a031af562171915d76f0f62181e3b3d2b8510 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 25 Oct 2014 01:46:01 +0900 Subject: [PATCH] dfa: support for \w and \W in multibyte locale Reported by Jaroslav Skarvada in: http://bugs.gnu.org/18817 Now, \w and \W are supported not only in single-byte locales but also in multibyte locales. * src/dfa.c (lex): Add support for \w and \W in multibyte locale. * tests/word-multibyte: New test for this change. * tests/Makefile.am: Add a rule to build new test. --- src/dfa.c | 61 ++++++++++++++++++++++++++++++++++------------------ tests/Makefile.am | 1 + tests/word-multibyte | 23 ++++++++++++++++++++ 3 files changed, 64 insertions(+), 21 deletions(-) create mode 100644 tests/word-multibyte diff --git a/src/dfa.c b/src/dfa.c index 80510a8..ea48d75 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1243,6 +1243,20 @@ parse_bracket_exp (void) return CSET + charclass_index (ccl); } +#define PUSH_LEX_STATE(s) \ + do \ + { \ + char const *lexptr_saved = lexptr; \ + size_t lexleft_saved = lexleft; \ + lexptr = (s); \ + lexleft = strlen (lexptr) + +#define POP_LEX_STATE() \ + lexptr = lexptr_saved; \ + lexleft = lexleft_saved; \ + } \ + while (0) + static token lex (void) { @@ -1490,20 +1504,6 @@ lex (void) return lasttok = CSET + charclass_index (ccl); } -#define PUSH_LEX_STATE(s) \ - do \ - { \ - char const *lexptr_saved = lexptr; \ - size_t lexleft_saved = lexleft; \ - lexptr = (s); \ - lexleft = strlen (lexptr) - -#define POP_LEX_STATE() \ - lexptr = lexptr_saved; \ - lexleft = lexleft_saved; \ - } \ - while (0) - /* FIXME: see if optimizing this, as is done with ANYCHAR and add_utf8_anychar, makes sense. 
*/ @@ -1523,14 +1523,33 @@ lex (void) case 'W': if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - zeroset (ccl); - for (c2 = 0; c2 < NOTCHAR; ++c2) - if (IS_WORD_CONSTITUENT (c2)) - setbit (c2, ccl); - if (c == 'W') - notset (ccl); + + if (!dfa->multibyte) + { + zeroset (ccl); + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (IS_WORD_CONSTITUENT (c2)) + setbit (c2, ccl); + if (c == 'W') + notset (ccl); + laststart = false; + return lasttok = CSET + charclass_index (ccl); + } + + /* FIXME: see if optimizing this, as is done with ANYCHAR and + add_utf8_anychar, makes sense. */ + + /* \w and \W are documented to be equivalent to [_[:alnum:]] and + [^_[:alnum:]] respectively, so tell the lexer to process those + strings, each minus its "already processed" '['. */ + PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); + + lasttok = parse_bracket_exp (); + + POP_LEX_STATE (); + laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok; case '[': if (backslash) diff --git a/tests/Makefile.am b/tests/Makefile.am index c298835..1e2cffd 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -113,6 +113,7 @@ TESTS = \ warn-char-classes \ word-delim-multibyte \ word-multi-file \ + word-multibyte \ yesno EXTRA_DIST = \ diff --git a/tests/word-multibyte b/tests/word-multibyte new file mode 100644 index 0000000..e067a37 --- /dev/null +++ b/tests/word-multibyte @@ -0,0 +1,23 @@ +#!/bin/sh +# This would fail for grep-2.20 +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ + +printf '\xc3\xa1\n' > in || framework_failure_ +LC_ALL=en_US.UTF-8 +export LC_ALL + +fail=0 + +for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do + out=out1-$LOC + LC_ALL=$LOC grep '\w' in >$out || fail=1 + compare in $out || fail=1 + + out=out2-$LOC + LC_ALL=$LOC grep '\W' in >$out && fail=1 + compare /dev/null $out || fail=1 +done + +Exit $fail -- 2.1.1