grep branch, master, updated. v2.20-70-gf66dafc

grep-commit

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.20-70-gf66dafc

From:	Jim Meyering
Subject:	grep branch, master, updated. v2.20-70-gf66dafc
Date:	Wed, 29 Oct 2014 03:50:52 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87 (commit)
      from  1519c4e5e4bf68ec348bfe4261f78768710aa985 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87


commit f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87
Author: Norihiro Tanaka <address@hidden>
Date:   Sat Oct 25 01:46:01 2014 +0900

    dfa: make \w and \W work in multibyte locales
    
    Reported by Jaroslav Skarvada in: http://bugs.gnu.org/18817
    Now, \w and \W are supported not only in single-byte locales but also
    in multibyte locales.
    
    * src/dfa.c (PUSH_LEX_STATE, POP_LEX_STATE): Move definitions "up",
    so they are not within the function.
    (lex): Make \w and \W work in a multibyte locale, the same way
    we made \s and \S work.
    * tests/word-multibyte: New test for this change.
    * tests/Makefile.am (TESTS): Add the new test.
    * NEWS (Bug fixes): Mention it.

diff --git a/NEWS b/NEWS
index 94eeeeb..183b7f0 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,9 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep no longer mishandles patterns that contain \w or \W in multibyte
+  locales.
+
   grep would fail to count newlines internally when operating in non-UTF8
   multibyte locales, leading it to print potentially many lines that did
   not match.  E.g., the command, "seq 10 | env LC_ALL=zh_CN src/grep -n .."
diff --git a/src/dfa.c b/src/dfa.c
index 5b9d154..e0fc120 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1249,6 +1249,20 @@ parse_bracket_exp (void)
   return CSET + charclass_index (ccl);
 }
 
+#define PUSH_LEX_STATE(s)                      \
+  do                                           \
+    {                                          \
+      char const *lexptr_saved = lexptr;       \
+      size_t lexleft_saved = lexleft;          \
+      lexptr = (s);                            \
+      lexleft = strlen (lexptr)
+
+#define POP_LEX_STATE()                                \
+      lexptr = lexptr_saved;                   \
+      lexleft = lexleft_saved;                 \
+    }                                          \
+  while (0)
+
 static token
 lex (void)
 {
@@ -1496,20 +1510,6 @@ lex (void)
               return lasttok = CSET + charclass_index (ccl);
             }
 
-#define PUSH_LEX_STATE(s)                      \
-  do                                           \
-    {                                          \
-      char const *lexptr_saved = lexptr;       \
-      size_t lexleft_saved = lexleft;          \
-      lexptr = (s);                            \
-      lexleft = strlen (lexptr)
-
-#define POP_LEX_STATE()                                \
-      lexptr = lexptr_saved;                   \
-      lexleft = lexleft_saved;                 \
-    }                                          \
-  while (0)
-
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
              add_utf8_anychar, makes sense.  */
 
@@ -1529,14 +1529,33 @@ lex (void)
         case 'W':
           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          zeroset (ccl);
-          for (c2 = 0; c2 < NOTCHAR; ++c2)
-            if (IS_WORD_CONSTITUENT (c2))
-              setbit (c2, ccl);
-          if (c == 'W')
-            notset (ccl);
+
+          if (!dfa->multibyte)
+            {
+              zeroset (ccl);
+              for (c2 = 0; c2 < NOTCHAR; ++c2)
+                if (IS_WORD_CONSTITUENT (c2))
+                  setbit (c2, ccl);
+              if (c == 'W')
+                notset (ccl);
+              laststart = false;
+              return lasttok = CSET + charclass_index (ccl);
+            }
+
+          /* FIXME: see if optimizing this, as is done with ANYCHAR and
+             add_utf8_anychar, makes sense.  */
+
+          /* \w and \W are documented to be equivalent to [_[:alnum:]] and
+             [^_[:alnum:]] respectively, so tell the lexer to process those
+             strings, each minus its "already processed" '['.  */
+          PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
+
+          lasttok = parse_bracket_exp ();
+
+          POP_LEX_STATE ();
+
           laststart = false;
-          return lasttok = CSET + charclass_index (ccl);
+          return lasttok;
 
         case '[':
           if (backslash)
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f6f051c..c006e58 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -114,6 +114,7 @@ TESTS =                                             \
   warn-char-classes                            \
   word-delim-multibyte                         \
   word-multi-file                              \
+  word-multibyte                               \
   yesno
 
 EXTRA_DIST =                                   \
diff --git a/tests/word-multibyte b/tests/word-multibyte
new file mode 100644
index 0000000..e067a37
--- /dev/null
+++ b/tests/word-multibyte
@@ -0,0 +1,23 @@
+#!/bin/sh
+# This would fail for grep-2.20
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+
+printf '\xc3\xa1\n' > in || framework_failure_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+fail=0
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+  out=out1-$LOC
+  LC_ALL=$LOC grep '\w' in >$out || fail=1
+  compare in $out || fail=1
+
+  out=out2-$LOC
+  LC_ALL=$LOC grep '\W' in >$out && fail=1
+  compare /dev/null $out || fail=1
+done
+
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS                 |    3 ++
 src/dfa.c            |   61 ++++++++++++++++++++++++++++++++-----------------
 tests/Makefile.am    |    1 +
 tests/word-multibyte |   23 ++++++++++++++++++
 4 files changed, 67 insertions(+), 21 deletions(-)
 create mode 100644 tests/word-multibyte


hooks/post-receive
-- 
grep

[Prev in Thread]

Current Thread

[Next in Thread]

grep branch, master, updated. v2.20-70-gf66dafc, Jim Meyering <=

Prev by Date: grep branch, master, updated. v2.20-69-g1519c4e
Next by Date: grep branch, master, updated. v2.20-71-g834f42b
Previous by thread: grep branch, master, updated. v2.20-69-g1519c4e
Next by thread: grep branch, master, updated. v2.20-71-g834f42b
Index(es):
- Date
- Thread