From fce643886981ab14c1d4c8fd8f0f4d33f57c5ef9 Mon Sep 17 00:00:00 2001
From: Jim Meyering
Date: Sun, 27 Nov 2016 15:31:35 -0800
Subject: [PATCH] grep: avoid false matches in non-UTF8 multibyte locales

* gnulib: Update to latest, for the dfa.c fix.
* NEWS (Bug fixes): Mention it.
* tests/false-match-mb-non-utf8: New file, with tests for this.
Based on tests from Stephane Chazelas.
* tests/Makefile.am (TESTS): Add it.
Introduced by commit v2.18-54-g3ef4c8e, a change that made grep
use its DFA matcher more aggressively. The malfunction arises only
with the DFA matcher, not with regex.
Reported by Stephane Chazelas in https://bugs.gnu.org/24975
---
Local deviation from the commit named in the From line above: in
tests/false-match-mb-non-utf8, a missing zh_HK.big5hkscs locale now
skips the test only when the earlier zh_CN.gb18030 checks passed;
otherwise the already-recorded failure is reported instead of being
hidden behind a skip.

 NEWS                          |  7 +++++++
 gnulib                        |  2 +-
 tests/Makefile.am             |  1 +
 tests/false-match-mb-non-utf8 | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100755 tests/false-match-mb-non-utf8

diff --git a/NEWS b/NEWS
index bd1a201..971cbd9 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,13 @@ GNU grep NEWS -*- outline -*-
 
 ** Bug fixes
 
+  grep no longer reports a false match in a multibyte, non-UTF8 locale
+  like zh_CN.gb18030, with a regular expression like ".*7" that just
+  happens to match the 4-byte representation of gb18030's \uC9, the
+  final byte of which is the digit "7".  This "fix" is to make grep
+  always use the slower regex matcher in such locales.
+  [bug introduced in grep-2.19]
+
   grep by default now reads all of standard input if it is a pipe,
   even if this cannot affect grep's output or exit status.  This works
   better with nonportable scripts that run "PROGRAM | grep PATTERN
diff --git a/gnulib b/gnulib
index 60e8ffc..bd6d66e 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit 60e8ffca02dd4eac3a87b744f4f9ef68f3dffa35
+Subproject commit bd6d66e502786df21d2dcaa7b473ee851f840aaa
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 56e860f..442e85a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -94,6 +94,7 @@ TESTS =						\
   equiv-classes					\
   ere						\
   euc-mb					\
+  false-match-mb-non-utf8			\
   fedora					\
   fgrep-infloop					\
   file						\
diff --git a/tests/false-match-mb-non-utf8 b/tests/false-match-mb-non-utf8
new file mode 100755
index 0000000..6dfd10a
--- /dev/null
+++ b/tests/false-match-mb-non-utf8
@@ -0,0 +1,38 @@
+#! /bin/sh
+# Test for false matches in grep 2.19..2.26 in multibyte, non-UTF8 locales
+#
+# Copyright (C) 2016 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+# Add "." to PATH for the use of get-mb-cur-max.
+path_prepend_ .
+
+fail=0
+
+loc=zh_CN.gb18030
+test "$(get-mb-cur-max $loc)" = 4 || skip_ "no support for the $loc locale"
+
+# This must not match: the input is a single character, \uC9 followed
+# by a newline.  But it just so happens that that character is made up
+# of four bytes, the last of which is the digit, 7, and grep's DFA
+# matcher would mistakenly report that ".*7" matches that input line.
+printf '\2010\2077\n' > in || framework_failure_
+LC_ALL=$loc returns_ 1 grep -E '.*7' in || fail=1
+
+LC_ALL=$loc returns_ 1 grep -E '.{0,1}7' in || fail=1
+
+LC_ALL=$loc returns_ 1 grep -E '.?7' in || fail=1
+
+# Similar for the \ue9 code point, which ends in an "m" byte.
+loc=zh_HK.big5hkscs
+test "$(get-mb-cur-max $loc)" = 2 || { test $fail = 0 && skip_ "no support for the $loc locale"; Exit $fail; }
+
+printf '\210m\n' > in || framework_failure_
+LC_ALL=$loc returns_ 1 grep '.*m' in || fail=1
+
+Exit $fail
-- 
2.9.3