From fce643886981ab14c1d4c8fd8f0f4d33f57c5ef9 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Sun, 27 Nov 2016 15:31:35 -0800
Subject: [PATCH] grep: avoid false matches in non-UTF8 multibyte locales

* gnulib: Update to latest, for the dfa.c fix.
* NEWS (Bug fixes): Mention it.
* tests/false-match-mb-non-utf8: New file, with tests for this.
Based on tests from Stephane Chazelas.
* tests/Makefile.am (TESTS): Add it.
This bug was introduced by commit v2.18-54-g3ef4c8e, a change that made
grep use its DFA matcher more aggressively.  The malfunction arises only
with the DFA matcher, not with regex.
Reported by Stephane Chazelas in https://bugs.gnu.org/24975
---
 NEWS                          |  7 +++++++
 gnulib                        |  2 +-
 tests/Makefile.am             |  1 +
 tests/false-match-mb-non-utf8 | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100755 tests/false-match-mb-non-utf8

diff --git a/NEWS b/NEWS
index bd1a201..971cbd9 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,13 @@ GNU grep NEWS                                    -*- outline -*-

 ** Bug fixes

+  grep no longer reports a false match in a multibyte, non-UTF8 locale
+  like zh_CN.gb18030, with a regular expression like ".*7" that just
+  happens to match the 4-byte representation of gb18030's \uC9, the
+  final byte of which is the digit "7".  This "fix" is to make grep
+  always use the slower regex matcher in such locales.
+  [bug introduced in grep-2.19]
+
   grep by default now reads all of standard input if it is a pipe,
   even if this cannot affect grep's output or exit status.  This works
   better with nonportable scripts that run "PROGRAM | grep PATTERN
diff --git a/gnulib b/gnulib
index 60e8ffc..bd6d66e 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit 60e8ffca02dd4eac3a87b744f4f9ef68f3dffa35
+Subproject commit bd6d66e502786df21d2dcaa7b473ee851f840aaa
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 56e860f..442e85a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -94,6 +94,7 @@ TESTS =						\
   equiv-classes					\
   ere						\
   euc-mb					\
+  false-match-mb-non-utf8			\
   fedora					\
   fgrep-infloop					\
   file						\
diff --git a/tests/false-match-mb-non-utf8 b/tests/false-match-mb-non-utf8
new file mode 100755
index 0000000..6dfd10a
--- /dev/null
+++ b/tests/false-match-mb-non-utf8
@@ -0,0 +1,38 @@
+#! /bin/sh
+# Test for false matches in grep 2.19..2.26 in multibyte, non-UTF8 locales
+#
+# Copyright (C) 2016 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+# Add "." to PATH for the use of get-mb-cur-max.
+path_prepend_ .
+
+fail=0
+
+loc=zh_CN.gb18030
+test "$(get-mb-cur-max $loc)" = 4 || skip_ "no support for the $loc locale"
+
+# This must not match: the input is a single character, \uC9, followed by
+# a newline.  It is four bytes long (octal 201, "0", octal 207, "7"), and
+# since its last byte is the digit 7, grep's DFA matcher would mistakenly
+# report that ".*7" matches the line.  The variants must not match either.
+printf '\2010\2077\n' > in || framework_failure_
+LC_ALL=$loc returns_ 1 grep -E '.*7' in || fail=1
+LC_ALL=$loc returns_ 1 grep -E '.{0,1}7' in || fail=1
+LC_ALL=$loc returns_ 1 grep -E '.?7' in || fail=1
+
+# Similar for the \ue9 code point, which ends in an "m" byte.  If this
+# locale is missing, skip only when nothing failed above; an unconditional
+# skip_ here would hide an earlier fail=1.
+loc=zh_HK.big5hkscs
+test "$(get-mb-cur-max $loc)" = 2 \
+  || { test $fail = 0 && skip_ "no support for the $loc locale"; Exit $fail; }
+printf '\210m\n' > in || framework_failure_
+LC_ALL=$loc returns_ 1 grep '.*m' in || fail=1
+
+Exit $fail
-- 
2.9.3