From 3ce8b39e3137d3cdcf8cec84dc89788037e76742 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Sat, 20 Feb 2016 12:50:27 -0800 Subject: [PATCH] grep -z: avoid erroneous match with regexp anchor and \n in text * src/dfasearch.c (EGexecute): Clear the newline_anchor bit when eolbyte is not '\n'. * tests/z-anchor-newline: New file. * tests/Makefile.am (TESTS): Add it. * NEWS (Bug fixes): Describe it. Originally reported by Ulrich Mueller in https://bugs.gentoo.org/show_bug.cgi?id=574662 Reported to us by Sergei Trofimovich as http://debbugs.gnu.org/22655 --- NEWS | 13 +++++++++++++ src/dfasearch.c | 1 + tests/Makefile.am | 3 ++- tests/z-anchor-newline | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 1 deletion(-) create mode 100755 tests/z-anchor-newline diff --git a/NEWS b/NEWS index feca5c5..ae238be 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,19 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + grep -z would match strings it should not. To trigger the bug, you'd + have to use a regular expression including an anchor (^ or $) and a + feature like a range or a backreference, causing grep to forego its DFA + matcher and resort to using re_search. With a multibyte locale, that + matcher could mistakenly match a string containing a newline. + For example, this command: + printf 'a\nb\0' | LC_ALL=en_US.utf-8 grep -z '^[a-b]*b' + would mistakenly match and print all four input bytes. After the fix, + there is no match, as expected. 
+ [bug introduced in grep-2.7] + * Noteworthy changes in release 2.23 (2016-02-04) [stable] diff --git a/src/dfasearch.c b/src/dfasearch.c index e04a2df..d348d44 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -342,6 +342,7 @@ EGexecute (char *buf, size_t size, size_t *match_size, for (i = 0; i < pcount; i++) { patterns[i].regexbuf.not_eol = 0; + patterns[i].regexbuf.newline_anchor = eolbyte == '\n'; start = re_search (&(patterns[i].regexbuf), beg, end - beg - 1, ptr - beg, end - ptr - 1, diff --git a/tests/Makefile.am b/tests/Makefile.am index a38303c..5a2c0f0 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -141,7 +141,8 @@ TESTS = \ word-delim-multibyte \ word-multi-file \ word-multibyte \ - yesno + yesno \ + z-anchor-newline EXTRA_DIST = \ $(TESTS) \ diff --git a/tests/z-anchor-newline b/tests/z-anchor-newline new file mode 100755 index 0000000..b4dfebc --- /dev/null +++ b/tests/z-anchor-newline @@ -0,0 +1,43 @@ +#!/bin/sh +# grep -z with an anchor in the regex could mistakenly match text +# including a newline. + +# Copyright 2016 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. 
"${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ +require_compiled_in_MB_support +LC_ALL=en_US.UTF-8 +export LC_ALL + +printf 'a\nb\0' > in || framework_failure_ + +fail=0 + +# These three would all mistakenly match, because the [a-b] range +# forced the non-DFA (regexp-using) code path. +returns_ 1 grep -z '^[a-b]*$' in || fail=1 +returns_ 1 grep -z 'a[a-b]*$' in || fail=1 +returns_ 1 grep -z '^[a-b]*b' in || fail=1 + +# Test these for good measure; they exercise the DFA code path +# and always worked. +returns_ 1 grep -z '^[ab]*$' in || fail=1 +returns_ 1 grep -z 'a[ab]*$' in || fail=1 +returns_ 1 grep -z '^[ab]*b' in || fail=1 + +Exit $fail -- 2.6.4