From 25b665c0eb04c8fb68034cc7db1ceea08e625b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Santiago=20Ruano=20Rinc=C3=B3n?= Date: Fri, 13 Dec 2013 07:53:37 -0800 Subject: [PATCH] PCRE: tell grep -P to relax its stance on invalid multibyte chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not exit-2 for invalid UTF-8 characters. Just prior to this change, this command would match no lines and fail like this: $ printf 'j\x82\nj\n'|LC_ALL=en_US.UTF-8 grep -P j|cat -A; echo $? src/grep: invalid UTF-8 byte sequence in input 2 After this change, the same command matches both lines, and succeeds: jM-^B$ j$ 0 * src/pcresearch.c (Pcompile): Use PCRE_NO_UTF8_CHECK, too, and add a comment. * tests/pcre-utf8: Add a test and a comment. Based on a patch by Santiago Ruano Rincón. See http://bugs.gnu.org/15758/ --- src/pcresearch.c | 6 +++++- tests/pcre-utf8 | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/pcresearch.c b/src/pcresearch.c index 9ba1227..43988c6 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -62,7 +62,11 @@ Pcompile (char const *pattern, size_t size) #if defined HAVE_LANGINFO_CODESET if (STREQ (nl_langinfo (CODESET), "UTF-8")) - flags |= PCRE_UTF8; + { + /* Enable PCRE's UTF-8 matching, but disable the check that would + make an invalid byte sequence *in the input* trigger a failure. */ + flags |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK; + } #endif /* FIXME: Remove these restrictions. */ diff --git a/tests/pcre-utf8 b/tests/pcre-utf8 index b8228d5..a3b9390 100755 --- a/tests/pcre-utf8 +++ b/tests/pcre-utf8 @@ -19,9 +19,15 @@ echo '$' | LC_ALL=en_US.UTF-8 grep -qP '\p{S}' \ euro='\342\202\254 euro' printf "$euro\\n" > in || framework_failure_ +# The euro sign has the Unicode "Symbol" property, so this must match: LC_ALL=en_US.UTF-8 grep -P '^\p{S}' in > out || fail=1 compare in out || fail=1 +# This RE must *not* match in the C locale, because the first +# byte is not a "Symbol". 
+LC_ALL=C grep -P '^\p{S}' in > out && fail=1 +compare /dev/null out || fail=1 + LC_ALL=en_US.UTF-8 grep -P '^. euro$' in > out2 || fail=1 compare in out2 || fail=1 -- 1.8.5.rc2.6.gc6f1b92