From 17fb604a4cd23b07b99584706f92db8d6dd05e74 Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Sun, 10 Apr 2016 01:33:25 -0700
Subject: [PATCH 2/2] grep: in C locale, all bytes are valid characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This works around glibc bug 19932:
https://sourceware.org/bugzilla/show_bug.cgi?id=19932
The actual bug fix was the update to the current version of Gnulib.
grep problem reported by Björn Jacke in: http://bugs.gnu.org/23234
* NEWS: Mention this.
* doc/grep.texi (File and Directory Selection): Crossref to LC_*
section.  Suggest why -a or LC_ALL=C might be useful.
(Environment Variables): Mention 'locale -a'.
Say that LC_CTYPE also specifies encoding, and that every
byte is a valid character in the C or POSIX locale.
* tests/c-locale: New test.
* tests/Makefile.am (TESTS): Add it.
---
 NEWS              |  6 ++++++
 doc/grep.texi     | 19 ++++++++++++++-----
 tests/Makefile.am |  1 +
 tests/c-locale    | 26 ++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 5 deletions(-)
 create mode 100755 tests/c-locale

diff --git a/NEWS b/NEWS
index 69e4a23..63767aa 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,12 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  In the C or POSIX locale, grep now treats all bytes as valid
+  characters even if the C runtime library says otherwise.  The
+  revised behavior is more compatible with the original intent of
+  POSIX, and the next release of POSIX will likely make this official.
+  [bug introduced in grep-2.23]
+
   grep -Pz no longer mistakenly diagnoses patterns like [^a] that use
   negated character classes. [bug introduced in grep-2.24]
 
diff --git a/doc/grep.texi b/doc/grep.texi
index 1d3d5cb..4e0e48e 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -599,7 +599,8 @@ If a file's data or metadata
 indicate that the file contains binary data,
 assume that the file is of type @var{type}.
 Non-text bytes indicate binary data; these are either output bytes that are
-improperly encoded for the current locale, or null input bytes when the
+improperly encoded for the current locale (@pxref{Environment
+Variables}), or null input bytes when the
 @option{-z} (@option{--null-data}) option is not given (@pxref{Other
 Options}).
 
@@ -627,10 +628,13 @@ is not matched when @var{type} is @samp{text}.  Conversely, when
 @var{type} is @samp{binary} the pattern @samp{.} (period) might not
 match a null byte.
 
-@emph{Warning:} @samp{--binary-files=text} might output binary garbage,
-which can have nasty side effects
-if the output is a terminal and
-if the terminal driver interprets some of it as commands.
+@emph{Warning:} The @option{-a} (@option{--binary-files=text}) option
+might output binary garbage, which can have nasty side effects if the
+output is a terminal and if the terminal driver interprets some of it
+as commands.  On the other hand, when reading files whose text
+encodings are unknown, it can be helpful to use @option{-a} or to set
+@samp{LC_ALL='C'} in the environment, in order to find more matches
+even if the matches are unsafe for direct display.
 
 @item -D @var{action}
 @itemx --devices=@var{action}
@@ -803,6 +807,7 @@ The @samp{C} locale is used if none of these environment variables are set,
 if the locale catalog is not installed,
 or if @command{grep} was not compiled
 with national language support (NLS).
+The shell command @code{locale -a} lists locales that are currently available.
 
 Many of the environment variables in the following list let you
 control highlighting using
@@ -1004,6 +1009,10 @@ interpreted.
 These variables specify the locale for the @env{LC_CTYPE} category,
 which determines the type of characters,
 e.g., which characters are whitespace.
+This category also determines the character encoding, that is, whether
+text is encoded in UTF-8, ASCII, or some other encoding.  In the
+@samp{C} or @samp{POSIX} locale, all characters are encoded as a
+single byte and every byte is a valid character.
 
 @item LANGUAGE
 @itemx LC_ALL
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b65fc39..45908ce 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -53,6 +53,7 @@ TESTS =						\
   big-match					\
   bogus-wctob					\
   bre						\
+  c-locale					\
   case-fold-backref				\
   case-fold-backslash-w				\
   case-fold-char-class				\
diff --git a/tests/c-locale b/tests/c-locale
new file mode 100755
index 0000000..1fe5c70
--- /dev/null
+++ b/tests/c-locale
@@ -0,0 +1,26 @@
+#! /bin/sh
+# Regression test for GNU grep: "." must match any byte 1-255 except newline.
+#
+# Copyright 2016 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src  # load test helpers; presumably puts the built grep on PATH - confirm
+
+fail=0  # set to 1 as soon as any byte value misbehaves
+
+c=1  # byte value under test; starts at 1, so NUL is never tried
+while test $c -lt 256; do
+  tr2=$(printf '\\%o\n' $c)  # backslash followed by c in octal: an escape that tr understands
+  echo X | tr X "$tr2" >in  # the file in = the byte under test followed by a newline
+  if test $(wc -l <in) -eq 1; then  # false for c=10, where the byte is itself a newline
+    grep . in >out || fail=1  # the pattern must match the lone byte
+    compare in out || fail=1  # and the line must be output unchanged (compare is not defined here; presumably from init.sh)
+  fi
+  test $fail -ne 0 && Exit $fail  # stop at the first failing byte value
+  c=$(expr $c + 1)
+done
+
+Exit $fail  # Exit is not defined here; presumably the exit wrapper from init.sh - confirm
-- 
2.5.5