From 65a02d81b69da2cab9cccfb15ce3837685c0807b Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Tue, 14 Feb 2017 15:02:59 -0800
Subject: [PATCH] Simplify -U on MS-Windows by removing guesswork

Suggested by Eric Blake (Bug#25707#11).
* NEWS, doc/grep.texi: Document this.
* src/dosbuf.c: Remove.
* bootstrap.conf (gnulib_modules): Add xfreopen.
* src/grep.c: Include xfreopen.h, not dosbuf.c.
(fillbuf, print_line_head): Do not undossify input.
(binary): New static var.
(grepdesc): Apply BINARY to input file.
(usage): Remove -u help.
(main): Set BINARY if -U, and apply it to stdout.  Do nothing if -u.
With -f, apply BINARY to input file.
---
 NEWS           |   7 ++
 bootstrap.conf |   1 +
 doc/grep.texi  |  54 ++++++--------
 src/dosbuf.c   | 222 ---------------------------------------------------------
 src/grep.c     |  34 ++++-----
 5 files changed, 46 insertions(+), 272 deletions(-)
 delete mode 100644 src/dosbuf.c

diff --git a/NEWS b/NEWS
index 89674f3..c387aef 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,13 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Changes in behavior
+
+  The following changes affect only MS-Windows platforms.  First, the
+  --binary (-U) option now governs whether binary I/O is used, instead
+  of a heuristic that was sometimes incorrect.  Second, the
+  --unix-byte-offsets (-u) option now has no effect on MS-Windows too.
+
 
 * Noteworthy changes in release 3.0 (2017-02-09) [stable]
 
diff --git a/bootstrap.conf b/bootstrap.conf
index cd2d756..4115f35 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -95,6 +95,7 @@ wcrtomb
 wctob
 wctype-h
 xalloc
+xfreopen
 xstrtoimax
 '
 gnulib_name=libgreputils
diff --git a/doc/grep.texi b/doc/grep.texi
index 7c051e0..94a0621 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -412,10 +412,6 @@ Print the 0-based byte offset within the input file
 before each line of output.
 If @option{-o} (@option{--only-matching}) is specified,
 print the offset of the matching part itself.
-When @command{grep} runs on MS-DOS or MS-Windows,
-the printed byte offsets depend on whether
-the @option{-u} (@option{--unix-byte-offsets}) option is used;
-see below.
 
 @item -H
 @itemx --with-filename
@@ -466,21 +462,6 @@ This is useful with options that prefix their output to the actual content:
 This may also prepend spaces to output line numbers and byte offsets
 so that lines from a single file all start at the same column.
 
address@hidden -u
address@hidden --unix-byte-offsets
address@hidden -u
address@hidden --unix-byte-offsets
address@hidden MS-DOS/MS-Windows byte offsets
address@hidden byte offsets, on MS-DOS/MS-Windows
-Report Unix-style byte offsets.
-This option causes @command{grep} to report byte offsets
-as if the file were a Unix-style text file,
-i.e., the byte offsets ignore carriage returns that were stripped.
-This will produce results identical
-to running @command{grep} on a Unix machine.
-This option has no effect unless the @option{-b} option is also used;
-it has no effect on platforms other than MS-DOS and MS-Windows.
-
 @item -Z
 @itemx --null
 @opindex -Z
@@ -759,21 +740,26 @@ This can cause a performance penalty.
 @itemx --binary
 @opindex -U
 @opindex --binary
address@hidden MS-DOS/MS-Windows binary files
address@hidden binary files, MS-DOS/MS-Windows
-Treat the file(s) as binary.
-By default, under MS-DOS and MS-Windows,
address@hidden guesses whether a file is text or binary
-as described for the @option{--binary-files} option.
-If @command{grep} decides the file is a text file,
-it strips carriage returns from the original file contents
-(to make regular expressions with @code{^} and @code{$} work correctly).
-Specifying @option{-U} overrules this guesswork,
-causing all files to be read and passed to the matching mechanism verbatim;
-if the file is a text file with @code{CR/LF} pairs at the end of each line,
-this will cause some regular expressions to fail.
-This option has no effect
-on platforms other than MS-DOS and MS-Windows.
address@hidden MS-Windows binary I/O
address@hidden binary I/O, /MS-Windows
+Under MS-Windows, use binary I/O when reading and writing files other
+than the user's terminal, so that all input bytes are read and written
+as-is.  This overrides the default behavior where @command{grep}
+follows the operating system's advice whether to use binary or text
+I/O.  When @command{grep} uses text I/O it reads a carriage
+return--newline pair as a newline and a Control-Z as end-of-file, and
+it writes a newline as a carriage return--newline pair.  When this
+option is combined with @option{--byte-offset} (@option{-b}), byte
+counts treat each text I/O carriage return--newline as a single byte.
+
+When using text I/O, @option{--binary-files} heuristics are applied to
+input data after text-I/O processing.  These heuristics need not agree
+with the @option{--binary} option; that is, they may treat the data as
+text even if @option{--binary} is given, or vice versa.
address@hidden and Directory Selection}.
+
+This option has no effect on platforms other than MS-Windows.
+
 
 @item -z
 @itemx --null-data
diff --git a/src/dosbuf.c b/src/dosbuf.c
deleted file mode 100644
index 9892755..0000000
--- a/src/dosbuf.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/* dosbuf.c
-   Copyright (C) 1992, 1997-2002, 2004-2017 Free Software Foundation, Inc.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3, or (at your option)
-   any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-   02110-1301, USA.  */
-
-/* Messy DOS-specific code for correctly treating binary, Unix text
-   and DOS text files.
-
-   This has several aspects:
-
-     * Guessing the file type (unless the user tells us);
-     * Stripping CR characters from DOS text files (otherwise regex
-       functions won't work correctly);
-     * Reporting correct byte count with -b for any kind of file.
-
-*/
-
-#include <config.h>
-
-typedef enum {
-  UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT
-} File_type;
-
-struct dos_map {
-  off_t pos;	/* position in buffer passed to matcher */
-  off_t add;	/* how much to add when reporting char position */
-};
-
-static int       dos_report_unix_offset = 0;
-
-static File_type dos_file_type     = UNKNOWN;
-static File_type dos_use_file_type = UNKNOWN;
-static off_t     dos_stripped_crs  = 0;
-static struct dos_map *dos_pos_map;
-static int       dos_pos_map_size  = 0;
-static int       dos_pos_map_used  = 0;
-static int       inp_map_idx = 0, out_map_idx = 1;
-
-/* Set default DOS file type to binary.  */
-static void
-dos_binary (void)
-{
-  if (O_BINARY)
-    dos_use_file_type = DOS_BINARY;
-}
-
-/* Tell DOS routines to report Unix offset.  */
-static void
-dos_unix_byte_offsets (void)
-{
-  if (O_BINARY)
-    dos_report_unix_offset = 1;
-}
-
-/* Guess DOS file type by looking at its contents.  */
-static File_type
-guess_type (char *buf, size_t buflen)
-{
-  int crlf_seen = 0;
-  char *bp = buf;
-
-  while (buflen--)
-    {
-      /* Treat a file as binary if it has a NUL character.  */
-      if (!*bp)
-        return DOS_BINARY;
-
-      /* CR before LF means DOS text file (unless we later see
-         binary characters).  */
-      else if (*bp == '\r' && buflen && bp[1] == '\n')
-        crlf_seen = 1;
-
-      bp++;
-    }
-
-  return crlf_seen ? DOS_TEXT : UNIX_TEXT;
-}
-
-/* Convert external DOS file representation to internal.
-   Return the count of bytes left in the buffer.
-   Build table to map character positions when reporting byte counts.  */
-static size_t
-undossify_input (char *buf, size_t buflen)
-{
-  if (! O_BINARY)
-    return buflen;
-
-  size_t bytes_left = 0;
-
-  if (totalcc == 0)
-    {
-      /* New file: forget everything we knew about character
-         position mapping table and file type.  */
-      inp_map_idx = 0;
-      out_map_idx = 1;
-      dos_pos_map_used = 0;
-      dos_stripped_crs = 0;
-      dos_file_type = dos_use_file_type;
-    }
-
-  /* Guess if this file is binary, unless we already know that.  */
-  if (dos_file_type == UNKNOWN)
-    dos_file_type = guess_type (buf, buflen);
-
-  /* If this file is to be treated as DOS Text, strip the CR characters
-     and maybe build the table for character position mapping on output.  */
-  if (dos_file_type == DOS_TEXT)
-    {
-      char   *destp   = buf;
-
-      while (buflen--)
-        {
-          if (*buf != '\r')
-            {
-              *destp++ = *buf++;
-              bytes_left++;
-            }
-          else
-            {
-              buf++;
-              if (out_byte && !dos_report_unix_offset)
-                {
-                  dos_stripped_crs++;
-                  while (buflen && *buf == '\r')
-                    {
-                      dos_stripped_crs++;
-                      buflen--;
-                      buf++;
-                    }
-                  if (inp_map_idx >= dos_pos_map_size - 1)
-                    {
-                      dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
-                      dos_pos_map = xrealloc (dos_pos_map,
-                                              (dos_pos_map_size
-                                               * sizeof (struct dos_map)));
-                    }
-
-                  if (!inp_map_idx)
-                    {
-                      /* Add sentinel entry.  */
-                      dos_pos_map[inp_map_idx].pos = 0;
-                      dos_pos_map[inp_map_idx++].add = 0;
-
-                      /* Initialize first real entry.  */
-                      dos_pos_map[inp_map_idx].add = 0;
-                    }
-
-                  /* Put the new entry.  If the stripped CR characters
-                     precede a Newline (the usual case), pretend that
-                     they were found *after* the Newline.  This makes
-                     displayed byte offsets more reasonable in some
-                     cases, and fits better the intuitive notion that
-                     the line ends *before* the CR, not *after* it.  */
-                  inp_map_idx++;
-                  dos_pos_map[inp_map_idx-1].pos =
-                    (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
-                  dos_pos_map[inp_map_idx].add = dos_stripped_crs;
-                  dos_pos_map_used = inp_map_idx;
-
-                  /* The following will be updated on the next pass.  */
-                  dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
-                }
-            }
-        }
-
-      return bytes_left;
-    }
-
-  return buflen;
-}
-
-/* Convert internal byte count into external.  */
-static off_t
-dossified_pos (off_t byteno)
-{
-  if (! O_BINARY)
-    return byteno;
-
-  off_t pos_lo;
-  off_t pos_hi;
-
-  if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
-    return byteno;
-
-  /* Optimization: usually the file will be scanned sequentially.
-     So in most cases, this byte position will be found in the
-     table near the previous one, as recorded in 'out_map_idx'.  */
-  pos_lo = dos_pos_map[out_map_idx-1].pos;
-  pos_hi = dos_pos_map[out_map_idx].pos;
-
-  /* If the initial guess failed, search up or down, as
-     appropriate, beginning with the previous place.  */
-  if (byteno >= pos_hi)
-    {
-      out_map_idx++;
-      while (out_map_idx < dos_pos_map_used
-             && byteno >= dos_pos_map[out_map_idx].pos)
-        out_map_idx++;
-    }
-
-  else if (byteno < pos_lo)
-    {
-      out_map_idx--;
-      while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
-        out_map_idx--;
-    }
-
-  return byteno + dos_pos_map[out_map_idx].add;
-}
diff --git a/src/grep.c b/src/grep.c
index 437ae0a..32cb642 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -48,6 +48,7 @@
 #include "search.h"
 #include "version-etc.h"
 #include "xalloc.h"
+#include "xfreopen.h"
 #include "xstrtol.h"
 
 enum { SEP_CHAR_SELECTED = ':' };
@@ -536,10 +537,6 @@ static enum
 static bool grepfile (int, char const *, bool, bool);
 static bool grepdesc (int, bool);
 
-static void dos_binary (void);
-static void dos_unix_byte_offsets (void);
-static size_t undossify_input (char *, size_t);
-
 static bool
 is_device_mode (mode_t m)
 {
@@ -973,7 +970,6 @@ fillbuf (size_t save, struct stat const *st)
         }
     }
 
-  fillsize = undossify_input (readbuf, fillsize);
   buflim = readbuf + fillsize;
 
   /* Initialize the following word, because skip_easy_bytes and some
@@ -1033,8 +1029,7 @@ static intmax_t pending;	/* Pending lines of output.
 static bool done_on_match;	/* Stop scanning file on first match.  */
 static bool exit_on_match;	/* Exit on first match.  */
 static bool dev_null_output;	/* Stdout is known to be /dev/null.  */
-
-#include "dosbuf.c"
+static bool binary;		/* Use binary rather than text I/O.  */
 
 static void
 nlscan (char const *lim)
@@ -1127,7 +1122,6 @@ print_line_head (char *beg, size_t len, char const *lim, char sep)
   if (out_byte)
     {
       uintmax_t pos = add_count (totalcc, beg - bufbeg);
-      pos = dossified_pos (pos);
       print_offset (pos, byte_num_color);
       print_sep (sep);
     }
@@ -1862,7 +1856,7 @@ grepdesc (int desc, bool command_line)
 
   /* Set input to binary mode.  Pipes are simulated with files
      on DOS, so this includes the case of "foo | grep bar".  */
-  if (O_BINARY && !isatty (desc))
+  if (binary && !isatty (desc))
     set_binary_mode (desc, O_BINARY);
 
   count = grep (desc, &st, &ineof);
@@ -2006,8 +2000,6 @@ Context control:\n\
       --colour[=WHEN]       use markers to highlight the matching strings;\n\
                             WHEN is 'always', 'never', or 'auto'\n\
   -U, --binary              do not strip CR characters at EOL (MSDOS/Windows)\n\
-  -u, --unix-byte-offsets   report offsets as if CRs were not there\n\
-                            (MSDOS/Windows)\n\
 \n"));
       printf (_("\
 When FILE is '-', read standard input.  With no FILE, read '.' if\n\
@@ -2537,11 +2529,13 @@ main (int argc, char **argv)
         break;
 
       case 'U':
-        dos_binary ();
+        if (O_BINARY)
+          binary = true;
         break;
 
       case 'u':
-        dos_unix_byte_offsets ();
+        /* Obsolete option; it has no effect.  FIXME: Diagnose use of
+           this option starting in (say) the year 2020.  */
         break;
 
       case 'V':
@@ -2582,7 +2576,15 @@ main (int argc, char **argv)
         break;
 
       case 'f':
-        fp = STREQ (optarg, "-") ? stdin : fopen (optarg, "r");
+        if (STREQ (optarg, "-"))
+          {
+            if (binary)
+              xfreopen (NULL, "rb", stdin);
+            fp = stdin;
+          }
+        else
+          fp = fopen (optarg, binary ? "rb" : "r");
+
         if (!fp)
           die (EXIT_TROUBLE, errno, "%s", optarg);
         oldcc = keycc;
@@ -2897,8 +2899,8 @@ main (int argc, char **argv)
 
   /* Output is set to binary mode because we shouldn't convert
      NL to CR-LF pairs, especially when grepping binary files.  */
-  if (O_BINARY && !isatty (STDOUT_FILENO))
-    set_binary_mode (STDOUT_FILENO, O_BINARY);
+  if (binary && !isatty (STDOUT_FILENO))
+    xfreopen (NULL, "wb", stdout);
 
   /* Prefer sysconf for page size, as getpagesize typically returns int.  */
 #ifdef _SC_PAGESIZE
-- 
2.9.3