From 65a02d81b69da2cab9cccfb15ce3837685c0807b Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 14 Feb 2017 15:02:59 -0800 Subject: [PATCH] Simplify -U on MS-Windows by removing guesswork Suggested by Eric Blake (Bug#25707#11). * NEWS, doc/grep.texi: Document this. * src/dosbuf.c: Remove. * bootstrap.conf (gnulib_modules): Add xfreopen. * src/grep.c: Include xfreopen.h, not dosbuf.c. (fillbuf, print_line_head): Do not undossify input. (binary): New static var. (grepdesc): Apply BINARY to input file. (usage): Remove -u help. (main): Set BINARY if -U, and apply it to stdout. Do nothing if -u. With -f, apply BINARY to input file. --- NEWS | 7 ++ bootstrap.conf | 1 + doc/grep.texi | 54 ++++++-------- src/dosbuf.c | 222 --------------------------------------------------------- src/grep.c | 34 ++++----- 5 files changed, 46 insertions(+), 272 deletions(-) delete mode 100644 src/dosbuf.c diff --git a/NEWS b/NEWS index 89674f3..c387aef 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,13 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Changes in behavior + + The following changes affect only MS-Windows platforms. First, the + --binary (-U) option now governs whether binary I/O is used, instead + of a heuristic that was sometimes incorrect. Second, the + --unix-byte-offsets (-u) option now has no effect on MS-Windows too. + * Noteworthy changes in release 3.0 (2017-02-09) [stable] diff --git a/bootstrap.conf b/bootstrap.conf index cd2d756..4115f35 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -95,6 +95,7 @@ wcrtomb wctob wctype-h xalloc +xfreopen xstrtoimax ' gnulib_name=libgreputils diff --git a/doc/grep.texi b/doc/grep.texi index 7c051e0..94a0621 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -412,10 +412,6 @@ Print the 0-based byte offset within the input file before each line of output. If @option{-o} (@option{--only-matching}) is specified, print the offset of the matching part itself. 
-When @command{grep} runs on MS-DOS or MS-Windows, -the printed byte offsets depend on whether -the @option{-u} (@option{--unix-byte-offsets}) option is used; -see below. @item -H @itemx --with-filename @@ -466,21 +462,6 @@ This is useful with options that prefix their output to the actual content: This may also prepend spaces to output line numbers and byte offsets so that lines from a single file all start at the same column. -@item -u -@itemx --unix-byte-offsets -@opindex -u -@opindex --unix-byte-offsets -@cindex MS-DOS/MS-Windows byte offsets -@cindex byte offsets, on MS-DOS/MS-Windows -Report Unix-style byte offsets. -This option causes @command{grep} to report byte offsets -as if the file were a Unix-style text file, -i.e., the byte offsets ignore carriage returns that were stripped. -This will produce results identical -to running @command{grep} on a Unix machine. -This option has no effect unless the @option{-b} option is also used; -it has no effect on platforms other than MS-DOS and MS-Windows. - @item -Z @itemx --null @opindex -Z @@ -759,21 +740,26 @@ This can cause a performance penalty. @itemx --binary @opindex -U @opindex --binary -@cindex MS-DOS/MS-Windows binary files -@cindex binary files, MS-DOS/MS-Windows -Treat the file(s) as binary. -By default, under MS-DOS and MS-Windows, -@command{grep} guesses whether a file is text or binary -as described for the @option{--binary-files} option. -If @command{grep} decides the file is a text file, -it strips carriage returns from the original file contents -(to make regular expressions with @code{^} and @code{$} work correctly). -Specifying @option{-U} overrules this guesswork, -causing all files to be read and passed to the matching mechanism verbatim; -if the file is a text file with @code{CR/LF} pairs at the end of each line, -this will cause some regular expressions to fail. -This option has no effect -on platforms other than MS-DOS and MS-Windows. 
+@cindex MS-Windows binary I/O +@cindex binary I/O, /MS-Windows +Under MS-Windows, use binary I/O when reading and writing files other +than the user's terminal, so that all input bytes are read and written +as-is. This overrides the default behavior where @command{grep} +follows the operating system's advice whether to use binary or text +I/O. When @command{grep} uses text I/O it reads a carriage +return--newline pair as a newline and a Control-Z as end-of-file, and +it writes a newline as a carriage return--newline pair. When this +option is combined with @option{--byte-offset} (@option{-b}), byte +counts treat each text I/O carriage return--newline as a single byte. + +When using text I/O, @option{--binary-files} heuristics are applied to +input data after text-I/O processing. These heuristics need not agree +with the @option{--binary} option; that is, they may treat the data as +text even if @option{--binary} is given, or vice versa. +@xref{File and Directory Selection}. + +This option has no effect on platforms other than MS-Windows. + @item -z @itemx --null-data diff --git a/src/dosbuf.c b/src/dosbuf.c deleted file mode 100644 index 9892755..0000000 --- a/src/dosbuf.c +++ /dev/null @@ -1,222 +0,0 @@ -/* dosbuf.c - Copyright (C) 1992, 1997-2002, 2004-2017 Free Software Foundation, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA - 02110-1301, USA. */ - -/* Messy DOS-specific code for correctly treating binary, Unix text - and DOS text files. - - This has several aspects: - - * Guessing the file type (unless the user tells us); - * Stripping CR characters from DOS text files (otherwise regex - functions won't work correctly); - * Reporting correct byte count with -b for any kind of file. - -*/ - -#include - -typedef enum { - UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT -} File_type; - -struct dos_map { - off_t pos; /* position in buffer passed to matcher */ - off_t add; /* how much to add when reporting char position */ -}; - -static int dos_report_unix_offset = 0; - -static File_type dos_file_type = UNKNOWN; -static File_type dos_use_file_type = UNKNOWN; -static off_t dos_stripped_crs = 0; -static struct dos_map *dos_pos_map; -static int dos_pos_map_size = 0; -static int dos_pos_map_used = 0; -static int inp_map_idx = 0, out_map_idx = 1; - -/* Set default DOS file type to binary. */ -static void -dos_binary (void) -{ - if (O_BINARY) - dos_use_file_type = DOS_BINARY; -} - -/* Tell DOS routines to report Unix offset. */ -static void -dos_unix_byte_offsets (void) -{ - if (O_BINARY) - dos_report_unix_offset = 1; -} - -/* Guess DOS file type by looking at its contents. */ -static File_type -guess_type (char *buf, size_t buflen) -{ - int crlf_seen = 0; - char *bp = buf; - - while (buflen--) - { - /* Treat a file as binary if it has a NUL character. */ - if (!*bp) - return DOS_BINARY; - - /* CR before LF means DOS text file (unless we later see - binary characters). */ - else if (*bp == '\r' && buflen && bp[1] == '\n') - crlf_seen = 1; - - bp++; - } - - return crlf_seen ? DOS_TEXT : UNIX_TEXT; -} - -/* Convert external DOS file representation to internal. - Return the count of bytes left in the buffer. 
- Build table to map character positions when reporting byte counts. */ -static size_t -undossify_input (char *buf, size_t buflen) -{ - if (! O_BINARY) - return buflen; - - size_t bytes_left = 0; - - if (totalcc == 0) - { - /* New file: forget everything we knew about character - position mapping table and file type. */ - inp_map_idx = 0; - out_map_idx = 1; - dos_pos_map_used = 0; - dos_stripped_crs = 0; - dos_file_type = dos_use_file_type; - } - - /* Guess if this file is binary, unless we already know that. */ - if (dos_file_type == UNKNOWN) - dos_file_type = guess_type (buf, buflen); - - /* If this file is to be treated as DOS Text, strip the CR characters - and maybe build the table for character position mapping on output. */ - if (dos_file_type == DOS_TEXT) - { - char *destp = buf; - - while (buflen--) - { - if (*buf != '\r') - { - *destp++ = *buf++; - bytes_left++; - } - else - { - buf++; - if (out_byte && !dos_report_unix_offset) - { - dos_stripped_crs++; - while (buflen && *buf == '\r') - { - dos_stripped_crs++; - buflen--; - buf++; - } - if (inp_map_idx >= dos_pos_map_size - 1) - { - dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000; - dos_pos_map = xrealloc (dos_pos_map, - (dos_pos_map_size - * sizeof (struct dos_map))); - } - - if (!inp_map_idx) - { - /* Add sentinel entry. */ - dos_pos_map[inp_map_idx].pos = 0; - dos_pos_map[inp_map_idx++].add = 0; - - /* Initialize first real entry. */ - dos_pos_map[inp_map_idx].add = 0; - } - - /* Put the new entry. If the stripped CR characters - precede a Newline (the usual case), pretend that - they were found *after* the Newline. This makes - displayed byte offsets more reasonable in some - cases, and fits better the intuitive notion that - the line ends *before* the CR, not *after* it. */ - inp_map_idx++; - dos_pos_map[inp_map_idx-1].pos = - (*buf == '\n' ? 
destp + 1 : destp ) - bufbeg + totalcc; - dos_pos_map[inp_map_idx].add = dos_stripped_crs; - dos_pos_map_used = inp_map_idx; - - /* The following will be updated on the next pass. */ - dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1; - } - } - } - - return bytes_left; - } - - return buflen; -} - -/* Convert internal byte count into external. */ -static off_t -dossified_pos (off_t byteno) -{ - if (! O_BINARY) - return byteno; - - off_t pos_lo; - off_t pos_hi; - - if (dos_file_type != DOS_TEXT || dos_report_unix_offset) - return byteno; - - /* Optimization: usually the file will be scanned sequentially. - So in most cases, this byte position will be found in the - table near the previous one, as recorded in 'out_map_idx'. */ - pos_lo = dos_pos_map[out_map_idx-1].pos; - pos_hi = dos_pos_map[out_map_idx].pos; - - /* If the initial guess failed, search up or down, as - appropriate, beginning with the previous place. */ - if (byteno >= pos_hi) - { - out_map_idx++; - while (out_map_idx < dos_pos_map_used - && byteno >= dos_pos_map[out_map_idx].pos) - out_map_idx++; - } - - else if (byteno < pos_lo) - { - out_map_idx--; - while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos) - out_map_idx--; - } - - return byteno + dos_pos_map[out_map_idx].add; -} diff --git a/src/grep.c b/src/grep.c index 437ae0a..32cb642 100644 --- a/src/grep.c +++ b/src/grep.c @@ -48,6 +48,7 @@ #include "search.h" #include "version-etc.h" #include "xalloc.h" +#include "xfreopen.h" #include "xstrtol.h" enum { SEP_CHAR_SELECTED = ':' }; @@ -536,10 +537,6 @@ static enum static bool grepfile (int, char const *, bool, bool); static bool grepdesc (int, bool); -static void dos_binary (void); -static void dos_unix_byte_offsets (void); -static size_t undossify_input (char *, size_t); - static bool is_device_mode (mode_t m) { @@ -973,7 +970,6 @@ fillbuf (size_t save, struct stat const *st) } } - fillsize = undossify_input (readbuf, fillsize); buflim = readbuf + fillsize; /* Initialize the 
following word, because skip_easy_bytes and some @@ -1033,8 +1029,7 @@ static intmax_t pending; /* Pending lines of output. static bool done_on_match; /* Stop scanning file on first match. */ static bool exit_on_match; /* Exit on first match. */ static bool dev_null_output; /* Stdout is known to be /dev/null. */ - -#include "dosbuf.c" +static bool binary; /* Use binary rather than text I/O. */ static void nlscan (char const *lim) @@ -1127,7 +1122,6 @@ print_line_head (char *beg, size_t len, char const *lim, char sep) if (out_byte) { uintmax_t pos = add_count (totalcc, beg - bufbeg); - pos = dossified_pos (pos); print_offset (pos, byte_num_color); print_sep (sep); } @@ -1862,7 +1856,7 @@ grepdesc (int desc, bool command_line) /* Set input to binary mode. Pipes are simulated with files on DOS, so this includes the case of "foo | grep bar". */ - if (O_BINARY && !isatty (desc)) + if (binary && !isatty (desc)) set_binary_mode (desc, O_BINARY); count = grep (desc, &st, &ineof); @@ -2006,8 +2000,6 @@ Context control:\n\ --colour[=WHEN] use markers to highlight the matching strings;\n\ WHEN is 'always', 'never', or 'auto'\n\ -U, --binary do not strip CR characters at EOL (MSDOS/Windows)\n\ - -u, --unix-byte-offsets report offsets as if CRs were not there\n\ - (MSDOS/Windows)\n\ \n")); printf (_("\ When FILE is '-', read standard input. With no FILE, read '.' if\n\ @@ -2537,11 +2529,13 @@ main (int argc, char **argv) break; case 'U': - dos_binary (); + if (O_BINARY) + binary = true; break; case 'u': - dos_unix_byte_offsets (); + /* Obsolete option; it has no effect. FIXME: Diagnose use of + this option starting in (say) the year 2020. */ break; case 'V': @@ -2582,7 +2576,15 @@ main (int argc, char **argv) break; case 'f': - fp = STREQ (optarg, "-") ? stdin : fopen (optarg, "r"); + if (STREQ (optarg, "-")) + { + if (binary) + xfreopen (NULL, "rb", stdin); + fp = stdin; + } + else + fp = fopen (optarg, binary ? 
"rb" : "r"); + if (!fp) die (EXIT_TROUBLE, errno, "%s", optarg); oldcc = keycc; @@ -2897,8 +2899,8 @@ main (int argc, char **argv) /* Output is set to binary mode because we shouldn't convert NL to CR-LF pairs, especially when grepping binary files. */ - if (O_BINARY && !isatty (STDOUT_FILENO)) - set_binary_mode (STDOUT_FILENO, O_BINARY); + if (binary && !isatty (STDOUT_FILENO)) + xfreopen (NULL, "wb", stdout); /* Prefer sysconf for page size, as getpagesize typically returns int. */ #ifdef _SC_PAGESIZE -- 2.9.3