[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH] grep: sparse files are now considered binary
From: |
Paul Eggert |
Subject: |
Re: [PATCH] grep: sparse files are now considered binary |
Date: |
Tue, 15 May 2012 09:29:59 -0700 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:11.0) Gecko/20120329 Thunderbird/11.0.1 |
No further comment, so I pushed the following
slightly-improved version of that patch.
* NEWS: Document this.
* doc/grep.texi (File and Directory Selection): Likewise.
* bootstrap.conf (gnulib_modules): Add stat-size.
* src/main.c: Include stat-size.h.
(usable_st_size): New function, mostly stolen from coreutils.
(fillbuf): Use it.
(file_is_binary): New function, which looks for holes too.
(grep): Use it.
* tests/Makefile.am (TESTS): Add big-hole.
* tests/big-hole: New file.
---
NEWS | 6 +++-
bootstrap.conf | 1 +
doc/grep.texi | 7 +++--
src/main.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
tests/Makefile.am | 1 +
tests/big-hole | 31 +++++++++++++++++++++
6 files changed, 117 insertions(+), 6 deletions(-)
create mode 100755 tests/big-hole
diff --git a/NEWS b/NEWS
index 1497b92..f515e84 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,11 @@ GNU grep NEWS -*- outline -*-
PATTERN *" again reads all *.c and *.h files except for system.h.
[bug introduced in grep-2.6]
+** New features
+
+ 'grep' without -z now treats a sparse file as binary, if it can
+ easily determine that the file is sparse.
+
** Dropped features
Bootstrapping with Makefile.boot has been broken since grep 2.6,
@@ -45,7 +50,6 @@ GNU grep NEWS -*- outline -*-
use -R if you prefer the old behavior of following all symlinks and
defaulting to reading all devices.
-
* Noteworthy changes in release 2.11 (2012-03-02) [stable]
** Bug fixes
diff --git a/bootstrap.conf b/bootstrap.conf
index a7853c9..57749b4 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -69,6 +69,7 @@ realloc-gnu
regex
same-inode
ssize_t
+stat-size
stddef
stdlib
stpcpy
diff --git a/doc/grep.texi b/doc/grep.texi
index 3b52a19..0e519dd 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -580,7 +580,8 @@ this is equivalent to the @samp{--binary-files=text} option.
@item --binary-files=@var{type}
@opindex --binary-files
@cindex binary files
-If the first few bytes of a file indicate that the file contains binary data,
+If a file's allocation metadata or its first few bytes
+indicate that the file contains binary data,
assume that the file is of type @var{type}.
By default, @var{type} is @samp{binary},
and @command{grep} normally outputs either
@@ -722,8 +723,8 @@ better performance.
@cindex binary files, MS-DOS/MS-Windows
Treat the file(s) as binary.
By default, under MS-DOS and MS-Windows,
-@command{grep} guesses the file type
-by looking at the contents of the first 32kB read from the file.
+@command{grep} guesses whether a file is text or binary
+as described for the @option{--binary-files} option.
If @command{grep} decides the file is a text file,
it strips the @code{CR} characters from the original file contents
(to make regular expressions with @code{^} and @code{$} work correctly).
diff --git a/src/main.c b/src/main.c
index bc9177e..10fbfac 100644
--- a/src/main.c
+++ b/src/main.c
@@ -44,6 +44,7 @@
#include "progname.h"
#include "propername.h"
#include "quote.h"
+#include "stat-size.h"
#include "version-etc.h"
#include "xalloc.h"
#include "xstrtol.h"
@@ -406,6 +407,14 @@ is_device_mode (mode_t m)
return S_ISCHR (m) || S_ISBLK (m) || S_ISSOCK (m) || S_ISFIFO (m);
}
+/* Return nonzero if ST->st_size is defined, i.e. if ST->st_mode denotes a regular file or ST tests true for S_TYPEISSHM
+ (shared memory object) or S_TYPEISTMO (typed memory object). Assume the file is not a symbolic link. */
+static int
+usable_st_size (struct stat const *st)
+{
+ return S_ISREG (st->st_mode) || S_TYPEISSHM (st) || S_TYPEISTMO (st);
+}
+
/* Functions we'll use to search. */
static compile_fp_t compile;
static execute_fp_t execute;
@@ -428,6 +437,70 @@ clean_up_stdout (void)
close_stdout ();
}
+/* Return 1 if a file is known to be binary for the purpose of 'grep', and 0 if it is merely guessed not to be.
+ BUF, of size BUFSIZE, is the initial buffer read from the file with
+ descriptor FD and status ST. May call lseek on FD; a successful SEEK_HOLE probe is followed by an lseek back to the prior offset. */
+static int
+file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+{
+ #ifndef HAVE_STRUCT_STAT_ST_BLOCKS
+ enum { HAVE_STRUCT_STAT_ST_BLOCKS = 0 }; /* fallback so the name can be used in ordinary 'if' tests below */
+ #endif
+ #ifndef SEEK_HOLE
+ enum { SEEK_HOLE = SEEK_END }; /* fallback: makes every 'SEEK_HOLE != SEEK_END' test below false */
+ #endif
+
+ /* If -z, test only whether the initial buffer contains '\200';
+ knowing about holes won't help. */
+ if (! eolbyte)
+ return memchr (buf, '\200', bufsize) != 0;
+
+ /* If the initial buffer contains a null byte, guess that the file
+ is binary. */
+ if (memchr (buf, '\0', bufsize))
+ return 1;
+
+ /* If the file has holes, it must contain a null byte somewhere. Only attempted when some hole test is compiled in and ST->st_size is usable. */
+ if ((HAVE_STRUCT_STAT_ST_BLOCKS || SEEK_HOLE != SEEK_END)
+ && usable_st_size (st))
+ {
+ off_t cur = bufsize; /* offset just past BUF, taking BUF to have been read from offset 0 */
+ if (O_BINARY || fd == STDIN_FILENO)
+ { /* NOTE(review): presumably the real offset can differ from BUFSIZE in these cases, so ask the OS -- confirm */
+ cur = lseek (fd, 0, SEEK_CUR);
+ if (cur < 0)
+ return 0; /* offset unknown: fall back to guessing "not binary" */
+ }
+
+ /* If the file has fewer blocks than would be needed to
+ represent its data, then it must have at least one hole. */
+ if (HAVE_STRUCT_STAT_ST_BLOCKS)
+ {
+ off_t nonzeros_needed = st->st_size - cur + bufsize; /* bytes from the start of BUF to the end of file per ST */
+ off_t full_blocks = nonzeros_needed / ST_NBLOCKSIZE;
+ int partial_block = 0 < nonzeros_needed % ST_NBLOCKSIZE; /* 1 if a trailing partial block is needed */
+ if (ST_NBLOCKS (*st) < full_blocks + partial_block)
+ return 1;
+ }
+
+ /* Look for a hole after the current location. The probe moves FD's offset, hence the SEEK_SET afterwards. */
+ if (SEEK_HOLE != SEEK_END)
+ {
+ off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+ if (0 <= hole_start) /* the SEEK_HOLE probe succeeded */
+ {
+ if (lseek (fd, cur, SEEK_SET) < 0) /* put the offset back at CUR */
+ suppressible_error (filename, errno);
+ if (hole_start < st->st_size) /* a hole starts before the end of file recorded in ST */
+ return 1;
+ }
+ }
+ }
+
+ /* Guess that the file does not contain binary data. */
+ return 0;
+}
+
/* Convert STR to a nonnegative integer, storing the result in *OUT.
STR must be a valid context length argument; report an error if it
isn't. Silently ceiling *OUT at the maximum value, as that is
@@ -559,7 +632,7 @@ fillbuf (size_t save, struct stat const *st)
is large. However, do not use the original file size as a
heuristic if we've already read past the file end, as most
likely the file is growing. */
- if (S_ISREG (st->st_mode))
+ if (usable_st_size (st))
{
off_t to_be_read = st->st_size - bufoffset;
off_t maxsize_off = save + to_be_read;
@@ -1133,7 +1206,7 @@ grep (int fd, struct stat const *st)
not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
|| binary_files == WITHOUT_MATCH_BINARY_FILES)
- && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg));
+ && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
return 0;
done_on_match += not_text;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index d0d622b..7be788c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -38,6 +38,7 @@ TESTS = \
backref \
backref-multibyte-slow \
backref-word \
+ big-hole \
big-match \
bogus-wctob \
bre \
diff --git a/tests/big-hole b/tests/big-hole
new file mode 100755
index 0000000..47e36e1
--- /dev/null
+++ b/tests/big-hole
@@ -0,0 +1,31 @@
+#!/bin/sh
+# Check that grep --binary-file=without-match quickly skips files with holes.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+expensive_ # NOTE(review): presumably supplied by init.sh to gate costly tests -- confirm
+
+# Try to make this test not THAT expensive, on typical hosts: cap virtual memory, but only if grep still runs under the cap.
+virtual_memory_KiB=10240
+if echo x | (ulimit -v $virtual_memory_KiB && grep x) >/dev/null 2>&1; then
+ ulimit -v $virtual_memory_KiB
+fi
+
+# Create a file that starts with at least a buffer's worth of text
+# (10 * 10 * 10 = 1000 lines of $x), but has a big hole later.
+ten='1 2 3 4 5 6 7 8 9 10'
+x='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+(for i in $ten; do
+ for j in $ten; do
+ for k in $ten; do
+ echo $x
+ done
+ done
+ done
+ echo x | dd bs=1024k seek=8000000 # seek 8000000 blocks of 1024k in the output before writing "x", leaving a hole
+) >8T-or-so || skip_ 'cannot create big sparse file'
+
+grep --binary-file=without-match x 8T-or-so >/dev/null
+test $? -eq 1 || fail=1 # the test passes only if grep exited with status 1
+
+Exit $fail
--
1.7.6.5
- Re: [PATCH] grep: sparse files are now considered binary,
Paul Eggert <=