Re: [PATCH] grep: sparse files are now considered binary

bug-grep

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] grep: sparse files are now considered binary

From:	Paul Eggert
Subject:	Re: [PATCH] grep: sparse files are now considered binary
Date:	Tue, 15 May 2012 09:29:59 -0700
User-agent:	Mozilla/5.0 (X11; Linux x86_64; rv:11.0) Gecko/20120329 Thunderbird/11.0.1

No further comment, so I pushed the following
slightly-improved version of that patch.

* NEWS: Document this.
* doc/grep.texi (File and Directory Selection): Likewise.
* bootstrap.conf (gnulib_modules): Add stat-size.
* src/main.c: Include stat-size.h.
(usable_st_size): New function, mostly stolen from coreutils.
(fillbuf): Use it.
(file_is_binary): New function, which looks for holes too.
(grep): Use it.
* tests/Makefile.am (TESTS): Add big-hole.
* tests/big-hole: New file.
---
 NEWS              |    6 +++-
 bootstrap.conf    |    1 +
 doc/grep.texi     |    7 +++--
 src/main.c        |   77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 tests/Makefile.am |    1 +
 tests/big-hole    |   31 +++++++++++++++++++++
 6 files changed, 117 insertions(+), 6 deletions(-)
 create mode 100755 tests/big-hole

diff --git a/NEWS b/NEWS
index 1497b92..f515e84 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,11 @@ GNU grep NEWS                                    -*- outline -*-
   PATTERN *" again reads all *.c and *.h files except for system.h.
   [bug introduced in grep-2.6]
 
+** New features
+
+  'grep' without -z now treats a sparse file as binary, if it can
+  easily determine that the file is sparse.
+
 ** Dropped features
 
   Bootstrapping with Makefile.boot has been broken since grep 2.6,
@@ -45,7 +50,6 @@ GNU grep NEWS                                    -*- outline -*-
   use -R if you prefer the old behavior of following all symlinks and
   defaulting to reading all devices.
 
-
 * Noteworthy changes in release 2.11 (2012-03-02) [stable]
 
 ** Bug fixes
diff --git a/bootstrap.conf b/bootstrap.conf
index a7853c9..57749b4 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -69,6 +69,7 @@ realloc-gnu
 regex
 same-inode
 ssize_t
+stat-size
 stddef
 stdlib
 stpcpy
diff --git a/doc/grep.texi b/doc/grep.texi
index 3b52a19..0e519dd 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -580,7 +580,8 @@ this is equivalent to the @samp{--binary-files=text} option.
 @item --binary-files=@var{type}
 @opindex --binary-files
 @cindex binary files
-If the first few bytes of a file indicate that the file contains binary data,
+If a file's allocation metadata or its first few bytes
+indicate that the file contains binary data,
 assume that the file is of type @var{type}.
 By default, @var{type} is @samp{binary},
 and @command{grep} normally outputs either
@@ -722,8 +723,8 @@ better performance.
 @cindex binary files, MS-DOS/MS-Windows
 Treat the file(s) as binary.
 By default, under MS-DOS and MS-Windows,
-@command{grep} guesses the file type
-by looking at the contents of the first 32kB read from the file.
+@command{grep} guesses whether a file is text or binary
+as described for the @option{--binary-files} option.
 If @command{grep} decides the file is a text file,
 it strips the @code{CR} characters from the original file contents
 (to make regular expressions with @code{^} and @code{$} work correctly).
diff --git a/src/main.c b/src/main.c
index bc9177e..10fbfac 100644
--- a/src/main.c
+++ b/src/main.c
@@ -44,6 +44,7 @@
 #include "progname.h"
 #include "propername.h"
 #include "quote.h"
+#include "stat-size.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -406,6 +407,14 @@ is_device_mode (mode_t m)
   return S_ISCHR (m) || S_ISBLK (m) || S_ISSOCK (m) || S_ISFIFO (m);
 }
 
+/* Return nonzero if ST->st_size is defined.  Assume the file is not a
+   symbolic link.  */
+static int
+usable_st_size (struct stat const *st)
+{
+  return S_ISREG (st->st_mode) || S_TYPEISSHM (st) || S_TYPEISTMO (st);
+}
+
 /* Functions we'll use to search. */
 static compile_fp_t compile;
 static execute_fp_t execute;
@@ -428,6 +437,70 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
+/* Return 1 if a file is known to be binary for the purpose of 'grep'.
+   BUF, of size BUFSIZE, is the initial buffer read from the file with
+   descriptor FD and status ST.  */
+static int
+file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+{
+  #ifndef HAVE_STRUCT_STAT_ST_BLOCKS
+  enum { HAVE_STRUCT_STAT_ST_BLOCKS = 0 };
+  #endif
+  #ifndef SEEK_HOLE
+  enum { SEEK_HOLE = SEEK_END };
+  #endif
+
+  /* If -z, test only whether the initial buffer contains '\200';
+     knowing about holes won't help.  */
+  if (! eolbyte)
+    return memchr (buf, '\200', bufsize) != 0;
+
+  /* If the initial buffer contains a null byte, guess that the file
+     is binary.  */
+  if (memchr (buf, '\0', bufsize))
+    return 1;
+
+  /* If the file has holes, it must contain a null byte somewhere.  */
+  if ((HAVE_STRUCT_STAT_ST_BLOCKS || SEEK_HOLE != SEEK_END)
+      && usable_st_size (st))
+    {
+      off_t cur = bufsize;
+      if (O_BINARY || fd == STDIN_FILENO)
+        {
+          cur = lseek (fd, 0, SEEK_CUR);
+          if (cur < 0)
+            return 0;
+        }
+
+      /* If the file has fewer blocks than would be needed to
+         represent its data, then it must have at least one hole.  */
+      if (HAVE_STRUCT_STAT_ST_BLOCKS)
+        {
+          off_t nonzeros_needed = st->st_size - cur + bufsize;
+          off_t full_blocks = nonzeros_needed / ST_NBLOCKSIZE;
+          int partial_block = 0 < nonzeros_needed % ST_NBLOCKSIZE;
+          if (ST_NBLOCKS (*st) < full_blocks + partial_block)
+            return 1;
+        }
+
+      /* Look for a hole after the current location.  */
+      if (SEEK_HOLE != SEEK_END)
+        {
+          off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+          if (0 <= hole_start)
+            {
+              if (lseek (fd, cur, SEEK_SET) < 0)
+                suppressible_error (filename, errno);
+              if (hole_start < st->st_size)
+                return 1;
+            }
+        }
+    }
+
+  /* Guess that the file does not contain binary data.  */
+  return 0;
+}
+
 /* Convert STR to a nonnegative integer, storing the result in *OUT.
    STR must be a valid context length argument; report an error if it
    isn't.  Silently ceiling *OUT at the maximum value, as that is
@@ -559,7 +632,7 @@ fillbuf (size_t save, struct stat const *st)
          is large.  However, do not use the original file size as a
          heuristic if we've already read past the file end, as most
          likely the file is growing.  */
-      if (S_ISREG (st->st_mode))
+      if (usable_st_size (st))
         {
           off_t to_be_read = st->st_size - bufoffset;
           off_t maxsize_off = save + to_be_read;
@@ -1133,7 +1206,7 @@ grep (int fd, struct stat const *st)
 
   not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
                || binary_files == WITHOUT_MATCH_BINARY_FILES)
-              && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg));
+              && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
   if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
     return 0;
   done_on_match += not_text;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index d0d622b..7be788c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -38,6 +38,7 @@ TESTS =                                               \
   backref                                      \
   backref-multibyte-slow                       \
   backref-word                                 \
+  big-hole                                     \
   big-match                                    \
   bogus-wctob                                  \
   bre                                          \
diff --git a/tests/big-hole b/tests/big-hole
new file mode 100755
index 0000000..47e36e1
--- /dev/null
+++ b/tests/big-hole
@@ -0,0 +1,31 @@
+#!/bin/sh
+# Check that grep --binary-file=without-match quickly skips files with holes.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+expensive_
+
+# Try to make this test not THAT expensive, on typical hosts.
+virtual_memory_KiB=10240
+if echo x | (ulimit -v $virtual_memory_KiB && grep x) >/dev/null 2>&1; then
+  ulimit -v $virtual_memory_KiB
+fi
+
+# Create a file that starts with at least a buffer's worth of text,
+# but has a big hole later.
+ten='1 2 3 4 5 6 7 8 9 10'
+x='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+(for i in $ten; do
+   for j in $ten; do
+     for k in $ten; do
+       echo $x
+     done
+   done
+ done
+ echo x | dd bs=1024k seek=8000000
+) >8T-or-so || skip_ 'cannot create big sparse file'
+
+grep --binary-file=without-match x 8T-or-so >/dev/null
+test $? -eq 1 || fail=1
+
+Exit $fail
-- 
1.7.6.5

[Prev in Thread]

Current Thread

[Next in Thread]

Re: [PATCH] grep: sparse files are now considered binary, Paul Eggert <=
- Re: [PATCH] grep: sparse files are now considered binary, Jim Meyering, 2012/05/15
  - Re: [PATCH] grep: sparse files are now considered binary, Paul Eggert, 2012/05/15
    - Re: [PATCH] grep: sparse files are now considered binary, Jim Meyering, 2012/05/16

Prev by Date: Re: [PATCH] maint: quote 'like this' or "like this", not `like this'
Next by Date: [PATCH] grep: handle non-devices like regular files
Previous by thread: Re: [PATCH] maint: quote 'like this' or "like this", not `like this'
Next by thread: Re: [PATCH] grep: sparse files are now considered binary
Index(es):
- Date
- Thread