coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] cut: Improved large file support on 32 bit archs


From: Tobias Stoeckmann
Subject: [PATCH] cut: Improved large file support on 32 bit archs
Date: Mon, 26 Mar 2018 13:37:12 +0200

Ranges in cut are currently limited to SIZE_MAX, which means that they
cannot exceed 4 GB on 32 bit systems. Such systems generally have
large file support enabled, i.e. off_t is 64 bit. Even without
large file support, streams like standard input can
exceed 4 GB.

This patch replaces the limitation of SIZE_MAX with UINTMAX_MAX,
which is at least 64 bit on i386 and amd64.

The comment in the test case for cut, written in 2013, stated that the
SIZE_MAX limit was required because memory was allocated for each line
as a unit. That is no longer the case (see the discussion in bug#13127).

The test cases mentioned in that bug report pass as of today
(I use 2^32-2 instead of 2^32-1 to avoid triggering the safety checks):

$ uname -m
i686
$ echo a | cut -b1-$(echo '2^32-2'|bc)
a
$ _

And this is the output of original and patched cut with a large file:

$ echo 1 | dd of=test.4gb bs=1 count=1 seek=$((2**32-1))
$ cut -c$((2**32)) test.4gb
cut: byte/character offset ‘4294967296’ is too large
Try 'cut --help' for more information.
$ new-cut -c$((2**32)) test.4gb
1
$ _

Please note that numfmt now supports larger field indices as well,
because it shares this code.

No functional change on amd64.

Signed-off-by: Tobias Stoeckmann <address@hidden>
---
 src/cut.c                    | 10 +++++-----
 src/numfmt.c                 |  8 ++++----
 src/set-fields.c             | 34 +++++++++++++++++-----------------
 src/set-fields.h             |  4 ++--
 tests/misc/cut-huge-range.sh |  9 +++------
 tests/misc/numfmt.pl         |  4 ++--
 6 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index be2e67af4..5dc511af3 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -204,7 +204,7 @@ Each range is one of:\n\
    and if required CURRENT_RP.  */
 
 static inline void
-next_item (size_t *item_idx)
+next_item (uintmax_t *item_idx)
 {
   (*item_idx)++;
   if ((*item_idx) > current_rp->hi)
@@ -214,7 +214,7 @@ next_item (size_t *item_idx)
 /* Return nonzero if the K'th field or byte is printable. */
 
 static inline bool
-print_kth (size_t k)
+print_kth (uintmax_t k)
 {
   return current_rp->lo <= k;
 }
@@ -222,7 +222,7 @@ print_kth (size_t k)
 /* Return nonzero if K'th byte is the beginning of a range. */
 
 static inline bool
-is_range_start_index (size_t k)
+is_range_start_index (uintmax_t k)
 {
   return k == current_rp->lo;
 }
@@ -232,7 +232,7 @@ is_range_start_index (size_t k)
 static void
 cut_bytes (FILE *stream)
 {
-  size_t byte_idx;     /* Number of bytes in the line so far. */
+  uintmax_t byte_idx;  /* Number of bytes in the line so far. */
   /* Whether to begin printing delimiters between ranges for the current line.
      Set after we've begun printing data corresponding to the first range.  */
   bool print_delimiter;
@@ -286,7 +286,7 @@ static void
 cut_fields (FILE *stream)
 {
   int c;
-  size_t field_idx = 1;
+  uintmax_t field_idx = 1;
   bool found_any_selected_field = false;
   bool buffer_first_field;
 
diff --git a/src/numfmt.c b/src/numfmt.c
index 130e0388d..ce5c131e4 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -1351,13 +1351,13 @@ next_field (char **line)
 }
 
 static bool _GL_ATTRIBUTE_PURE
-include_field (size_t field)
+include_field (uintmax_t field)
 {
   struct field_range_pair *p = frp;
   if (!p)
     return field == 1;
 
-  while (p->lo != SIZE_MAX)
+  while (p->lo != UINTMAX_MAX)
     {
       if (p->lo <= field && p->hi >= field)
         return true;
@@ -1369,7 +1369,7 @@ include_field (size_t field)
 /* Convert and output the given field. If it is not included in the set
    of fields to process just output the original */
 static bool
-process_field (char *text, size_t field)
+process_field (char *text, uintmax_t field)
 {
   long double val = 0;
   size_t precision = 0;
@@ -1400,7 +1400,7 @@ static int
 process_line (char *line, bool newline)
 {
   char *next;
-  size_t field = 0;
+  uintmax_t field = 0;
   bool valid_number = true;
 
   while (true) {
diff --git a/src/set-fields.c b/src/set-fields.c
index 20687b293..0119e3f99 100644
--- a/src/set-fields.c
+++ b/src/set-fields.c
@@ -45,7 +45,7 @@ static size_t n_frp_allocated;
    space if necessary.  Update global variable N_FRP.  When allocating,
    update global variable N_FRP_ALLOCATED.  */
 static void
-add_range_pair (size_t lo, size_t hi)
+add_range_pair (uintmax_t lo, uintmax_t hi)
 {
   if (n_frp == n_frp_allocated)
     frp = X2NREALLOC (frp, &n_frp_allocated);
@@ -89,8 +89,8 @@ complement_rp (void)
       add_range_pair (c[i-1].hi + 1, c[i].lo - 1);
     }
 
-  if (c[n-1].hi < SIZE_MAX)
-    add_range_pair (c[n-1].hi + 1, SIZE_MAX);
+  if (c[n-1].hi < UINTMAX_MAX)
+    add_range_pair (c[n-1].hi + 1, UINTMAX_MAX);
 
   free (c);
 }
@@ -100,7 +100,7 @@ complement_rp (void)
    be composed of one or more numbers or ranges of numbers, separated
    by blanks or commas.  Incomplete ranges may be given: '-m' means '1-m';
    'n-' means 'n' through end of line.
-   n=0 and n>=SIZE_MAX values will trigger an error.
+   n=0 and n>=UINTMAX_MAX values will trigger an error.
 
    if SETFLD_ALLOW_DASH option is used, a single '-' means all fields
    (otherwise a single dash triggers an error).
@@ -121,24 +121,24 @@ complement_rp (void)
 
    The first field is stored as 1 (zero is not used).
    An open-ended range (i.e., until the last field of the input line)
-   is indicated with hi = SIZE_MAX.
+   is indicated with hi = UINTMAX_MAX.
 
-   A sentinel of SIZE_MAX/SIZE_MAX is always added as the last
+   A sentinel of UINTMAX_MAX/UINTMAX_MAX is always added as the last
    field range pair.
 
    Examples:
-   given '1-2,4', frp = [ { .lo = 1,        .hi = 2 },
-                          { .lo = 4,        .hi = 4 },
-                          { .lo = SIZE_MAX, .hi = SIZE_MAX } ];
+   given '1-2,4', frp = [ { .lo = 1,           .hi = 2 },
+                          { .lo = 4,           .hi = 4 },
+                          { .lo = UINTMAX_MAX, .hi = UINTMAX_MAX } ];
 
-   given '3-',    frp = [ { .lo = 3,        .hi = SIZE_MAX },
-                          { .lo = SIZE_MAX, .hi = SIZE_MAX } ];
+   given '3-',    frp = [ { .lo = 3,           .hi = UINTMAX_MAX },
+                          { .lo = UINTMAX_MAX, .hi = UINTMAX_MAX } ];
 */
 void
 set_fields (const char *fieldstr, unsigned int options)
 {
-  size_t initial = 1;          /* Value of first number in a range.  */
-  size_t value = 0;            /* If nonzero, a number being accumulated.  */
+  uintmax_t initial = 1;       /* Value of first number in a range.  */
+  uintmax_t value = 0;         /* If nonzero, a number being accumulated.  */
   bool lhs_specified = false;
   bool rhs_specified = false;
   bool dash_found = false;     /* True if a '-' is found in this field.  */
@@ -201,7 +201,7 @@ set_fields (const char *fieldstr, unsigned int options)
               if (!rhs_specified)
                 {
                   /* 'n-'.  From 'initial' to end of line. */
-                  add_range_pair (initial, SIZE_MAX);
+                  add_range_pair (initial, UINTMAX_MAX);
                 }
               else
                 {
@@ -247,8 +247,8 @@ set_fields (const char *fieldstr, unsigned int options)
             lhs_specified = 1;
 
           /* Detect overflow.  */
-          if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', size_t)
-              || value == SIZE_MAX)
+          if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', uintmax_t)
+              || value == UINTMAX_MAX)
             {
               /* In case the user specified -c$(echo 2^64|bc),22,
                  complain only about the first number.  */
@@ -307,7 +307,7 @@ set_fields (const char *fieldstr, unsigned int options)
      and for performance reasons.  */
   ++n_frp;
   frp = xrealloc (frp, n_frp * sizeof (struct field_range_pair));
-  frp[n_frp - 1].lo = frp[n_frp - 1].hi = SIZE_MAX;
+  frp[n_frp - 1].lo = frp[n_frp - 1].hi = UINTMAX_MAX;
 }
 
 void
diff --git a/src/set-fields.h b/src/set-fields.h
index b9bab1fd5..06f5ba8f3 100644
--- a/src/set-fields.h
+++ b/src/set-fields.h
@@ -19,8 +19,8 @@
 
 struct field_range_pair
   {
-    size_t lo;
-    size_t hi;
+    uintmax_t lo;
+    uintmax_t hi;
   };
 
 /* Array of `struct range_pair' holding all the finite ranges. */
diff --git a/tests/misc/cut-huge-range.sh b/tests/misc/cut-huge-range.sh
index e7c17c222..01936266f 100755
--- a/tests/misc/cut-huge-range.sh
+++ b/tests/misc/cut-huge-range.sh
@@ -44,11 +44,9 @@ subtract_one='
 '
 
 # Ensure we can cut up to our sentinel value.
-# This is currently SIZE_MAX, but could be raised to UINTMAX_MAX
-# if we didn't allocate memory for each line as a unit.
 # Don't use expr to subtract one,
-# since SIZE_MAX may exceed its maximum value.
-CUT_MAX=$(echo $SIZE_MAX | sed "$subtract_one")
+# since UINTMAX_MAX may exceed its maximum value.
+CUT_MAX=$(echo $UINTMAX_MAX | sed "$subtract_one")
 
 # From coreutils-8.10 through 8.20, this would make cut try to allocate
 # a 256MiB bit vector.
@@ -59,8 +57,7 @@ CUT_MAX=$(echo $SIZE_MAX | sed "$subtract_one")
 (ulimit -v $vm && cut -b1-$CUT_MAX /dev/null >> err 2>&1) || fail=1
 
 # Explicitly disallow values above CUT_MAX
-(ulimit -v $vm && returns_ 1 cut -b$SIZE_MAX /dev/null 2>/dev/null) || fail=1
-(ulimit -v $vm && returns_ 1 cut -b$SIZE_OFLOW /dev/null 2>/dev/null) || fail=1
+(ulimit -v $vm && returns_ 1 cut -b$UINTMAX_MAX /dev/null 2>/dev/null) || fail=1
 
 compare /dev/null err || fail=1
 
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 6b3623399..dee4a1d58 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -308,9 +308,9 @@ my @Tests =
              {EXIT=>1}, {ERR=>"$prog: invalid field range\n$try"}],
      ['field-range-err-12','--field 0-1 --to=si 10',
              {EXIT=>1}, {ERR=>"$prog: fields are numbered from 1\n$try"}],
-     ['field-range-err-13','--field '.$limits->{SIZE_MAX}.',22 --to=si 10',
+     ['field-range-err-13','--field '.$limits->{UINTMAX_MAX}.',22 --to=si 10',
              {EXIT=>1}, {ERR=>"$prog: field number " .
-                              "'".$limits->{SIZE_MAX}."' is too large\n$try"}],
+                              "'".$limits->{UINTMAX_MAX}."' is too large\n$try"}],
 
      # Auto-consume white-space, setup auto-padding
      ['whitespace-1', '--to=si --field 2 "A    500 B"', {OUT=>"A    500 B"}],
-- 
2.16.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]