>From 7480f48cd587b77d94feacea71163e2fc7d694f2 Mon Sep 17 00:00:00 2001
From: Assaf Gordon
Date: Fri, 10 Jul 2015 20:46:46 -0400
Subject: [PATCH] numfmt: improve --field parsing
* src/numfmt.c: make '--field' more similar to cut's '--fields'.
set_fields() - new function (replaces parse_field_arg()), copied from
src/cut.c with few minor changes: modified error messages;
accept '-' as all fields.
free_field(), match_field(), struct range_pair:
replaced/modified to be more similar to src/cut.c.
include_field() - modified to use new data structure.
* tests/misc/numfmt.pl - add more tests.
---
src/numfmt.c | 338 ++++++++++++++++++++++++++++++---------------------
tests/misc/numfmt.pl | 64 ++++++++++
2 files changed, 265 insertions(+), 137 deletions(-)
diff --git a/src/numfmt.c b/src/numfmt.c
index 35c5c5b..e73e7e4 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -29,8 +29,6 @@
#include "system.h"
#include "xstrtol.h"
#include "xstrndup.h"
-#include "gl_linked_list.h"
-#include "gl_xlist.h"
#if HAVE_FPSETPREC
# include
@@ -189,10 +187,6 @@ static int conv_exit_code = EXIT_CONVERSION_WARNINGS;
/* auto-pad each line based on skipped whitespace. */
static int auto_padding = 0;
static mbs_align_t padding_alignment = MBS_ALIGN_RIGHT;
-static bool all_fields = false;
-static size_t all_fields_after = 0;
-static size_t all_fields_before = 0;
-static gl_list_t field_list;
static int delimiter = DELIMITER_DEFAULT;
/* if non-zero, the first 'header' lines from STDIN are skipped. */
@@ -209,6 +203,36 @@ static int decimal_point_length;
/* debugging for developers. Enables devmsg(). */
static bool dev_debug = false;
+struct range_pair
+ {
+ size_t lo;
+ size_t hi;
+ };
+
+/* Array of `struct range_pair' holding all the finite ranges. */
+static struct range_pair *rp;
+
+/* Number of finite ranges specified by the user. */
+static size_t n_rp;
+
+/* Number of `struct range_pair's allocated. */
+static size_t n_rp_allocated;
+
+
+/* Append LOW, HIGH to the list RP of range pairs, allocating additional
+ space if necessary. Update global variable N_RP. When allocating,
+ update global variable N_RP_ALLOCATED. */
+
+static void
+add_range_pair (size_t lo, size_t hi)
+{
+ if (n_rp == n_rp_allocated)
+ rp = X2NREALLOC (rp, &n_rp_allocated);
+ rp[n_rp].lo = lo;
+ rp[n_rp].hi = hi;
+ ++n_rp;
+}
+
static inline int
default_scale_base (enum scale_type scale)
{
@@ -1313,139 +1337,186 @@ process_suffixed_number (char *text, long double *result,
return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
}
-typedef struct range_pair
-{
- size_t lo;
- size_t hi;
-} range_pair_t;
-
-static int
-sort_field (const void *elt1, const void *elt2)
-{
- range_pair_t* rp1 = (range_pair_t*) elt1;
- range_pair_t* rp2 = (range_pair_t*) elt2;
-
- if (rp1->lo < rp2->lo)
- return -1;
-
- return rp1->lo > rp2->lo;
-}
-
+/* Comparison function for qsort to order the list of
+ struct range_pairs. */
static int
-match_field (const void *elt1, const void *elt2)
+compare_ranges (const void *a, const void *b)
{
- range_pair_t* rp = (range_pair_t*) elt1;
- size_t field = *(size_t*) elt2;
-
- if (rp->lo <= field && field <= rp->hi)
- return 0;
-
- if (rp->lo < field)
- return -1;
-
- return 1;
+ int a_start = ((const struct range_pair *) a)->lo;
+ int b_start = ((const struct range_pair *) b)->lo;
+ return a_start < b_start ? -1 : a_start > b_start;
}
-static void
-free_field (const void *elt)
-{
- void *p = (void *)elt;
- free (p);
-}
+/* Given the list of field or byte range specifications FIELDSTR,
+ allocate and initialize the RP array. FIELDSTR should
+ be composed of one or more numbers or ranges of numbers, separated
+ by blanks or commas. Incomplete ranges may be given: '-m' means '1-m';
+ 'n-' means 'n' through end of line.
+ Return true if FIELDSTR contains at least one field specification,
+ false otherwise. */
-/* Add the specified fields to field_list.
- The format recognized is similar to cut.
- TODO: Refactor the more performant cut implementation
- for use by both utilities. */
-static void
-parse_field_arg (char *arg)
+static bool
+set_fields (const char *fieldstr)
{
+ size_t initial = 1; /* Value of first number in a range. */
+ size_t value = 0; /* If nonzero, a number being accumulated. */
+ bool lhs_specified = false;
+ bool rhs_specified = false;
+ bool dash_found = false; /* True if a '-' is found in this field. */
+ bool field_found = false; /* True if at least one field spec
+ has been processed. */
- char *start, *end;
- range_pair_t *rp;
- size_t field_val;
- size_t range_val = 0;
-
- start = end = arg;
+ size_t i;
+ bool in_digits = false;
- if (STREQ (arg, "-"))
+ /* Special case: '--field=-' means all fields (unlike 'cut'),
+ emulate '--field=1-' . */
+ if (STREQ (fieldstr,"-"))
{
- all_fields = true;
-
- return;
+ value = 1;
+ lhs_specified = true;
+ dash_found = true;
+ fieldstr++;
}
- if (*start == '-')
- {
- /* range -M */
- ++start;
+ /* Collect and store in RP the range end points. */
- all_fields_before = strtol (start, &end, 10);
+ while (true)
+ {
+ if (*fieldstr == '-')
+ {
+ in_digits = false;
+ /* Starting a range. */
+ if (dash_found)
+ error (EXIT_FAILURE, 0, _("invalid field range"));
- if (start == end || all_fields_before <=0)
- error (EXIT_FAILURE, 0, _("invalid field value %s"),
- quote (start));
+ dash_found = true;
+ fieldstr++;
- return;
- }
+ if (lhs_specified && !value)
+ error (EXIT_FAILURE, 0, _("invalid field value '0'"));
- field_list = gl_list_create_empty (GL_LINKED_LIST,
- NULL, NULL, free_field, false);
+ initial = (lhs_specified ? value : 1);
+ value = 0;
+ }
+ else if (*fieldstr == ','
+ || isblank (to_uchar (*fieldstr)) || *fieldstr == '\0')
+ {
+ in_digits = false;
+ /* Ending the string, or this field/byte sublist. */
+ if (dash_found)
+ {
+ dash_found = false;
+
+ /* unlike cut's '-f', a lone '-' means 'all fields'.
+ this happens with '--fields 3,-' .
+ fall-through and add all fields. */
+ if (!lhs_specified && !rhs_specified)
+ initial = 1;
+
+ /* A range. Possibilities: -n, m-n, n-.
+ In any case, 'initial' contains the start of the range. */
+ if (!rhs_specified)
+ {
+ /* 'n-'. From 'initial' to end of line. */
+ add_range_pair (initial, SIZE_MAX);
+ field_found = true;
+ }
+ else
+ {
+ /* 'm-n' or '-n' (1-n). */
+ if (value < initial)
+ error (EXIT_FAILURE, 0, _("invalid decreasing range"));
+
+ add_range_pair (initial, value);
+ field_found = true;
+ }
+ value = 0;
+ }
+ else
+ {
+ /* A simple field number, not a range. */
+ if (value == 0)
+ error (EXIT_FAILURE, 0, _("invalid field value '0'"));
- while (*end != '\0') {
- field_val = strtol (start, &end, 10);
+ add_range_pair (value, value);
+ value = 0;
+ field_found = true;
+ }
- if (start == end || field_val <=0)
- error (EXIT_FAILURE, 0, _("invalid field value %s"),
- quote (start));
+ if (*fieldstr == '\0')
+ break;
- if (! range_val)
- {
- /* field N */
- rp = xmalloc (sizeof (*rp));
- rp->lo = rp->hi = field_val;
- gl_sortedlist_add (field_list, sort_field, rp);
- }
- else
- {
- /* range N-M
- The last field was the start of the field range. The current
- field is the end of the field range. We already added the
- start field, so increment and add all the fields through
- range end. */
- if (field_val < range_val)
- error (EXIT_FAILURE, 0, _("invalid decreasing range"));
- rp = xmalloc (sizeof (*rp));
- rp->lo = range_val + 1;
- rp->hi = field_val;
- gl_sortedlist_add (field_list, sort_field, rp);
-
- range_val = 0;
- }
+ fieldstr++;
+ lhs_specified = false;
+ rhs_specified = false;
+ }
+ else if (ISDIGIT (*fieldstr))
+ {
+ /* Record beginning of digit string, in case we have to
+ complain about it. */
+ static char const *num_start;
+ if (!in_digits || !num_start)
+ num_start = fieldstr;
+ in_digits = true;
+
+ if (dash_found)
+ rhs_specified = 1;
+ else
+ lhs_specified = 1;
- switch (*end) {
- case ',':
- /* discrete field separator */
- ++end;
- start = end;
- break;
+ /* Detect overflow. */
+ if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', size_t)
+ || value == SIZE_MAX)
+ {
+ /* In case the user specified -c$(echo 2^64|bc),22,
+ complain only about the first number. */
+ /* Determine the length of the offending number. */
+ size_t len = strspn (num_start, "0123456789");
+ char *bad_num = xstrndup (num_start, len);
+ error (0, 0,
+ _("field number %s is too large"), quote (bad_num));
+ free (bad_num);
+ exit (EXIT_FAILURE);
+ }
- case '-':
- /* field range separator */
- ++end;
- start = end;
- range_val = field_val;
- break;
+ fieldstr++;
+ }
+ else
+ error (EXIT_FAILURE, 0, _("invalid field value %s"),
+ quote (fieldstr));
}
- }
- if (range_val)
+ qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
+
+ /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */
+ for (i = 0; i < n_rp; ++i)
{
- /* range N-
- range_val was not reset indicating ARG
- ended with a trailing '-' */
- all_fields_after = range_val;
+ for (size_t j = i + 1; j < n_rp; ++j)
+ {
+ if (rp[j].lo <= rp[i].hi)
+ {
+ rp[i].hi = MAX (rp[j].hi, rp[i].hi);
+ memmove (rp + j, rp + j + 1, (n_rp - j - 1) * sizeof *rp);
+ n_rp--;
+ j--;
+ }
+ else
+ break;
+ }
}
+
+ /* TODO: optionally support --complement, like 'cut' */
+ /* complement_rp (); */
+
+ /* After merging, reallocate RP so we release memory to the system.
+ Also add a sentinel at the end of RP, to avoid out of bounds access
+ and for performance reasons. */
+ ++n_rp;
+ rp = xrealloc (rp, n_rp * sizeof (struct range_pair));
+ rp[n_rp - 1].lo = rp[n_rp - 1].hi = SIZE_MAX;
+
+ return field_found;
}
/* Return a pointer to the beginning of the next field in line.
@@ -1479,23 +1550,20 @@ next_field (char **line)
return field_start;
}
-static bool
+static bool _GL_ATTRIBUTE_PURE
include_field (size_t field)
{
- if (all_fields)
- return true;
-
- if (all_fields_after && all_fields_after <= field)
- return true;
-
- if (all_fields_before && field <= all_fields_before)
- return true;
-
- /* default to field 1 */
- if (! field_list)
+ struct range_pair *p = rp;
+ if (!p)
return field == 1;
- return gl_sortedlist_search (field_list, match_field, &field);
+ while (p->lo != SIZE_MAX)
+ {
+ if (p->lo <= field && p->hi >= field)
+ return true;
+ ++p;
+ }
+ return false;
}
/* Convert and output the given field. If it is not included in the set
@@ -1640,12 +1708,10 @@ main (int argc, char **argv)
break;
case FIELD_OPTION:
- if (all_fields || all_fields_before || all_fields_after || field_list)
- {
- error (EXIT_FAILURE, 0,
- _("multiple field specifications"));
- }
- parse_field_arg (optarg);
+ if (n_rp)
+ error (EXIT_FAILURE, 0, _("multiple field specifications"));
+ if (!set_fields (optarg))
+ error (EXIT_FAILURE, 0, _("missing list of fields"));
break;
case 'd':
@@ -1761,9 +1827,7 @@ main (int argc, char **argv)
free (padding_buffer);
free (format_str_prefix);
free (format_str_suffix);
-
- if (field_list)
- gl_list_free (field_list);
+ free (rp);
#endif
if (debug && !valid_numbers)
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 0e4dc79..38a005e 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -197,6 +197,8 @@ my @Tests =
['delim-4', '--delimiter=: --from=auto 40M:60M', {OUT=>'40000000:60M'}],
['delim-5', '-d: --field=2 --from=auto :40M:60M', {OUT=>':40000000:60M'}],
['delim-6', '-d: --field 3 --from=auto 40M:60M', {OUT=>"40M:60M"}],
+ ['delim-err-1', '-d,, --to=si 1', {EXIT=>1},
+ {ERR => "$prog: the delimiter must be a single character\n"}],
#Fields
['field-1', '--field A',
@@ -244,9 +246,71 @@ my @Tests =
['field-range-7', '--field -3 --to=si "1000 2000 3000 4000 5000"',
{OUT=>"1.0K 2.0K 3.0K 4000 5000"}],
+ ['field-range-8', '--field 1-2,4-5 --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3000 4.0K 5.0K"}],
+ ['field-range-9', '--field 4-5,1-2 --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3000 4.0K 5.0K"}],
+
+ ['field-range-10','--field 1-3,2-4 --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3.0K 4.0K 5000"}],
+ ['field-range-11','--field 2-4,1-3 --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3.0K 4.0K 5000"}],
+
+ ['field-range-12','--field 1-1,3-3 --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2000 3.0K 4000 5000"}],
+
+ ['field-range-13', '--field 1,-2 --to=si "1000 2000 3000"',
+ {OUT=>"1.0K 2.0K 3000"}],
+
+ ['field-range-14', '--field -2,4- --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3000 4.0K 5.0K"}],
+ ['field-range-15', '--field -2,-4 --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3.0K 4.0K 5000"}],
+ ['field-range-16', '--field 2-,4- --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1000 2.0K 3.0K 4.0K 5.0K"}],
+ ['field-range-17', '--field 4-,2- --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1000 2.0K 3.0K 4.0K 5.0K"}],
+
+ # white space are valid field separators
+ # (undocumented? but works in cut as well).
+ ['field-range-18', '--field "1,2 4" --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3000 4.0K 5000"}],
+
+ # Unlike 'cut', a lone '-' means 'all fields', even as part of a list
+ # of fields.
+ ['field-range-19','--field 3,- --to=si "1000 2000 3000 4000 5000"',
+ {OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}],
+
['all-fields-1', '--field=- --to=si "1000 2000 3000 4000 5000"',
{OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}],
+ ['field-range-err-1', '--field -foo --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: invalid field value 'foo'\n"}],
+ ['field-range-err-2', '--field --3 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: invalid field range\n"}],
+ ['field-range-err-3', '--field 0 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: invalid field value '0'\n"}],
+ ['field-range-err-4', '--field 3-2 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: invalid decreasing range\n"}],
+ ['field-range-err-6', '--field - --field 1- --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}],
+ ['field-range-err-7', '--field -1 --field 1- --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}],
+ ['field-range-err-8', '--field -1 --field 1,2,3 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}],
+ ['field-range-err-9', '--field 1- --field 1,2,3 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}],
+ ['field-range-err-10','--field 1,2,3 --field 1- --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}],
+ ['field-range-err-11','--field 1-2-3 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: invalid field range\n"}],
+ ['field-range-err-12','--field 0-1 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: invalid field value '0'\n"}],
+ ['field-range-err-13','--field 9918446744073709551616,22 --to=si 10',
+ {EXIT=>1}, {ERR=>"$prog: field number " .
+ "'9918446744073709551616' is too large\n"}],
+
+
# Auto-consume white-space, setup auto-padding
['whitespace-1', '--to=si --field 2 "A 500 B"', {OUT=>"A 500 B"}],
['whitespace-2', '--to=si --field 2 "A 5000 B"', {OUT=>"A 5.0K B"}],
--
1.9.1