>From 7480f48cd587b77d94feacea71163e2fc7d694f2 Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Fri, 10 Jul 2015 20:46:46 -0400 Subject: [PATCH] numfmt: improve --field parsing * src/numfmt.c: make '--field' more similar to cut's '--fields'. set_fields() - new function (replaces parse_field_arg()), copied from src/cut.c with few minor changes: modified error messages; accept '-' as all fields. free_field(), match_field(), struct range_pair: replaced/modified to be more similar to src/cut.c. include_field() - modified to use new data structure. * tests/misc/numfmt.pl - add more tests. --- src/numfmt.c | 338 ++++++++++++++++++++++++++++++--------------------- tests/misc/numfmt.pl | 64 ++++++++++ 2 files changed, 265 insertions(+), 137 deletions(-) diff --git a/src/numfmt.c b/src/numfmt.c index 35c5c5b..e73e7e4 100644 --- a/src/numfmt.c +++ b/src/numfmt.c @@ -29,8 +29,6 @@ #include "system.h" #include "xstrtol.h" #include "xstrndup.h" -#include "gl_linked_list.h" -#include "gl_xlist.h" #if HAVE_FPSETPREC # include @@ -189,10 +187,6 @@ static int conv_exit_code = EXIT_CONVERSION_WARNINGS; /* auto-pad each line based on skipped whitespace. */ static int auto_padding = 0; static mbs_align_t padding_alignment = MBS_ALIGN_RIGHT; -static bool all_fields = false; -static size_t all_fields_after = 0; -static size_t all_fields_before = 0; -static gl_list_t field_list; static int delimiter = DELIMITER_DEFAULT; /* if non-zero, the first 'header' lines from STDIN are skipped. */ @@ -209,6 +203,36 @@ static int decimal_point_length; /* debugging for developers. Enables devmsg(). */ static bool dev_debug = false; +struct range_pair + { + size_t lo; + size_t hi; + }; + +/* Array of `struct range_pair' holding all the finite ranges. */ +static struct range_pair *rp; + +/* Number of finite ranges specified by the user. */ +static size_t n_rp; + +/* Number of `struct range_pair's allocated. */ +static size_t n_rp_allocated; + + +/* Append LOW, HIGH to the list RP of range pairs, allocating additional + space if necessary. Update global variable N_RP. When allocating, + update global variable N_RP_ALLOCATED. */ + +static void +add_range_pair (size_t lo, size_t hi) +{ + if (n_rp == n_rp_allocated) + rp = X2NREALLOC (rp, &n_rp_allocated); + rp[n_rp].lo = lo; + rp[n_rp].hi = hi; + ++n_rp; +} + static inline int default_scale_base (enum scale_type scale) { @@ -1313,139 +1337,186 @@ process_suffixed_number (char *text, long double *result, return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS); } -typedef struct range_pair -{ - size_t lo; - size_t hi; -} range_pair_t; - -static int -sort_field (const void *elt1, const void *elt2) -{ - range_pair_t* rp1 = (range_pair_t*) elt1; - range_pair_t* rp2 = (range_pair_t*) elt2; - - if (rp1->lo < rp2->lo) - return -1; - - return rp1->lo > rp2->lo; -} - +/* Comparison function for qsort to order the list of + struct range_pairs. */ static int -match_field (const void *elt1, const void *elt2) +compare_ranges (const void *a, const void *b) { - range_pair_t* rp = (range_pair_t*) elt1; - size_t field = *(size_t*) elt2; - - if (rp->lo <= field && field <= rp->hi) - return 0; - - if (rp->lo < field) - return -1; - - return 1; + int a_start = ((const struct range_pair *) a)->lo; + int b_start = ((const struct range_pair *) b)->lo; + return a_start < b_start ? -1 : a_start > b_start; } -static void -free_field (const void *elt) -{ - void *p = (void *)elt; - free (p); -} +/* Given the list of field or byte range specifications FIELDSTR, + allocate and initialize the RP array. FIELDSTR should + be composed of one or more numbers or ranges of numbers, separated + by blanks or commas. Incomplete ranges may be given: '-m' means '1-m'; + 'n-' means 'n' through end of line. + Return true if FIELDSTR contains at least one field specification, + false otherwise. */ -/* Add the specified fields to field_list. - The format recognized is similar to cut. - TODO: Refactor the more performant cut implementation - for use by both utilities. */ -static void -parse_field_arg (char *arg) +static bool +set_fields (const char *fieldstr) { + size_t initial = 1; /* Value of first number in a range. */ + size_t value = 0; /* If nonzero, a number being accumulated. */ + bool lhs_specified = false; + bool rhs_specified = false; + bool dash_found = false; /* True if a '-' is found in this field. */ + bool field_found = false; /* True if at least one field spec + has been processed. */ - char *start, *end; - range_pair_t *rp; - size_t field_val; - size_t range_val = 0; - - start = end = arg; + size_t i; + bool in_digits = false; - if (STREQ (arg, "-")) + /* Special case: '--field=-' means all fields (unlike 'cut'), + emulate '--field=1-' . */ + if (STREQ (fieldstr,"-")) { - all_fields = true; - - return; + value = 1; + lhs_specified = true; + dash_found = true; + fieldstr++; } - if (*start == '-') - { - /* range -M */ - ++start; + /* Collect and store in RP the range end points. */ - all_fields_before = strtol (start, &end, 10); + while (true) + { + if (*fieldstr == '-') + { + in_digits = false; + /* Starting a range. */ + if (dash_found) + error (EXIT_FAILURE, 0, _("invalid field range")); - if (start == end || all_fields_before <=0) - error (EXIT_FAILURE, 0, _("invalid field value %s"), - quote (start)); + dash_found = true; + fieldstr++; - return; - } + if (lhs_specified && !value) + error (EXIT_FAILURE, 0, _("invalid field value '0'")); - field_list = gl_list_create_empty (GL_LINKED_LIST, - NULL, NULL, free_field, false); + initial = (lhs_specified ? value : 1); + value = 0; + } + else if (*fieldstr == ',' + || isblank (to_uchar (*fieldstr)) || *fieldstr == '\0') + { + in_digits = false; + /* Ending the string, or this field/byte sublist. */ + if (dash_found) + { + dash_found = false; + + /* unlike cut's '-f', a lone '-' means 'all fields'. + this happens with '--fields 3,-' . + fall-through and add all fields. */ + if (!lhs_specified && !rhs_specified) + initial = 1; + + /* A range. Possibilities: -n, m-n, n-. + In any case, 'initial' contains the start of the range. */ + if (!rhs_specified) + { + /* 'n-'. From 'initial' to end of line. */ + add_range_pair (initial, SIZE_MAX); + field_found = true; + } + else + { + /* 'm-n' or '-n' (1-n). */ + if (value < initial) + error (EXIT_FAILURE, 0, _("invalid decreasing range")); + + add_range_pair (initial, value); + field_found = true; + } + value = 0; + } + else + { + /* A simple field number, not a range. */ + if (value == 0) + error (EXIT_FAILURE, 0, _("invalid field value '0'")); - while (*end != '\0') { - field_val = strtol (start, &end, 10); + add_range_pair (value, value); + value = 0; + field_found = true; + } - if (start == end || field_val <=0) - error (EXIT_FAILURE, 0, _("invalid field value %s"), - quote (start)); + if (*fieldstr == '\0') + break; - if (! range_val) - { - /* field N */ - rp = xmalloc (sizeof (*rp)); - rp->lo = rp->hi = field_val; - gl_sortedlist_add (field_list, sort_field, rp); - } - else - { - /* range N-M - The last field was the start of the field range. The current - field is the end of the field range. We already added the - start field, so increment and add all the fields through - range end. */ - if (field_val < range_val) - error (EXIT_FAILURE, 0, _("invalid decreasing range")); - rp = xmalloc (sizeof (*rp)); - rp->lo = range_val + 1; - rp->hi = field_val; - gl_sortedlist_add (field_list, sort_field, rp); - - range_val = 0; - } + fieldstr++; + lhs_specified = false; + rhs_specified = false; + } + else if (ISDIGIT (*fieldstr)) + { + /* Record beginning of digit string, in case we have to + complain about it. */ + static char const *num_start; + if (!in_digits || !num_start) + num_start = fieldstr; + in_digits = true; + + if (dash_found) + rhs_specified = 1; + else + lhs_specified = 1; - switch (*end) { - case ',': - /* discrete field separator */ - ++end; - start = end; - break; + /* Detect overflow. */ + if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', size_t) + || value == SIZE_MAX) + { + /* In case the user specified -c$(echo 2^64|bc),22, + complain only about the first number. */ + /* Determine the length of the offending number. */ + size_t len = strspn (num_start, "0123456789"); + char *bad_num = xstrndup (num_start, len); + error (0, 0, + _("field number %s is too large"), quote (bad_num)); + free (bad_num); + exit (EXIT_FAILURE); + } - case '-': - /* field range separator */ - ++end; - start = end; - range_val = field_val; - break; + fieldstr++; + } + else + error (EXIT_FAILURE, 0, _("invalid field value %s"), + quote (fieldstr)); } - } - if (range_val) + qsort (rp, n_rp, sizeof (rp[0]), compare_ranges); + + /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */ + for (i = 0; i < n_rp; ++i) { - /* range N- - range_val was not reset indicating ARG - ended with a trailing '-' */ - all_fields_after = range_val; + for (size_t j = i + 1; j < n_rp; ++j) + { + if (rp[j].lo <= rp[i].hi) + { + rp[i].hi = MAX (rp[j].hi, rp[i].hi); + memmove (rp + j, rp + j + 1, (n_rp - j - 1) * sizeof *rp); + n_rp--; + j--; + } + else + break; + } } + + /* TODO: optionally support --complement, like 'cut' */ + /* complement_rp (); */ + + /* After merging, reallocate RP so we release memory to the system. + Also add a sentinel at the end of RP, to avoid out of bounds access + and for performance reasons. */ + ++n_rp; + rp = xrealloc (rp, n_rp * sizeof (struct range_pair)); + rp[n_rp - 1].lo = rp[n_rp - 1].hi = SIZE_MAX; + + return field_found; } /* Return a pointer to the beginning of the next field in line. @@ -1479,23 +1550,20 @@ next_field (char **line) return field_start; } -static bool +static bool _GL_ATTRIBUTE_PURE include_field (size_t field) { - if (all_fields) - return true; - - if (all_fields_after && all_fields_after <= field) - return true; - - if (all_fields_before && field <= all_fields_before) - return true; - - /* default to field 1 */ - if (! field_list) + struct range_pair *p = rp; + if (!p) return field == 1; - return gl_sortedlist_search (field_list, match_field, &field); + while (p->lo != SIZE_MAX) + { + if (p->lo <= field && p->hi >= field) + return true; + ++p; + } + return false; } /* Convert and output the given field. If it is not included in the set @@ -1640,12 +1708,10 @@ main (int argc, char **argv) break; case FIELD_OPTION: - if (all_fields || all_fields_before || all_fields_after || field_list) - { - error (EXIT_FAILURE, 0, - _("multiple field specifications")); - } - parse_field_arg (optarg); + if (n_rp) + error (EXIT_FAILURE, 0, _("multiple field specifications")); + if (!set_fields (optarg)) + error (EXIT_FAILURE, 0, _("missing list of fields")); break; case 'd': @@ -1761,9 +1827,7 @@ main (int argc, char **argv) free (padding_buffer); free (format_str_prefix); free (format_str_suffix); - - if (field_list) - gl_list_free (field_list); + free (rp); #endif if (debug && !valid_numbers) diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl index 0e4dc79..38a005e 100755 --- a/tests/misc/numfmt.pl +++ b/tests/misc/numfmt.pl @@ -197,6 +197,8 @@ my @Tests = ['delim-4', '--delimiter=: --from=auto 40M:60M', {OUT=>'40000000:60M'}], ['delim-5', '-d: --field=2 --from=auto :40M:60M', {OUT=>':40000000:60M'}], ['delim-6', '-d: --field 3 --from=auto 40M:60M', {OUT=>"40M:60M"}], + ['delim-err-1', '-d,, --to=si 1', {EXIT=>1}, + {ERR => "$prog: the delimiter must be a single character\n"}], #Fields ['field-1', '--field A', @@ -244,9 +246,71 @@ my @Tests = ['field-range-7', '--field -3 --to=si "1000 2000 3000 4000 5000"', {OUT=>"1.0K 2.0K 3.0K 4000 5000"}], + ['field-range-8', '--field 1-2,4-5 --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3000 4.0K 5.0K"}], + ['field-range-9', '--field 4-5,1-2 --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3000 4.0K 5.0K"}], + + ['field-range-10','--field 1-3,2-4 --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3.0K 4.0K 5000"}], + ['field-range-11','--field 2-4,1-3 --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3.0K 4.0K 5000"}], + + ['field-range-12','--field 1-1,3-3 --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2000 3.0K 4000 5000"}], + + ['field-range-13', '--field 1,-2 --to=si "1000 2000 3000"', + {OUT=>"1.0K 2.0K 3000"}], + + ['field-range-14', '--field -2,4- --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3000 4.0K 5.0K"}], + ['field-range-15', '--field -2,-4 --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3.0K 4.0K 5000"}], + ['field-range-16', '--field 2-,4- --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1000 2.0K 3.0K 4.0K 5.0K"}], + ['field-range-17', '--field 4-,2- --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1000 2.0K 3.0K 4.0K 5.0K"}], + + # white space are valid field separators + # (undocumented? but works in cut as well). + ['field-range-18', '--field "1,2 4" --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3000 4.0K 5000"}], + + # Unlike 'cut', a lone '-' means 'all fields', even as part of a list + # of fields. + ['field-range-19','--field 3,- --to=si "1000 2000 3000 4000 5000"', + {OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}], + ['all-fields-1', '--field=- --to=si "1000 2000 3000 4000 5000"', {OUT=>"1.0K 2.0K 3.0K 4.0K 5.0K"}], + ['field-range-err-1', '--field -foo --to=si 10', + {EXIT=>1}, {ERR=>"$prog: invalid field value 'foo'\n"}], + ['field-range-err-2', '--field --3 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: invalid field range\n"}], + ['field-range-err-3', '--field 0 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: invalid field value '0'\n"}], + ['field-range-err-4', '--field 3-2 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: invalid decreasing range\n"}], + ['field-range-err-6', '--field - --field 1- --to=si 10', + {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}], + ['field-range-err-7', '--field -1 --field 1- --to=si 10', + {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}], + ['field-range-err-8', '--field -1 --field 1,2,3 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}], + ['field-range-err-9', '--field 1- --field 1,2,3 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}], + ['field-range-err-10','--field 1,2,3 --field 1- --to=si 10', + {EXIT=>1}, {ERR=>"$prog: multiple field specifications\n"}], + ['field-range-err-11','--field 1-2-3 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: invalid field range\n"}], + ['field-range-err-12','--field 0-1 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: invalid field value '0'\n"}], + ['field-range-err-13','--field 9918446744073709551616,22 --to=si 10', + {EXIT=>1}, {ERR=>"$prog: field number " . + "'9918446744073709551616' is too large\n"}], + + # Auto-consume white-space, setup auto-padding ['whitespace-1', '--to=si --field 2 "A 500 B"', {OUT=>"A 500 B"}], ['whitespace-2', '--to=si --field 2 "A 5000 B"', {OUT=>"A 5.0K B"}], -- 1.9.1