From 99a9c4c7d7a828a917e14f7e5241921a4dc77909 Mon Sep 17 00:00:00 2001 From: Drew Frank Date: Thu, 1 Mar 2012 14:24:49 -0800 Subject: [PATCH] join: add numeric sort feature (-g and -n flags). * src/join.c: add flags and implement numeric comparison features. * tests/misc/join: add three tests for numerically sorted key fields. * doc/coreutils.texi: document new functionality. --- doc/coreutils.texi | 16 ++++++ gnulib | 2 +- src/join.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++-- tests/misc/join | 8 +++ 4 files changed, 178 insertions(+), 5 deletions(-) diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 10be715..017118f 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5798,6 +5798,14 @@ specified format. The header lines will not be checked for ordering even if @option{--check-order} is specified. Also if the header lines from each file do not match, the heading fields from the first file will be used. address@hidden -g address@hidden --general-numeric-sort address@hidden -g address@hidden --general-numeric-sort +Compute general numerical value when comparing keys. +With this option, the lines of the input files must be ordered in the same way. +Use @samp{sort -g} to produce this ordering. + @item -i @itemx --ignore-case @opindex -i @@ -5806,6 +5814,14 @@ Ignore differences in case when comparing keys. With this option, the lines of the input files must be ordered in the same way. Use @samp{sort -f} to produce this ordering. address@hidden -n address@hidden --numeric-sort address@hidden -n address@hidden --numeric-sort +Compute string numerical value when comparing keys. +With this option, the lines of the input files must be ordered in the same way. +Use @samp{sort -n} to produce this ordering. + @item -1 @var{field} @opindex -1 Join on field @var{field} (a positive integer) of file 1. diff --git a/gnulib b/gnulib index 4730c3e..cbc11ff 160000 --- a/gnulib +++ b/gnulib @@ -1 +1 @@ -Subproject commit 4730c3e3692b344effb72d46b3ff92db0bdb797a +Subproject commit cbc11ff0020eb9c04caea6b3e7dc4e4281dff1f9 diff --git a/src/join.c b/src/join.c index b92c1f8..95dac47 100644 --- a/src/join.c +++ b/src/join.c @@ -30,6 +30,7 @@ #include "memcasecmp.h" #include "quote.h" #include "stdio--.h" +#include "strnumcmp.h" #include "xmemcoll.h" #include "xstrtol.h" #include "argmatch.h" @@ -47,6 +48,16 @@ b = tmp; \ } while (0); +#define UCHAR_LIM (UCHAR_MAX + 1) + +#if HAVE_C99_STRTOLD +# define long_double long double +#else +# define long_double double +# undef strtold +# define strtold strtod +#endif + /* An element of the list identifying which fields to print for each output line. */ struct outlist @@ -100,6 +111,12 @@ static char *g_names[2]; want to overwrite the previous buffer before we check order. */ static struct line *spareline[2] = {NULL, NULL}; +/* The representation of the decimal point in the current locale. */ +static int decimal_point; + +/* Thousands separator; if -1, then there isn't one. */ +static int thousands_sep; + /* True if the LC_COLLATE locale is hard. */ static bool hard_LC_COLLATE; @@ -140,6 +157,9 @@ static struct outlist *outlist_end = &outlist_head; tab character whose value (when cast to unsigned char) equals TAB. */ static int tab = -1; +/* Table of blanks. */ +static bool blanks[UCHAR_LIM]; + /* If nonzero, check that the input is correctly ordered. */ static enum { @@ -158,7 +178,9 @@ enum static struct option const longopts[] = { + {"general-numeric-sort", no_argument, NULL, 'g'}, {"ignore-case", no_argument, NULL, 'i'}, + {"numeric-sort", no_argument, NULL, 'n'}, {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {"header", no_argument, NULL, HEADER_LINE_OPTION}, @@ -173,6 +195,12 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; +/* If nonzero, treat keys as numeric values. */ +static bool numeric_sort; + +/* If nonzero, treat keys as general numeric values. */ +static bool general_numeric_sort; + /* If nonzero, treat the first line of each file as column headers - join them without checking for ordering */ static bool join_header_lines; @@ -198,7 +226,9 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ -e EMPTY replace missing input fields with EMPTY\n\ "), stdout); fputs (_("\ - -i, --ignore-case ignore differences in case when comparing fields\n\ + -g, --general-numeric-sort compare according to general numerical value\n\ + -i, --ignore-case ignore differences in case when comparing fields\n\ + -n, --numeric-sort compare according to string numerical value\n\ -j FIELD equivalent to '-1 FIELD -2 FIELD'\n\ -o FORMAT obey FORMAT while constructing output line\n\ -t CHAR use CHAR as input and output field separator\n\ @@ -303,6 +333,73 @@ freeline (struct line *line) line->buf.buffer = NULL; } +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. Comparatively slow for short strings, but asymptotically + hideously fast. Copied from sort.c. */ + +static int +numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + return strnumcmp (a, b, decimal_point, thousands_sep); +} + +/* Work around a problem whereby the long double value returned by glibc's + strtold ("NaN", ...) contains uninitialized bits: clear all bytes of + A and B before calling strtold. FIXME: remove this function once + gnulib guarantees that strtold's result is always well defined. + Copied from sort.c. */ + +static int +nan_compare (char const *sa, char const *sb) +{ + long_double a; + memset (&a, 0, sizeof a); + a = strtold (sa, NULL); + + long_double b; + memset (&b, 0, sizeof b); + b = strtold (sb, NULL); + + return memcmp (&a, &b, sizeof a); +} + + +/* Compare general numerical value by conversion to long double. + * Copied from sort.c. */ + +static int +general_numcompare (char const *sa, char const *sb) +{ + /* FIXME: maybe add option to try expensive FP conversion + only if A and B can't be compared more cheaply/accurately. */ + + char *ea; + char *eb; + long_double a = strtold (sa, &ea); + long_double b = strtold (sb, &eb); + + /* Put conversion errors at the start of the collating sequence. */ + if (sa == ea) + return sb == eb ? 0 : -1; + if (sb == eb) + return 1; + + /* Sort numbers in the usual way, where -0 == +0. Put NaNs after + conversion errors but before numbers; sort them by internal + bit-pattern, for lack of a more portable alternative. */ + return (a < b ? -1 + : a > b ? 1 + : a == b ? 0 + : b == b ? -1 + : a == a ? 1 + : nan_compare (sa, sb)); +} + /* Return <0 if the join field in LINE1 compares less than the one in LINE2; >0 if it compares greater; 0 if it compares equal. Report an error and exit if the comparison fails. @@ -318,6 +415,7 @@ keycmp (struct line const *line1, struct line const *line2, size_t len1; size_t len2; /* Length of fields to compare. */ + long double x1, x2; int diff; if (jf_1 < line1->nfields) @@ -347,7 +445,19 @@ keycmp (struct line const *line1, struct line const *line2, if (len2 == 0) return 1; - if (ignore_case) + if (numeric_sort) + { + char end1 = beg1[len1]; + char end2 = beg2[len2]; + beg1[len1] = '\0'; beg2[len2] = '\0'; + diff = numcompare (beg1, beg2); + beg1[len1] = end1; beg2[len2] = end2; + } + else if (general_numeric_sort) + { + diff = general_numcompare (beg1, beg2); + } + else if (ignore_case) { /* FIXME: ignore_case does not work with NLS (in particular, with multibyte chars). */ @@ -360,7 +470,7 @@ keycmp (struct line const *line1, struct line const *line2, diff = memcmp (beg1, beg2, MIN (len1, len2)); } - if (diff) + if (diff || numeric_sort || general_numeric_sort) return diff; return len1 < len2 ? -1 : len1 != len2; } @@ -412,6 +522,19 @@ check_order (const struct line *prev, } } +/* Initialize the character class tables. */ + +static void +inittables (void) +{ + size_t i; + + for (i = 0; i < UCHAR_LIM; ++i) + { + blanks[i] = !! isblank (i); + } +} + static inline void reset_line (struct line *line) { @@ -1009,6 +1132,24 @@ main (int argc, char **argv) textdomain (PACKAGE); hard_LC_COLLATE = hard_locale (LC_COLLATE); + /* Get locale's representation of the decimal point. */ + { + struct lconv const *locale = localeconv (); + + /* If the locale doesn't define a decimal point, or if the decimal + point is multibyte, use the C locale's decimal point. FIXME: + add support for multibyte decimal points. */ + decimal_point = to_uchar (locale->decimal_point[0]); + if (! decimal_point || locale->decimal_point[1]) + decimal_point = '.'; + + /* FIXME: add support for multibyte thousands separators. */ + thousands_sep = to_uchar (*locale->thousands_sep); + if (! thousands_sep || locale->thousands_sep[1]) + thousands_sep = -1; + } + inittables (); + atexit (close_stdout); atexit (free_spareline); @@ -1017,7 +1158,7 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; - while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", + while ((optc = getopt_long (argc, argv, "-a:e:gin1:2:j:o:t:v:", longopts, NULL)) != -1) { @@ -1050,10 +1191,18 @@ main (int argc, char **argv) empty_filler = optarg; break; + case 'g': + general_numeric_sort = true; + break; + case 'i': ignore_case = true; break; + case 'n': + numeric_sort = true; + break; + case '1': set_join_field (&join_field_1, string_to_join_field (optarg)); break; diff --git a/tests/misc/join b/tests/misc/join index a3fd1a8..2dce94e 100755 --- a/tests/misc/join +++ b/tests/misc/join @@ -147,6 +147,14 @@ my @tv = ( ["a,1,,2\nb,1,2\n", "a,3,4\nb,3,4\n"], "a,1,,2,3,4\nb,1,2,,3,4\n"], +# Join on numerically sorted field. +['numeric-1', '-n', ["7 s\n8 e\n10 t\n4000 f\n", "7 S\n9 N\n4000 F\n"], + "7 s S\n4000 f F\n", 0], +['numeric-2', '', ["7 s\n8 e\n10 t\n", "7 S\n9 N\n10 T\n"], + "7 s S\n", 1, "$prog: numeric-2.2:3: is not sorted: 10 T\n"], +['numeric-3', '-g', ["7 s\n5e2 f\n800 e", "70e-1 S\n500.0 F\n"], + "7 s S\n5e2 f F\n", 0], + # From Tim Smithers: fixed in 1.22l ['trailing-sp', '-t: -1 1 -2 1', ["a:x \n", "a:y \n"], "a:x :y \n", 0], -- 1.7.9.4