diff --git a/src/join.c b/src/join.c index fa18c9d..b02dc08 100644 --- a/src/join.c +++ b/src/join.c @@ -47,6 +47,308 @@ b = tmp; \ } while (0); +/* + * The code here is an as verbal copy from sort.c. as possible. + * It's here to test the unification of key handling between sort and join. + * + * Test it in join.c first and decide about move into a shared library + * later on. + */ + +/* TODO : check includes carefully */ +#include "strnumcmp.h" +/* #include "hard-locale.h" */ +#include "langinfo.h" +#include "strnumcmp.h" +/* #include "unistr.h" */ + +#define UCHAR_LIM (UCHAR_MAX + 1) + +#if HAVE_C99_STRTOLD +# define long_double long double +#else +# define long_double double +# undef strtold +# define strtold strtod +#endif + +/* The representation of the decimal point in the current locale. */ +static int decimal_point; + +/* Thousands separator; if -1, then there isn't one. */ +static int thousands_sep; + +/* Nonzero if the corresponding locales are hard. */ +static bool hard_LC_COLLATE; +#if HAVE_NL_LANGINFO +static bool hard_LC_TIME; +#endif + +/* The kind of blanks for '-b' to skip in various options. */ +enum blanktype { bl_start, bl_end, bl_both }; + +/* Table of blanks. */ +static bool blanks[UCHAR_LIM]; + +/* Table of non-printing characters. */ +static bool nonprinting[UCHAR_LIM]; + +/* Table of non-dictionary characters (not letters, digits, or blanks). */ +static bool nondictionary[UCHAR_LIM]; + +/* Translation table folding lower case to upper. */ +static unsigned char fold_toupper[UCHAR_LIM]; + +#define MONTHS_PER_YEAR 12 + +struct month +{ + char const *name; + int val; +}; + +/* Table mapping month names to integers. + Alphabetic order allows binary search. */ +static struct month monthtab[] = +{ + {"APR", 4}, + {"AUG", 8}, + {"DEC", 12}, + {"FEB", 2}, + {"JAN", 1}, + {"JUL", 7}, + {"JUN", 6}, + {"MAR", 3}, + {"MAY", 5}, + {"NOV", 11}, + {"OCT", 10}, + {"SEP", 9} +}; + +#if HAVE_NL_LANGINFO + +static int +struct_month_cmp (void const *m1, void const *m2) +{ + struct month const *month1 = m1; + struct month const *month2 = m2; + return strcmp (month1->name, month2->name); +} + +#endif + +/* Initialize the character class tables. */ + +static void +inittables (void) +{ + size_t i; + + for (i = 0; i < UCHAR_LIM; ++i) + { + blanks[i] = !! isblank (i); + nonprinting[i] = ! isprint (i); + nondictionary[i] = ! isalnum (i) && ! isblank (i); + fold_toupper[i] = toupper (i); + } + +#if HAVE_NL_LANGINFO + /* If we're not in the "C" locale, read different names for months. */ + if (hard_LC_TIME) + { + for (i = 0; i < MONTHS_PER_YEAR; i++) + { + char const *s; + size_t s_len; + size_t j, k; + char *name; + + s = nl_langinfo (ABMON_1 + i); + s_len = strlen (s); + monthtab[i].name = name = xmalloc (s_len + 1); + monthtab[i].val = i + 1; + + for (j = k = 0; j < s_len; j++) + if (! isblank (to_uchar (s[j]))) + name[k++] = fold_toupper[to_uchar (s[j])]; + name[k] = '\0'; + } + qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp); + } +#endif +} + +/* Table that maps characters to order-of-magnitude values. */ +static char const unit_order[UCHAR_LIM] = + { +#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ + && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) + /* This initializer syntax works on all C99 hosts. For now, use + it only on non-ASCII hosts, to ease the pain of porting to + pre-C99 ASCII hosts. */ + ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, + ['k']=1, +#else + /* Generate the following table with this command: + perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8); + foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\ + |fmt */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, + 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +#endif + }; + +/* Return an integer that represents the order of magnitude of the + unit following the number. The number may contain thousands + separators and a decimal point, but it may not contain leading blanks. + Negative numbers get negative orders; zero numbers have a zero order. */ + +static int +find_unit_order (char const *number) +{ + bool minus_sign = (*number == '-'); + char const *p = number + minus_sign; + int nonzero = 0; + unsigned char ch; + + /* Scan to end of number. + Decimals or separators not followed by digits stop the scan. + Numbers ending in decimals or separators are thus considered + to be lacking in units. + FIXME: add support for multibyte thousands_sep and decimal_point. */ + + do + { + while (ISDIGIT (ch = *p++)) + nonzero |= ch - '0'; + } + while (ch == thousands_sep); + + if (ch == decimal_point) + while (ISDIGIT (ch = *p++)) + nonzero |= ch - '0'; + + if (nonzero) + { + int order = unit_order[ch]; + return (minus_sign ? -order : order); + } + else + return 0; +} + +/* Compare numbers A and B ending in units with SI or IEC prefixes + < K/k < M < G < T < P < E < Z < Y */ + +static int +human_numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + int diff = find_unit_order (a) - find_unit_order (b); + return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); +} + +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. Comparatively slow for short strings, but asymptotically + hideously fast. */ + +static int +numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + return strnumcmp (a, b, decimal_point, thousands_sep); +} + +static int +general_numcompare (char const *sa, char const *sb) +{ + /* FIXME: maybe add option to try expensive FP conversion + only if A and B can't be compared more cheaply/accurately. */ + + char *ea; + char *eb; + long_double a = strtold (sa, &ea); + long_double b = strtold (sb, &eb); + + /* Put conversion errors at the start of the collating sequence. */ + if (sa == ea) + return sb == eb ? 0 : -1; + if (sb == eb) + return 1; + + /* Sort numbers in the usual way, where -0 == +0. Put NaNs after + conversion errors but before numbers; sort them by internal + bit-pattern, for lack of a more portable alternative. */ + return (a < b ? -1 + : a > b ? 1 + : a == b ? 0 + : b == b ? -1 + : a == a ? 1 + : memcmp (&a, &b, sizeof a)); +} + +/* Return an integer in 1..12 of the month name MONTH. + Return 0 if the name in S is not recognized. */ + +static int +getmonth (char const *month, char **ea) +{ + size_t lo = 0; + size_t hi = MONTHS_PER_YEAR; + + while (blanks[to_uchar (*month)]) + month++; + + do + { + size_t ix = (lo + hi) / 2; + char const *m = month; + char const *n = monthtab[ix].name; + + for (;; m++, n++) + { + if (!*n) + { + if (ea) + *ea = (char *) m; + return monthtab[ix].val; + } + if (fold_toupper[to_uchar (*m)] < to_uchar (*n)) + { + hi = ix; + break; + } + else if (fold_toupper[to_uchar (*m)] > to_uchar (*n)) + { + lo = ix + 1; + break; + } + } + } + while (lo < hi); + + return 0; +} + +/* Import from copy.c ends here */ + /* An element of the list identifying which fields to print for each output line. */ struct outlist @@ -201,6 +503,24 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ --header treat the first line in each file as field headers,\n\ print them without trying to pair them\n\ "), stdout); + fputs (_("\ + -b, --ignore-leading-blanks ignore leading blanks\n\ + -d, --dictionary-order consider only blanks and alphanumeric characters\n\ + -f, --ignore-case fold lower case to upper case characters\n\ +"), stdout); + fputs (_("\ + -g, --general-numeric-sort compare according to general numerical value\n\ + -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\ + -n, --numeric-sort compare according to string numerical value\n\ + -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\ + -V, --version-sort natural sort of (version) numbers within text\n\ +"), stdout); + fputs (_("\ + --sort=WORD sort according to WORD:\n\ + general-numeric -g, human-numeric -h, month -M,\n\ + numeric -n, version -V\n\ +\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ @@ -329,6 +649,16 @@ keycmp (struct line const *line1, struct line const *line2, len2 = 0; } +/* The following ifdef'ed part allowed to solve the problem + * http://debbugs.gnu.org/cgi/bugreport.cgi?bug=6903 + * describes. + * A real solution will need lots of inprovements here. + */ + +#undef KEY_NUMCMP +#ifdef KEY_NUMCMP + diff = numcompare (beg1, beg2); +#else if (len1 == 0) return len2 == 0 ? 0 : -1; if (len2 == 0) @@ -343,13 +673,15 @@ keycmp (struct line const *line1, struct line const *line2, else { if (hard_LC_COLLATE) - return xmemcoll (beg1, len1, beg2, len2); - diff = memcmp (beg1, beg2, MIN (len1, len2)); + diff = xmemcoll (beg1, len1, beg2, len2); + else + diff = memcmp (beg1, beg2, MIN (len1, len2)); } - if (diff) - return diff; - return len1 < len2 ? -1 : len1 != len2; + if (! diff) + diff = len1 < len2 ? -1 : len1 != len2; +#endif +return diff; } /* Check that successive input lines PREV and CURRENT from input file @@ -975,6 +1307,8 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; + inittables(); + while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", longopts, NULL)) != -1)