bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Human readable sort


From: Pádraig Brady
Subject: Re: Human readable sort
Date: Mon, 27 Apr 2009 17:18:19 +0100
User-agent: Thunderbird 2.0.0.6 (X11/20071008)

address@hidden wrote:
> On Apr 27, 2009 11:41am, Ondřej Vašík <address@hidden> wrote:
>> Pádraig Brady wrote:
>>
>> > Pádraig Brady wrote:
>>
>> > Attached is the full patch, which hopefully we can push soon.
>>
>>
>>
>> I'm not objecting anything relevant in that patch, just the tab/spaces
>>
>> mixing looks inconsistent with the rest of the code in added lines in
>>
>> sort.c . Only cosmetic thing ... but is this intentional?
>>
>>
>>
>> Greetings,
>>
>>         Ondřej Vašík
>>
> 
> That is my fault. My home and work emacs are configured with tabs
> disabled for interaction with my office and home repos. I'll do cleanup
> against the patch this afternoon when I can look at it.

Thanks Ondřej.
This is fixed up in attached I think.

cheers,
Pádraig.

p.s. Micheal, which email address do you want in THANKS?
>From 2e1ee46e97858717535210729f5e0181c08bf6d1 Mon Sep 17 00:00:00 2001
From: Michael Speer <address@hidden>
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

* NEWS: document the new option
* doc/coreutils.texi: ditto
* src/sort.c (main): handle the new -human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparision.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 6 tests to test the new functionality.
* THANKS: Update
---
 NEWS               |    5 ++
 THANKS             |    1 +
 doc/coreutils.texi |   14 ++++++
 src/sort.c         |  115 ++++++++++++++++++++++++++++++++++++++++++++++++----
 tests/misc/sort    |   13 ++++++
 5 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 8cb17cc..bfebe1d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU coreutils NEWS                                    -*- 
outline -*-
 
 * Noteworthy changes in release 7.3 (????-??-??) [?]
 
+** New features
+
+  sort accepts a new option, --human-numeric-sort (-h): sort numbers
+  while honouring human readable suffixes like KiB and MB etc.
+
 ** Bug fixes
 
   sort -m no longer segfaults when its output file is also an input file.
diff --git a/THANKS b/THANKS
index 876a6b6..da48d6d 100644
--- a/THANKS
+++ b/THANKS
@@ -395,6 +395,7 @@ Michael J. Croghan                  address@hidden
 Michael McFarland                   address@hidden
 Michael McLagan                     address@hidden
 Michael Piefel                      address@hidden
+Michael Speer                       address@hidden
 Michael Steffens                    address@hidden
 Michael Stone                       address@hidden
 Michael Stutz                       address@hidden
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 918f44e..8f73419 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is 
much slower than
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
address@hidden -h
address@hidden --human-numeric-sort
address@hidden --sort=human-numeric
address@hidden -h
address@hidden --human-numeric-sort
address@hidden --sort
address@hidden human numeric sort
address@hidden LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option,
+and in addition handle IEC or SI suffixes like MiB, MB etc.
+Note a mixture of these suffixes is not supported and will
+be flagged as an error. Also the numbers must be abbreviated unambiguously.
+I.E. 5000K and 6M will be sorted incorrectly for example.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff --git a/src/sort.c b/src/sort.c
index f48d727..8a85d54 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;                 /* Sort by random hash of key.  */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
+  bool human_numeric;          /* Flag for sorting by human readable
+                                  units with either SI xor IEC prefixes. */
   bool month;                  /* Flag for comparison by month name. */
   bool reverse;                        /* Reverse the sense of comparison. */
   bool version;                        /* sort by version number */
@@ -337,6 +339,9 @@ Ordering options:\n\
   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 "), stdout);
       fputs (_("\
+  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
+"), stdout);
+      fputs (_("\
   -n, --numeric-sort          compare according to string numerical value\n\
   -R, --random-sort           sort by random hash of keys\n\
       --random-source=FILE    get random bytes from FILE\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
 "), stdout);
       fputs (_("\
       --sort=WORD             sort according to WORD:\n\
-                                general-numeric -g, month -M, numeric -n,\n\
-                                random -R, version -V\n\
+                                general-numeric -g, human-numeric -h, month 
-M,\n\
+                                numeric -n, random -R, version -V\n\
   -V, --version-sort          natural sort of (version) numbers within text\n\
 \n\
 "), stdout);
@@ -426,7 +431,7 @@ enum
   SORT_OPTION
 };
 
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
 
 static struct option const long_options[] =
 {
@@ -442,6 +447,7 @@ static struct option const long_options[] =
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"human-numeric-sort", no_argument, NULL, 'h'},
   {"version-sort", no_argument, NULL, 'V'},
   {"random-sort", no_argument, NULL, 'R'},
   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@ static char const check_types[] =
 
 #define SORT_TABLE \
   _st_("general-numeric", 'g') \
+  _st_("human-numeric",   'h') \
   _st_("month",           'M') \
   _st_("numeric",         'n') \
   _st_("random",          'R') \
@@ -1673,6 +1680,87 @@ numcompare (const char *a, const char *b)
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Exit with an error if a mixture of SI and IEC units detected.  */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;
+  bool si_present = prefix == 'i';
+  if (seen_si != -1 && seen_si != si_present)
+    error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
+  seen_si = si_present;
+}
+
+/* Return an integer which represents the order of magnitude of
+   the unit following the number.  NUMBER can contain thousands separators
+   or a decimal point, but not have preceeding blanks.
+   Negative numbers return a negative unit order.  */
+
+static int
+find_unit_order (const char *number)
+{
+  static const char orders [UCHAR_LIM] = {
+    ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+    ['k']=1,
+  };
+
+  const unsigned char *p = number;
+
+  int sign = 1;
+
+  if (*p == '-')
+    {
+      sign = -1;
+      p++;
+    }
+
+  /* Scan to end of number.
+     Decimals or separators not followed by digits stop the scan.
+     Numbers ending in decimals or separators are thus considered
+     to be lacking in units.
+     FIXME: add support for multibyte thousands_sep and decimal_point.  */
+
+  while (ISDIGIT (*p))
+    {
+      p++;
+
+      if (*p == decimal_point && ISDIGIT (*(p + 1)))
+       p += 2;
+      else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
+       p += 2;
+    }
+
+  int order = orders[*p];
+
+  /* For valid units check for MiB vs MB etc.  */
+  if (order)
+    check_mixed_SI_IEC (*(p + 1));
+
+  return sign * order;
+}
+
+/* Compare numbers ending in units with SI xor IEC prefixes
+       <none/unknown> < K/k < M < G < T < P < E < Z < Y
+   Assume that numbers are properly abbreviated.
+   i.e. input will never have both 5000K and 6M.  */
+
+static int
+human_numcompare (const char *a, const char *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  int order_a = find_unit_order (a);
+  int order_b = find_unit_order (b);
+
+  return (order_a > order_b ? 1
+         : order_a < order_b ? -1
+         : strnumcmp (a, b, decimal_point , thousands_sep));
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1917,13 +2005,14 @@ keycompare (const struct line *a, const struct line *b)
 
       if (key->random)
        diff = compare_random (texta, lena, textb, lenb);
-      else if (key->numeric | key->general_numeric)
+      else if (key->numeric | key->general_numeric | key->human_numeric)
        {
          char savea = *lima, saveb = *limb;
 
          *lima = *limb = '\0';
-         diff = ((key->numeric ? numcompare : general_numcompare)
-                 (texta, textb));
+         diff = ((key->numeric ? numcompare
+                  : key->general_numeric ? general_numcompare
+                  : human_numcompare) (texta, textb));
          *lima = savea, *limb = saveb;
        }
       else if (key->version)
@@ -2889,7 +2978,7 @@ check_ordering_compatibility (void)
 
   for (key = keylist; key; key = key->next)
     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
-             + key->version + !!key->ignore))
+             + key->version + !!key->ignore + key->human_numeric))
        || (key->random && key->translate))
       {
        /* The following is too big, but guaranteed to be "big enough". */
@@ -2901,6 +2990,8 @@ check_ordering_compatibility (void)
          *p++ = 'f';
        if (key->general_numeric)
          *p++ = 'g';
+       if (key->human_numeric)
+         *p++ = 'h';
        if (key->ignore == nonprinting)
          *p++ = 'i';
        if (key->month)
@@ -2992,6 +3083,9 @@ set_ordering (const char *s, struct keyfield *key, enum 
blanktype blanktype)
        case 'g':
          key->general_numeric = true;
          break;
+       case 'h':
+         key->human_numeric = true;
+         break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
@@ -3140,7 +3234,8 @@ main (int argc, char **argv)
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+  gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+  gkey.random = gkey.version = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
@@ -3219,6 +3314,7 @@ main (int argc, char **argv)
        case 'd':
        case 'f':
        case 'g':
+       case 'h':
        case 'i':
        case 'M':
        case 'n':
@@ -3471,6 +3567,7 @@ main (int argc, char **argv)
                 | key->numeric
                 | key->version
                 | key->general_numeric
+                | key->human_numeric
                 | key->random)))
         {
           key->ignore = gkey.ignore;
@@ -3480,6 +3577,7 @@ main (int argc, char **argv)
           key->month = gkey.month;
           key->numeric = gkey.numeric;
           key->general_numeric = gkey.general_numeric;
+          key->human_numeric = gkey.human_numeric;
           key->random = gkey.random;
           key->reverse = gkey.reverse;
           key->version = gkey.version;
@@ -3495,6 +3593,7 @@ main (int argc, char **argv)
                       | gkey.month
                       | gkey.numeric
                       | gkey.general_numeric
+                      | gkey.human_numeric
                       | gkey.random
                       | gkey.version)))
     {
diff --git a/tests/misc/sort b/tests/misc/sort
index c5a2d74..76b271d 100755
--- a/tests/misc/sort
+++ b/tests/misc/sort
@@ -54,6 +54,19 @@ my @Tests =
 ["n11a", '-s -n -k1,1', {IN=>".01a\n.010\n"}, {OUT=>".01a\n.010\n"}],
 ["n11b", '-s -n -k1,1', {IN=>".010\n.01a\n"}, {OUT=>".010\n.01a\n"}],
 
+#human readable suffixes
+["h1", '-h', {IN=>"Y\nZ\nE\nP\nT\nG\nM\nK\n"},
+ {OUT=>"K\nM\nG\nT\nP\nE\nZ\nY\n"}],
+["h2", '-h', {IN=>"1M\n-2G\n-3K"}, {OUT=>"-2G\n-3K\n1M\n"}],
+["h3", '-h', {IN=>"1Mi\n1M\n"}, {OUT=>""}, {EXIT=>2},
+ {ERR=>"$prog: both SI and IEC prefixes present on units\n"}],
+# decimal at end => ignore suffix
+["h4", '-h', {IN=>"1.E\n2.M\n"}, {OUT=>"1.E\n2.M\n"}],
+# double decimal => ignore suffix
+["h5", '-h', {IN=>"1..2E\n2..2M\n"}, {OUT=>"1..2E\n2..2M\n"}],
+# illustrate misordering of ambiguous abbreviations
+["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1030MiB\n1GiB\n"}],
+
 ["01a", '', {IN=>"A\nB\nC\n"}, {OUT=>"A\nB\nC\n"}],
 #
 ["02a", '-c', {IN=>"A\nB\nC\n"}, {OUT=>''}],
-- 
1.5.3.6


reply via email to

[Prev in Thread] Current Thread [Next in Thread]