bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Human readable sort


From: Michael Speer
Subject: Human readable sort
Date: Fri, 24 Apr 2009 15:15:53 -0400

I wrote the following patch to the 7.2 branch of coreutils to allow
`sort` to sort by human readable byte sizes.  I looked around a bit to
see what the status of previous attempts to integrate this
functionality was, but didn't see any very recent activity.  This is
my first interaction with coreutils, so if I missed something obvious,
please point me towards it.

Is the last potential patch (
http://www.mail-archive.com/address@hidden/msg14080.html )
moving through?  If not, and if I cleaned this up ( tabs, documentation,
and test cases ) and applied it to the current HEAD on savannah, is
there a chance of getting this functionality into sort?

Patch assumptions :
  * that numbers will use the best representation ( never uses 1024b
instead of 1k, etc )
  * that the sizes will be specified via suffixes of b, K, M, G, T, P,
E, Z, Y or their alternately cased variants

Because of the first assumption, only the suffixes are compared when
they differ.  This lets it handle the output of `du -h` / `du --si`, but
possibly not that of other tools that do not conform to these assumptions.

---------

--- orig/coreutils-7.2/src/sort.c       2009-03-29 13:44:10.000000000 -0400
+++ coreutils-7.2/src/sort.c    2009-04-24 14:03:47.000000000 -0400
@@ -176,6 +176,8 @@
   bool random;                 /* Sort by random hash of key.  */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
+  bool human_numeric;           /* Flag for sorting by size specified
+                                   data */
   bool month;                  /* Flag for comparison by month name. */
   bool reverse;                        /* Reverse the sense of comparison. */
   bool version;                        /* sort by version number */
@@ -426,7 +428,7 @@
   SORT_OPTION
 };

-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";

 static struct option const long_options[] =
 {
@@ -442,6 +444,7 @@
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"human-sort", no_argument, NULL, 'h'},
   {"version-sort", no_argument, NULL, 'V'},
   {"random-sort", no_argument, NULL, 'R'},
   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -1673,6 +1676,57 @@
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }

+/* suffix weights indexed by unsigned char value; assumes UCHAR_MAX of 255 */
+/* K/k:1 M/m:2 G/g:3 T/t:4 P/p:5 E/e:6 Z/z:7 Y/y:8, otherwise ( including b ) : 0 */
+const char weights [] =
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0,
+    5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0,
+    5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } ;
+
+static int
+human_compare(const char *a, const char *b)
+{
+  /* Compare A and B as numbers ending in human readable size specifiers,
+     ordered b < K < M < G < T < P < E < Z < Y.  Suffix weights are compared
+     first; strnumcmp breaks ties.  We (rudely I admit) assume that numbers
+     are properly abbreviated: you will never see 5,000,000b instead of 5M.
+  */
+
+  const char * ar, * br ; /* riders: scan past the digits to the suffix */
+  int aw, bw ;
+
+  while(blanks[to_uchar (*a)])
+    a++;
+  while(blanks[to_uchar (*b)])
+    b++;
+
+  ar = a ;
+  br = b ;
+
+  while( ISDIGIT(*ar) || (*ar) == decimal_point || (*ar) == thousands_sep )
+    ar++ ;
+  while( ISDIGIT(*br) || (*br) == decimal_point || (*br) == thousands_sep )
+    br++ ;
+
+  aw = weights[to_uchar (*ar)] ;
+  bw = weights[to_uchar (*br)] ;
+
+  return aw > bw ? 1 : aw < bw ? -1 : strnumcmp( a , b ,
decimal_point , thousands_sep) ;
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1917,6 +1971,10 @@

       if (key->random)
        diff = compare_random (texta, lena, textb, lenb);
+      else if (key->human_numeric)
+        {
+          diff = human_compare(texta, textb);
+        }
       else if (key->numeric | key->general_numeric)
        {
          char savea = *lima, saveb = *limb;
@@ -2887,7 +2945,7 @@

   for (key = keylist; key; key = key->next)
     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
-             + key->version + !!key->ignore))
+             + key->version + (!!key->ignore) + key->human_numeric))
        || (key->random && key->translate))
       {
        /* The following is too big, but guaranteed to be "big enough". */
@@ -2899,6 +2957,8 @@
          *p++ = 'f';
        if (key->general_numeric)
          *p++ = 'g';
+        if (key->human_numeric)
+          *p++ = 'h';
        if (key->ignore == nonprinting)
          *p++ = 'i';
        if (key->month)
@@ -2990,6 +3050,9 @@
        case 'g':
          key->general_numeric = true;
          break;
+        case 'h':
+          key->human_numeric = true;
+          break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
@@ -3138,7 +3201,7 @@
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+  gkey.numeric = gkey.general_numeric = gkey.random = gkey.version =
gkey.human_numeric = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;

@@ -3217,6 +3280,7 @@
        case 'd':
        case 'f':
        case 'g':
+        case 'h':
        case 'i':
        case 'M':
        case 'n':
@@ -3469,6 +3533,7 @@
                 | key->numeric
                 | key->version
                 | key->general_numeric
+                 | key->human_numeric
                 | key->random)))
         {
           key->ignore = gkey.ignore;
@@ -3478,6 +3543,7 @@
           key->month = gkey.month;
           key->numeric = gkey.numeric;
           key->general_numeric = gkey.general_numeric;
+          key->human_numeric = gkey.human_numeric;
           key->random = gkey.random;
           key->reverse = gkey.reverse;
           key->version = gkey.version;
@@ -3493,6 +3559,7 @@
                       | gkey.month
                       | gkey.numeric
                       | gkey.general_numeric
+                       | gkey.human_numeric
                       | gkey.random
                       | gkey.version)))
     {




reply via email to

[Prev in Thread] Current Thread [Next in Thread]