coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

RFE: uniq --sequential


From: Daiki Ueno
Subject: RFE: uniq --sequential
Date: Thu, 11 Jun 2015 06:04:56 +0900
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/24.4 (gnu/linux)

Hello,

I occasionally have to deal with sequences of numbers that are largely
contiguous but contain gaps (e.g., Unicode code points).

To detect gaps, I usually write a shell-script loop, which is not
trivial.  So, I thought that it would be handy if this were supported by
coreutils, like this:

  $ { seq 1 10; seq 12 22; seq 26 34; } | uniq --sequential
  1
  12
  26

or, a more practical use-case:

  $ wc -l UnicodeData.txt
  27268 UnicodeData.txt
  $ cut -f1 -d';' UnicodeData.txt | sed 's/^/0x/' | uniq --sequential | wc -l
  612

where contiguous numbers are treated as duplicates.  I'm attaching a
patch which implements this.

Comments appreciated.

Regards,
-- 
Daiki Ueno
From 0378c2e3e35fddee69a6e40d2b5fda4c27765d9d Mon Sep 17 00:00:00 2001
From: Daiki Ueno <address@hidden>
Date: Wed, 10 Jun 2015 11:11:23 +0900
Subject: [PATCH] uniq: add the --sequential option

* src/uniq.c (seq_interval): New global variable.
(longopts): Register the --sequential option.
(usage): Summarize the new option.
(different): Check number input based on the --sequential option.
(check_file): Adjust the loop for the --sequential option.
(main): Handle the new --sequential option.
* tests/misc/uniq.pl (add_z_variants): Add tests for the new
--sequential option.
---
 src/uniq.c         | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 tests/misc/uniq.pl |  9 ++++++++
 2 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/src/uniq.c b/src/uniq.c
index e0cfe4d..b3d5619 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -138,6 +138,8 @@ static enum grouping_method const grouping_method_map[] =
 
 static enum grouping_method grouping = GM_NONE;
 
+static size_t seq_interval;
+
 enum
 {
   GROUP_OPTION = CHAR_MAX + 1
@@ -155,6 +157,7 @@ static struct option const longopts[] =
   {"skip-chars", required_argument, NULL, 's'},
   {"check-chars", required_argument, NULL, 'w'},
   {"zero-terminated", no_argument, NULL, 'z'},
+  {"sequential", optional_argument, NULL, 'S'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0}
@@ -207,6 +210,10 @@ With no options, matching lines are merged to the first occurrence.\n\
      fputs (_("\
   -w, --check-chars=N   compare no more than N characters in lines\n\
 "), stdout);
+     fputs (_("\
+  -S, --sequential[=INTERVAL]  treat lines as numbers, and remove adjacent\n\
+                          numbers as duplicate lines\n\
+"), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
      fputs (VERSION_OPTION_DESCRIPTION, stdout);
      fputs (_("\
@@ -284,7 +291,32 @@ different (char *old, char *new, size_t oldlen, size_t newlen)
   if (check_chars < newlen)
     newlen = check_chars;
 
-  if (ignore_case)
+  if (seq_interval > 0)
+    {
+      unsigned long int oldval, newval;
+      int oldchar, newchar;
+      bool result = true;
+
+      /* Temporarily NUL terminate OLD and NEW for xstrtoul.  Those
+         should have enough room here.  */
+      oldchar = old[oldlen];
+      newchar = new[newlen];
+      old[oldlen] = '\0';
+      new[newlen] = '\0';
+      if (xstrtoul (old, NULL, 0, &oldval, "") == LONGINT_OK
+          && xstrtoul (new, NULL, 0, &newval, "") == LONGINT_OK)
+        {
+          /* FIXME: This relies on the fact that OLD points to a field
+             on the current line and NEW points to a field on the
+             previous line.  */
+          result = newval + seq_interval != oldval;
+        }
+      /* Restore the original terminators.  */
+      old[oldlen] = oldchar;
+      new[newlen] = newchar;
+      return result;
+    }
+  else if (ignore_case)
     {
       /* FIXME: This should invoke strcoll somehow.  */
       return oldlen != newlen || memcasecmp (old, new, oldlen);
@@ -385,10 +417,13 @@ check_file (const char *infile, const char *outfile, char delimiter)
               fwrite (thisline->buffer, sizeof (char),
                       thisline->length, stdout);
 
+              first_group_printed = true;
+            }
+          if (new_group || grouping != GM_NONE || seq_interval > 0)
+            {
               SWAP_LINES (prevline, thisline);
               prevfield = thisfield;
               prevlen = thislen;
-              first_group_printed = true;
             }
         }
       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
@@ -448,11 +483,14 @@ check_file (const char *infile, const char *outfile, char delimiter)
           if (!match || output_later_repeated)
             {
               writeline (prevline, match, match_count);
+              if (!match)
+                match_count = 0;
+            }
+          if (!match || output_later_repeated || seq_interval > 0)
+            {
               SWAP_LINES (prevline, thisline);
               prevfield = thisfield;
               prevlen = thislen;
-              if (!match)
-                match_count = 0;
             }
         }
 
@@ -514,7 +552,7 @@ main (int argc, char **argv)
       if (optc == -1
           || (posixly_correct && nfiles != 0)
           || ((optc = getopt_long (argc, argv,
-                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
+                                   "-0123456789DScdf:is:uw:z", longopts, NULL))
               == -1))
         {
           if (argc <= optind)
@@ -613,6 +651,21 @@ main (int argc, char **argv)
                                  N_("invalid number of bytes to skip"));
           break;
 
+        case 'S':
+          if (optarg == NULL)
+            seq_interval = 1;
+          else
+            {
+              seq_interval = size_opt (optarg,
+                                       N_("invalid interval"));
+              if (seq_interval == 0)
+                {
+                  error (0, 0, _("invalid interval %s"), quote (optarg));
+                  usage (EXIT_FAILURE);
+                }
+            }
+          break;
+
         case 'u':
           output_first_repeated = false;
           output_option_used = true;
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
index 5eae701..3cea277 100755
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -227,6 +227,15 @@ my @Tests =
         "  - 'separate'\n" .
         "  - 'both'\n" .
         "Try '$prog --help' for more information.\n"}],
+ # Check sequential option
+ ['146', '--sequential', {IN=>"1\n2\n6\n7\n8\n11\n12\n"}, {OUT=>"1\n6\n11\n"}],
+ ['147', '--sequential --group', {IN=>"1\n2\n6\n7\n8\n11\n12\n"},
+  {OUT=>"1\n2\n\n6\n7\n8\n\n11\n12\n"}],
+ ['148', '--sequential --count', {IN=>"1\n2\n6\n7\n8\n11\n12\n"},
+  {OUT=>"      2 2\n      3 8\n      2 12\n"}],
+ ['149', '--sequential=0', {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: invalid interval '0'\n" .
+        "Try '$prog --help' for more information.\n"}]
 );
 
 # Locale related tests
-- 
2.1.4


reply via email to

[Prev in Thread] Current Thread [Next in Thread]