bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#10365: [PATCH] uniq: add ability to skip last N chars or fields


From: Adrien Kunysz
Subject: bug#10365: [PATCH] uniq: add ability to skip last N chars or fields
Date: Sun, 25 Dec 2011 12:54:18 +0000
User-agent: Mutt/1.5.20 (2009-06-14)

* doc/coreutils.texi: document the new feature
* src/uniq.c (find_end): new function
(check_file): use find_end() to determine when to stop comparing
(usage): document the new feature
(main): expose the new feature to user
* tests/misc/uniq: add tests to exercise the new code
---
 doc/coreutils.texi |   17 +++++++++++++
 src/uniq.c         |   69 +++++++++++++++++++++++++++++++++++++++++++++++++---
 tests/misc/uniq    |   15 +++++++++++
 3 files changed, 97 insertions(+), 4 deletions(-)

I have recently found myself wishing I could have uniq(1) skip
the last N fields before comparison. I am aware of the rev(1) trick
but I don't find it very satisfactory. So I ended up patching uniq
and implementing the same feature for trailing characters as well.

Documentation and tests included. Tests have also been run within
Valgrind on x86_64.

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index c229f98..b2ca430 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -4680,6 +4680,15 @@ each other by at least one space or tab.
 For compatibility @command{uniq} supports an obsolete option syntax
 @option{-@var{n}}.  New scripts should use @option{-f @var{n}} instead.
 
+@item -F @var{n}
+@itemx --ignore-fields=@var{n}
+@opindex -F
+@opindex --ignore-fields
+Ignore the last @var{n} fields on each line when checking for uniqueness.  Use
+a null string for comparison if a line has fewer than @var{n} fields.  Fields
+are sequences of non-space non-tab characters that are separated from
+each other by at least one space or tab.
+
 @item -s @var{n}
 @itemx --skip-chars=@var{n}
 @opindex -s
@@ -4698,6 +4707,14 @@ behavior depends on this variable.
 For example, use @samp{uniq ./+10} or @samp{uniq -s 10} rather than
 the ambiguous @samp{uniq +10}.
 
+@item -S @var{n}
+@itemx --ignore-chars=@var{n}
+@opindex -S
+@opindex --ignore-chars
+Ignore the last @var{n} characters when checking for uniqueness.  Use a null
+string for comparison if a line has fewer than @var{n} characters.  If you use
+both the field and character ignoring options, fields are ignored first.
+
 @item -c
 @itemx --count
 @opindex -c
diff --git a/src/uniq.c b/src/uniq.c
index db717b1..31205f4 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -60,6 +60,12 @@ static size_t skip_fields;
 /* Number of chars to skip after skipping any fields. */
 static size_t skip_chars;
 
+/* Number of fields to ignore at the end. */
+static size_t ignore_fields;
+
+/* Number of chars to ignore at the end after ignoring any fields. */
+static size_t ignore_chars;
+
 /* Number of chars to compare. */
 static size_t check_chars;
 
@@ -116,7 +122,9 @@ static struct option const longopts[] =
   {"ignore-case", no_argument, NULL, 'i'},
   {"unique", no_argument, NULL, 'u'},
   {"skip-fields", required_argument, NULL, 'f'},
+  {"ignore-fields", required_argument, NULL, 'F'},
   {"skip-chars", required_argument, NULL, 's'},
+  {"ignore-chars", required_argument, NULL, 'S'},
   {"check-chars", required_argument, NULL, 'w'},
   {"zero-terminated", no_argument, NULL, 'z'},
   {GETOPT_HELP_OPTION_DECL},
@@ -155,8 +163,10 @@ Mandatory arguments to long options are mandatory for short options too.\n\
                         delimit-method={none(default),prepend,separate}\n\
                         Delimiting is done with blank lines\n\
   -f, --skip-fields=N   avoid comparing the first N fields\n\
+  -F, --ignore-fields=N  avoid comparing the last N fields\n\
   -i, --ignore-case     ignore differences in case when comparing\n\
   -s, --skip-chars=N    avoid comparing the first N characters\n\
+  -S, --ignore-chars=N  avoid comparing the last N characters\n\
   -u, --unique          only print unique lines\n\
   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 "), stdout);
@@ -227,6 +237,29 @@ find_field (struct linebuffer const *line)
   return line->buffer + i;
 }
 
+/* Given a linebuffer LINE, return the offset of the first character that
+   is not compared, after dropping ignore_fields fields then ignore_chars.  */
+
+static size_t
+find_end (struct linebuffer const *line)
+{
+  size_t count;
+  char const *lp = line->buffer;
+  size_t i = line->length - 1;  /* Last byte; presumably the delimiter.  */
+
+  for (count = 0; count < ignore_fields && 0 < i; count++)
+    {
+      while (0 < i && isblank (to_uchar (lp[i])))  /* Back over blanks...  */
+        i--;
+      while (0 < i && !isblank (to_uchar (lp[i])))  /* ...then non-blanks.  */
+        i--;
+    }
+
+  i -= MIN (ignore_chars, i);  /* Clamped so I cannot wrap below zero.  */
+
+  return i;
+}
+
 /* Return false if two strings OLD and NEW match, true if not.
    OLD and NEW point not to the beginnings of the lines
    but rather to the beginnings of the fields to compare.
@@ -310,10 +343,15 @@ check_file (const char *infile, const char *outfile, char delimiter)
         {
           char *thisfield;
           size_t thislen;
+          size_t thisend;
           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
             break;
           thisfield = find_field (thisline);
-          thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+          thisend = find_end (thisline);
+          if (thisend <= thisfield - thisline->buffer)
+            thislen = 0;
+          else
+            thislen = thisend - (thisfield - thisline->buffer);
           if (prevline->length == 0
               || different (thisfield, prevfield, thislen, prevlen))
             {
@@ -330,19 +368,25 @@ check_file (const char *infile, const char *outfile, char delimiter)
     {
       char *prevfield;
       size_t prevlen;
+      size_t prevend;
       uintmax_t match_count = 0;
       bool first_delimiter = true;
 
       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
         goto closefiles;
       prevfield = find_field (prevline);
-      prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
+      prevend = find_end (prevline);
+      if (prevend <= prevfield - prevline->buffer)
+        prevlen = 0;
+      else
+        prevlen = prevend - (prevfield - prevline->buffer);
 
       while (!feof (stdin))
         {
           bool match;
           char *thisfield;
           size_t thislen;
+          size_t thisend;
           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
             {
               if (ferror (stdin))
@@ -350,7 +394,11 @@ check_file (const char *infile, const char *outfile, char delimiter)
               break;
             }
           thisfield = find_field (thisline);
-          thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+          thisend = find_end (thisline);
+          if (thisend <= thisfield - thisline->buffer)
+            thislen = 0;
+          else
+            thislen = thisend - (thisfield - thisline->buffer);
           match = !different (thisfield, prevfield, thislen, prevlen);
           match_count += match;
 
@@ -430,6 +478,8 @@ main (int argc, char **argv)
 
   skip_chars = 0;
   skip_fields = 0;
+  ignore_chars = 0;
+  ignore_fields = 0;
   check_chars = SIZE_MAX;
   output_unique = output_first_repeated = true;
   output_later_repeated = false;
@@ -445,7 +495,8 @@ main (int argc, char **argv)
       if (optc == -1
           || (posixly_correct && nfiles != 0)
           || ((optc = getopt_long (argc, argv,
-                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
+                                   "-0123456789Dcdf:F:is:S:uw:z",
+                                   longopts, NULL))
               == -1))
         {
           if (argc <= optind)
@@ -523,6 +574,11 @@ main (int argc, char **argv)
                                   N_("invalid number of fields to skip"));
           break;
 
+        case 'F':
+          ignore_fields = size_opt (optarg,
+                                    N_("invalid number of fields to ignore"));
+          break;
+
         case 'i':
           ignore_case = true;
           break;
@@ -532,6 +588,11 @@ main (int argc, char **argv)
                                  N_("invalid number of bytes to skip"));
           break;
 
+        case 'S':  /* -S N, --ignore-chars=N: sets ignore_chars.  */
+          ignore_chars = size_opt (optarg,
+                                   N_("invalid number of bytes to ignore"));
+          break;
+
         case 'u':
           output_first_repeated = false;
           break;
diff --git a/tests/misc/uniq b/tests/misc/uniq
index 99aa8ed..0817b2f 100755
--- a/tests/misc/uniq
+++ b/tests/misc/uniq
@@ -199,6 +199,21 @@ my @Tests =
  # Check that --zero-terminated is synonymous with -z.
  ['123', '--zero-terminated', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}],
  ['124', '--zero-terminated', {IN=>"a\0a\0b"}, {OUT=>"a\0b\0"}],
+ # Skip last N characters/fields
+ ['125', qw(-F 1), {IN=>"a a\na b\n"}, {OUT=>"a a\n"}],
+ ['126', qw(-F 1), {IN=>"a a\nb b\n"}, {OUT=>"a a\nb b\n"}],
+ ['127', qw(-F 1), {IN=>"a a a\nc a b\n"}, {OUT=>"a a a\nc a b\n"}],
+ ['128', qw(-F 1), {IN=>"a b\na a\n"}, {OUT=>"a b\n"}],
+ ['129', qw(-F 2), {IN=>"c a a\nc a b\n"}, {OUT=>"c a a\n"}],
+ ['130', qw(-S 1), {IN=>"aaa\naaa\n"}, {OUT=>"aaa\n"}],
+ ['131', qw(-S 2), {IN=>"aab\naaa\n"}, {OUT=>"aab\n"}],
+ ['132', qw(-F 1 -S 1), {IN=>"aaa a\nba b\n"}, {OUT=>"aaa a\nba b\n"}],
+ ['133', qw(-F 1 -S 1), {IN=>"aaa a\naaa b\n"}, {OUT=>"aaa a\n"}],
+ ['134', qw(-S 1 -F 1), {IN=>"aaa a\nba b\n"}, {OUT=>"aaa a\nba b\n"}],
+ ['135', qw(-S 1 -F 1), {IN=>"aaa a\naaa b\n"}, {OUT=>"aaa a\n"}],
+ ['136', qw(-S 4), {IN=>"cba\ndcba\n"}, {OUT=>"cba\n"}],
+ ['137', qw(-S 0), {IN=>"cba\ndcba\n"}, {OUT=>"cba\ndcba\n"}],
+ ['138', qw(-S 0), {IN=>"cba\n"}, {OUT=>"cba\n"}],
 );
 
 # Set _POSIX2_VERSION=199209 in the environment of each obs-plus* test.
-- 
1.7.2.5

Attachment: signature.asc
Description: Digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]