bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Check the order of the input by default, if there are unpairable


From: James Youngman
Subject: [PATCH] Check the order of the input by default, if there are unpairable input lines.
Date: Mon, 18 Feb 2008 22:38:08 +0000

This patch amends my previous changes to join.c to enable
order-checking by default for the case where the user is not making
use of the extension of processing unsorted inputs containing only
pairable lines.  This idea was suggestied by Pádraig Brady.



2008-02-18  James Youngman  <address@hidden>

        Enable order-checking of join's input where the user is not 
        making use of the GNU extension for processing unsorted input 
        files containing only pairable input lines (idea from Pádraig 
        Brady).
        * src/join.c: New variables seen_unpairable and
        issued_disorder_warning[].  Changed type of check_input_order from
        a boolean to a tristate (enabled/disabled/default).
        (get_line): Also check input order in the default case (i.e. when
        neither --check-order not --nocheck-order is specified).  In the
        default case, we issue a warning only when there are unpairable
        input lines and one of the inputs is out of order.
        (checkorder): Obey the tristate variable check_input_order.
        Avoid unnecessary comparisons.
        (join): Also perform ordering checks after reaching EOF on one
        input.
        * coreutils.texi (join invocation): Explain the handling of the
        default case where neither --check-order nor --nocheck-order is
        specified.

Signed-off-by: James Youngman <address@hidden>
---
 doc/coreutils.texi |   30 +++++++++------
 src/join.c         |  100 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 100 insertions(+), 30 deletions(-)

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index b110273..e8ccb4b 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5149,17 +5149,10 @@ sort a file on its default join field, but if you 
select a non-default
 locale, join field, separator, or comparison options, then you should
 do so consistently between @command{join} and @command{sort}.
 
address@hidden Unsorted inputs are a common cause of FAQs, but we probably
address@hidden should not make --check-order the default, as we documented this
address@hidden extension and so should continue to allow it
-.
-If the @option{--check-order} option is given, unsorted inputs will
-cause a fatal error message.  If the @option{--check-order} option is
-not given, a @acronym{GNU} extension is available: if the input has no
-unpairable lines the sort order can be any order that considers two
-fields to be equal if and only if the sort comparison described above
-considers them to be equal.
-For example:
+If the input has no unpairable lines, a @acronym{GNU} extension is
+available; the sort order can be any order that considers two fields
+to be equal if and only if the sort comparison described above
+considers them to be equal.  For example:
 
 @example
 $ cat file1
@@ -5176,6 +5169,19 @@ c c1 c2
 b b1 b2
 @end example
 
+If the @option{--check-order} option is given, unsorted inputs will
+cause a fatal error message.  If the option @option{--nocheck-order}
+is given, unsorted inputs will never cause an error message.  If
+neither of these options is given, wrongly sorted inputs are diagnosed
+only if an input file is found to contain unpairable lines.  If an
+input file is diagnosed as being unsorted, the @command{join} command
+will exit with a nonzero status (and the output should not be used).
+
+Forcing @command{join} to process wrongly sorted input files
+containing unpairable lines by specifying @option{--nocheck-order} is
+not guaranteed to produce any particular output.  The output will
+probably not correspond with whatever you hoped it would be.
+
 The defaults are:
 @itemize
 @item the join field is the first field in each line;
@@ -5196,7 +5202,7 @@ Print a line for each unpairable line in file 
@var{file-number} (either
 @samp{1} or @samp{2}), in addition to the normal output.
 
 @item --check-order
-Check that both input files are in sorted order.
+Fail with an error message if either input file is wrongly ordered.
 
 @item --nocheck-order
 Do not check that both input files are in sorted order.  This is the default.
diff --git a/src/join.c b/src/join.c
index 67f81b5..925835e 100644
--- a/src/join.c
+++ b/src/join.c
@@ -90,6 +90,12 @@ static bool print_unpairables_1, print_unpairables_2;
 /* If nonzero, print pairable lines.  */
 static bool print_pairables;
 
+/* If nonzero, we have seen at least one unpairable line. */
+static bool seen_unpairable;
+
+/* If nonzero, we have warned about disorder in that file. */
+static bool issued_disorder_warning[2];
+
 /* Empty output field filler.  */
 static char const *empty_filler;
 
@@ -109,7 +115,13 @@ static struct outlist *outlist_end = &outlist_head;
 static int tab = -1;
 
 /* If nonzero, check that the input is correctly ordered. */
-static bool check_input_order = false;
+enum
+  {
+  CHECK_ORDER_DEFAULT,
+  CHECK_ORDER_ENABLED,
+  CHECK_ORDER_DISABLED
+  } check_input_order;
+
 
 enum
 {
@@ -304,15 +316,12 @@ get_line (FILE *fp, struct line *line, int which)
   line->fields = NULL;
   xfields (line);
 
-  if (check_input_order)
+  if (prevline[which-1])
     {
-      if (prevline[which-1])
-       {
-         checkorder (prevline[which-1], line, which);
-         freeline (prevline[which-1]);
-       }
-      prevline[which-1] = dupline (line);
+      checkorder (prevline[which-1], line, which);
+      freeline (prevline[which-1]);
     }
+  prevline[which-1] = dupline (line);
   return true;
 }
 
@@ -424,17 +433,44 @@ keycmp (struct line const *line1, struct line const 
*line2)
   return len1 < len2 ? -1 : len1 != len2;
 }
 
+
+
+/* Check that successive input lines PREV and CURRENT from input file
+ * WHATFILE are presented in order, unless the user may be relying on
+ * the GNU extension that input lines may be out of order if no input
+ * lines are unpairable.
+ *
+ * If the user specified --nocheck-order, the check is not made.
+ * If the user specified --check-order, the problem is fatal.
+ * Otherwise (the default), the message is simply a warning.
+ *
+ * A message is printed at most once per input file.
+ */
 static void
 checkorder (const struct line *prev,
            const struct line *current,
            int whatfile)
 {
-  if (keycmp (prev, current) > 0)
+  if (check_input_order != CHECK_ORDER_DISABLED
+      && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
     {
-      error (EXIT_FAILURE, 0, _("File %d is not in sorted order"), whatfile);
+      if (!issued_disorder_warning[whatfile-1])
+       {
+         if (keycmp (prev, current) > 0)
+           {
+             error ((check_input_order == CHECK_ORDER_ENABLED ? 1 : 0),
+                    0, _("File %d is not in sorted order"), whatfile);
+             
+             /* If we get to here, the message was just a warning, but we
+                want only to issue it once. */
+             issued_disorder_warning[whatfile-1] = true;
+           }
+       }
     }
 }
 
+
+
 /* Print field N of LINE if it exists and is nonempty, otherwise
    `empty_filler' if it is nonempty.  */
 
@@ -545,7 +581,7 @@ join (FILE *fp1, FILE *fp2)
   struct seq seq1, seq2;
   struct line line;
   int diff;
-  bool eof1, eof2;
+  bool eof1, eof2, checktail;
 
   /* Read the first line of each file.  */
   initseq (&seq1);
@@ -562,6 +598,7 @@ join (FILE *fp1, FILE *fp2)
          if (print_unpairables_1)
            prjoin (&seq1.lines[0], &uni_blank);
          advance_seq (fp1, &seq1, true, 1);
+         seen_unpairable = true;
          continue;
        }
       if (diff > 0)
@@ -569,6 +606,7 @@ join (FILE *fp1, FILE *fp2)
          if (print_unpairables_2)
            prjoin (&uni_blank, &seq2.lines[0]);
          advance_seq (fp2, &seq2, true, 2);
+         seen_unpairable = true;
          continue;
        }
 
@@ -627,25 +665,46 @@ join (FILE *fp1, FILE *fp2)
        seq2.count = 0;
     }
 
-  if (print_unpairables_1 && seq1.count)
+  /* If the user did not specify --check-order, and the we read the
+   * tail ends of both inputs to verify that they are in order.  We
+   * skip the rest of the tail once we have issued a warning for that
+   * file, unless we actually need to print the unpairable lines.
+   */
+  if (check_input_order != CHECK_ORDER_DISABLED
+      && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
+    checktail = true;
+  else
+    checktail = false;
+  
+  if ((print_unpairables_1 || checktail) && seq1.count)
     {
-      prjoin (&seq1.lines[0], &uni_blank);
+      if (print_unpairables_1)
+       prjoin (&seq1.lines[0], &uni_blank);
       freeline (&seq1.lines[0]);
+      seen_unpairable = true;
       while (get_line (fp1, &line, 1))
        {
-         prjoin (&line, &uni_blank);
+         if (print_unpairables_1)
+           prjoin (&line, &uni_blank);
          freeline (&line);
+         if (issued_disorder_warning[0] && !print_unpairables_1)
+           break;
        }
     }
 
-  if (print_unpairables_2 && seq2.count)
+  if ((print_unpairables_2 || checktail) && seq2.count)
     {
-      prjoin (&uni_blank, &seq2.lines[0]);
+      if (print_unpairables_2)
+       prjoin (&uni_blank, &seq2.lines[0]);
       freeline (&seq2.lines[0]);
+      seen_unpairable = true;
       while (get_line (fp2, &line, 2))
        {
-         prjoin (&uni_blank, &line);
+         if (print_unpairables_2)
+           prjoin (&uni_blank, &line);
          freeline (&line);
+         if (issued_disorder_warning[1] && !print_unpairables_2)
+           break;
        }
     }
 
@@ -866,6 +925,8 @@ main (int argc, char **argv)
   atexit (close_stdout);
 
   print_pairables = true;
+  seen_unpairable = false;
+  issued_disorder_warning[0] = issued_disorder_warning[1] = false;
 
   while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
                              longopts, NULL))
@@ -1020,5 +1081,8 @@ main (int argc, char **argv)
   if (fclose (fp2) != 0)
     error (EXIT_FAILURE, errno, "%s", names[1]);
 
-  exit (EXIT_SUCCESS);
+  if (issued_disorder_warning[0] || issued_disorder_warning[1])
+    exit (EXIT_FAILURE);
+  else
+    exit (EXIT_SUCCESS);
 }
-- 
1.5.3.8





reply via email to

[Prev in Thread] Current Thread [Next in Thread]