From 29746dc55e6176934388c772dbe70012859897ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <address@hidden>
Date: Wed, 5 Jan 2011 11:52:54 +0000
Subject: [PATCH] join: add -o 'auto' to output a constant number of fields per line

Lines with a different number of fields than the first line
will be truncated or padded.

* src/join.c (prfields): A new function refactored from prjoin(),
to output all but the join field.
(prjoin): Don't swap line1 and line2 when line1 is blank
so that the padding is applied to the right place.
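(join): With -o 'auto', record the number of fields in the first
line of each file.
(main): Handle the -o 'auto' option.
* tests/misc/join: Add 6 new cases to test the auto format.
* NEWS: Mention the new feature.
* doc/coreutils.texi: Document -o 'auto'.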
Suggestion from Assaf Gordon
---
 NEWS               |    6 +++
 doc/coreutils.texi |   19 ++++++++---
 src/join.c         |   88 +++++++++++++++++++++++++++++++++------------------
 tests/misc/join    |   20 ++++++++++++
 4 files changed, 96 insertions(+), 37 deletions(-)

diff --git a/NEWS b/NEWS
index 9ccad63..a9d329a 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,12 @@ GNU coreutils NEWS                                    -*- outline -*-
   rm -f no longer fails for EINVAL or EILSEQ on file systems that
   reject file names invalid for that file system.
 
+** New features
+
+  join now supports -o 'auto' which will automatically infer the
+  output format from the first line in each file, to ensure
+  the same number of fields are output for each line.
+
 
 * Noteworthy changes in release 8.9 (2011-01-04) [stable]
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 85d5201..9397ab3 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5675,8 +5675,8 @@ Do not check that both input files are in sorted order.  This is the default.
 
 @item -e @var{string}
 @opindex -e
-Replace those output fields that are missing in the input with
-@var{string}.
+Replace those output fields that are missing in the input with @var{string}.
+I.e., missing fields specified with the @option{-12jo} options.
 
 @item --header
 @opindex --header
@@ -5707,10 +5707,17 @@ Join on field @var{field} (a positive integer) of file 2.
 Equivalent to @option{-1 @var{field} -2 @var{field}}.
 
 @item -o @var{field-list}
-Construct each output line according to the format in @var{field-list}.
-Each element in @var{field-list} is either the single character @samp{0} or
-has the form @var{m.n} where the file number, @var{m}, is @samp{1} or
-@samp{2} and @var{n} is a positive field number.
+@itemx -o auto
+If the keyword @samp{auto} is specified, infer the output format from
+the first line in each file.  This is the same as the default output format
+but also ensures the same number of fields are output for each line.
+Missing fields are replaced with the string given to the @option{-e} option,
+and extra fields are discarded.
+
+Otherwise, construct each output line according to the format in
+@var{field-list}.  Each element in @var{field-list} is either the single
+character @samp{0} or has the form @var{m.n} where the file number, @var{m},
+is @samp{1} or @samp{2} and @var{n} is a positive field number.
 
 A field specification of @samp{0} denotes the join field.
 In most cases, the functionality of the @samp{0} field spec
diff --git a/src/join.c b/src/join.c
index afda5a1..bf7e908 100644
--- a/src/join.c
+++ b/src/join.c
@@ -112,6 +112,13 @@ static bool issued_disorder_warning[2];
 /* Empty output field filler.  */
 static char const *empty_filler;
 
+/* Whether to ensure the same number of fields are output from each line.  */
+static bool autoformat;
+/* The number of fields to output for each line.
+   Only significant when autoformat is true.  */
+static size_t autocount_1;
+static size_t autocount_2;
+
 /* Field to join on; SIZE_MAX means they haven't been determined yet.  */
 static size_t join_field_1 = SIZE_MAX;
 static size_t join_field_2 = SIZE_MAX;
@@ -210,7 +217,8 @@ else fields are separated by CHAR.  Any FIELD is a field number counted\n\
 from 1.  FORMAT is one or more comma or blank separated specifications,\n\
 each being `FILENUM.FIELD' or `0'.  Default FORMAT outputs the join field,\n\
 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
-separated by CHAR.\n\
+separated by CHAR.  If FORMAT is the keyword 'auto', then the first\n\
+line of each file determines the number of fields output for each line.\n\
 \n\
 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
 E.g., use ` sort -k 1b,1 ' if `join' has no options,\n\
@@ -527,6 +535,27 @@ prfield (size_t n, struct line const *line)
     fputs (empty_filler, stdout);
 }
 
+/* Output all the fields in line, other than the join field.  */
+
+static void
+prfields (struct line const *line, size_t join_field, size_t autocount)
+{
+  size_t i;
+  size_t nfields = autoformat ? autocount : line->nfields;
+  char output_separator = tab < 0 ? ' ' : tab;
+
+  for (i = 0; i < join_field && i < nfields; ++i)
+    {
+      putchar (output_separator);
+      prfield (i, line);
+    }
+  for (i = join_field + 1; i < nfields; ++i)
+    {
+      putchar (output_separator);
+      prfield (i, line);
+    }
+}
+
 /* Print the join of LINE1 and LINE2.  */
 
 static void
@@ -534,6 +563,8 @@ prjoin (struct line const *line1, struct line const *line2)
 {
   const struct outlist *outlist;
   char output_separator = tab < 0 ? ' ' : tab;
+  size_t field;
+  struct line const *line;
 
   outlist = outlist_head.next;
   if (outlist)
@@ -543,9 +574,6 @@ prjoin (struct line const *line1, struct line const *line2)
       o = outlist;
       while (1)
         {
-          size_t field;
-          struct line const *line;
-
           if (o->file == 0)
             {
               if (line1 == &uni_blank)
@@ -574,37 +602,24 @@ prjoin (struct line const *line1, struct line const *line2)
     }
   else
     {
-      size_t i;
-
       if (line1 == &uni_blank)
         {
-          struct line const *t;
-          t = line1;
-          line1 = line2;
-          line2 = t;
+          line = line2;
+          field = join_field_2;
         }
-      prfield (join_field_1, line1);
-      for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
-        {
-          putchar (output_separator);
-          prfield (i, line1);
-        }
-      for (i = join_field_1 + 1; i < line1->nfields; ++i)
+      else
         {
-          putchar (output_separator);
-          prfield (i, line1);
+          line = line1;
+          field = join_field_1;
         }
 
-      for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
-        {
-          putchar (output_separator);
-          prfield (i, line2);
-        }
-      for (i = join_field_2 + 1; i < line2->nfields; ++i)
-        {
-          putchar (output_separator);
-          prfield (i, line2);
-        }
+      /* Output the join field.  */
+      prfield (field, line);
+
+      /* Output other fields.  */
+      prfields (line1, join_field_1, autocount_1);
+      prfields (line2, join_field_2, autocount_2);
+
       putchar ('\n');
     }
 }
@@ -627,6 +642,12 @@ join (FILE *fp1, FILE *fp2)
   initseq (&seq2);
   getseq (fp2, &seq2, 2);
 
+  if (autoformat)
+    {
+      autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0;
+      autocount_2 = seq2.count ? seq2.lines[0]->nfields : 0;
+    }
+
   if (join_header_lines && seq1.count && seq2.count)
     {
       prjoin (seq1.lines[0], seq2.lines[0]);
@@ -1037,8 +1058,13 @@ main (int argc, char **argv)
           break;
 
         case 'o':
-          add_field_list (optarg);
-          optc_status = MIGHT_BE_O_ARG;
+          if (STREQ (optarg, "auto"))
+            autoformat = true;
+          else
+            {
+              add_field_list (optarg);
+              optc_status = MIGHT_BE_O_ARG;
+            }
           break;
 
         case 't':
diff --git a/tests/misc/join b/tests/misc/join
index 3696a03..3cf278b 100755
--- a/tests/misc/join
+++ b/tests/misc/join
@@ -127,6 +127,26 @@ my @tv = (
 # From David Dyck
 ['9a', '', [" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0],
 
+# -o 'auto'
+['10a', '-a1 -a2 -e . -o auto',
+ ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"],
+ "a 1 2 3 4\nb 1 . 3 4\nc . . 3 4\nd 1 2 . .\n", 0],
+['10b', '-a1 -a2 -j3 -e . -o auto',
+ ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"],
+ "2 a 1 . .\n. b 1 . .\n2 d 1 . .\n4 . . a 3\n4 . . b 3\n4 . . c 3\n"],
+['10c', '-a1 -1 1 -2 4 -e. -o auto',
+ ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"],
+ "a 1 2 . . .\nb 1 . . . .\nd 1 2 . . .\n"],
+['10d', '-a2 -1 1 -2 4 -e. -o auto',
+ ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"],
+ ". . . a 3 4\n. . . b 3 4\n. . . c 3 4\n"],
+['10e', '-o auto',
+ ["a 1 2\nb 1 2 discard\n", "a 3 4\nb 3 4 discard\n"],
+ "a 1 2 3 4\nb 1 2 3 4\n"],
+['10f', '-t, -o auto',
+ ["a,1,,2\nb,1,2\n", "a,3,4\nb,3,4\n"],
+ "a,1,,2,3,4\nb,1,2,,3,4\n"],
+
 # From Tim Smithers: fixed in 1.22l
 ['trailing-sp', '-t: -1 1 -2 1', ["a:x \n", "a:y \n"], "a:x :y \n", 0],
 
-- 
1.7.3.4