bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

join with header line support


From: Assaf Gordon
Subject: join with header line support
Date: Wed, 04 Nov 2009 18:38:17 -0500
User-agent: Mozilla-Thunderbird 2.0.0.22 (X11/20090707)

Hello,

Here's an improved version of the '--header' feature for join, with tests, NEWS, and doc updates.

Reminder: with this option, one can join files even if they contain a header line as the first line.

I'll be happy to provide more examples and use cases, if needed.

The patch is also available here:
http://cancan.cshl.edu/labmembers/gordon/coreutils8/join_header.patch

Comments are welcome,
 -gordon





NEWS               |    3 +++
doc/coreutils.texi |    4 ++++
src/join.c         |   23 ++++++++++++++++++++++-
tests/misc/join    |   21 +++++++++++++++++++++
4 files changed, 50 insertions(+), 1 deletions(-)

diff --git a/NEWS b/NEWS
index 03ed83f..4a17a4d 100644
--- a/NEWS
+++ b/NEWS
@@ -66,6 +66,9 @@ GNU coreutils NEWS                                    -*- 
outline -*-
 touch now accepts the option --no-dereference (-h), as a means to
 change symlink timestamps on platforms with enough support.

+  join now accepts the option --header, treating the first line of
+  each input file as a header line; the header lines are joined and
+  printed without checking for ordering.

* Noteworthy changes in release 8.0 (2009-10-06) [beta]

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index ec5bcfb..62dfe55 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5511,6 +5511,10 @@ Do not check that both input files are in sorted order.  
This is the default.
Replace those output fields that are missing in the input with
@var{string}.

+@item --header
+@opindex --header
+Treat the first line of each input file as a header line. The header lines 
will be joined and printed as the first output line. If @option{-o} is used to 
specify output format, the header line will be printed according  to the 
specified format. Even if @option{--check-order} is used, the header lines will 
not be checked for ordering.
+
@item -i
@itemx --ignore-case
@opindex -i
diff --git a/src/join.c b/src/join.c
index d734a91..bb8009f 100644
--- a/src/join.c
+++ b/src/join.c
@@ -137,7 +137,8 @@ static enum
enum
{
 CHECK_ORDER_OPTION = CHAR_MAX + 1,
-  NOCHECK_ORDER_OPTION
+  NOCHECK_ORDER_OPTION,
+  HEADER_LINE_OPTION
};


@@ -146,6 +147,7 @@ static struct option const longopts[] =
 {"ignore-case", no_argument, NULL, 'i'},
 {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
 {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
+  {"header", no_argument, NULL, HEADER_LINE_OPTION},
 {GETOPT_HELP_OPTION_DECL},
 {GETOPT_VERSION_OPTION_DECL},
 {NULL, 0, NULL, 0}
@@ -157,6 +159,10 @@ static struct line uni_blank;
/* If nonzero, ignore case when comparing join fields.  */
static bool ignore_case;

+/* If nonzero, treat the first line of each file as column headers -
+   join them without checking for ordering */
+static bool join_header_lines;
+
void
usage (int status)
{
@@ -191,6 +197,8 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read 
standard input.\n\
 --check-order     check that the input is correctly sorted, even\n\
                     if all input lines are pairable\n\
 --nocheck-order   do not check that the input is correctly sorted\n\
+  --header          treat first line in each file as field header line,\n\
+                      print them without trying to pair them.\n\
"), stdout);
     fputs (HELP_OPTION_DESCRIPTION, stdout);
     fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -616,6 +624,15 @@ join (FILE *fp1, FILE *fp2)
 initseq (&seq2);
 getseq (fp2, &seq2, 2);

+  if (join_header_lines && seq1.count && seq2.count)
+    {
+      prjoin(seq1.lines[0], seq2.lines[0]);
+      prevline[0] = NULL;
+      prevline[1] = NULL;
+      advance_seq (fp1, &seq1, true, 1);
+      advance_seq (fp2, &seq2, true, 2);
+    }
+
 while (seq1.count && seq2.count)
   {
     size_t i;
@@ -1052,6 +1069,10 @@ main (int argc, char **argv)
                        &nfiles, &prev_optc_status, &optc_status);
         break;

+        case HEADER_LINE_OPTION:
+          join_header_lines = true;
+          break;
+
       case_GETOPT_HELP_CHAR;

       case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
diff --git a/tests/misc/join b/tests/misc/join
index d1f1677..6433e28 100755
--- a/tests/misc/join
+++ b/tests/misc/join
@@ -185,6 +185,27 @@ my @tv = (
# Before 6.10.143, this would mistakenly fail with the diagnostic:
# join: File 1 is not in sorted order
['chkodr-7', '-12', ["2 a\n1 b\n", ""], "", 0],
+
+# Test '--header' feature
+['header-1', '--header',
+ [ "ID Name\n1 A\n2 B\n", "ID Color\n1 red\n"], "ID Name Color\n1 A red\n", 0],
+
+# '--header' with '--check-order' : The header line is out-of-order but the
+# actual data is in order. This join should succeed.
+['header-2', '--header --check-order',
+ ["ID Name\n1 A\n2 B\n", "ID Color\n2 green\n"], "ID Name Color\n2 B green\n", 
0],
+
+# '--header' with '--check-order' : The header line is out-of-order AND the
+# actual data is out-of-order. This join should fail.
+['header-3', '--header --check-order',
+ ["ID Name\n2 B\n1 A\n", "ID Color\n2 blue\n"], "ID Name Color\n", 1,
+  "$prog: file 1 is not in sorted order\n"],
+
+# '--header' with specific output format '-o'.
+# output header line should respect the requested format
+['header-4', '--header -o "0,1.3,2.2"',
+ ["ID Group Name\n1 Foo A\n2 Bar B\n", "ID Color\n2 blue\n"], "ID Name Color\n2 B 
blue\n", 0],
+
);

# Convert the above old-style test vectors to the newer





reply via email to

[Prev in Thread] Current Thread [Next in Thread]