>From 29746dc55e6176934388c772dbe70012859897ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Wed, 5 Jan 2011 11:52:54 +0000 Subject: [PATCH] join: add -o 'auto' to output a constant number of fields per line Lines with a different number of fields than the first line will be truncated or padded. * src/join.c (prfields): A new function refactored from prjoin(), to output all but the join field. (prjoin): Don't swap line1 and line2 when line1 is blank so that the padding is applied to the right place. (main): Handle the -o 'auto' option. * tests/misc/join: Add 6 new cases to test the auto format. * NEWS: Mention the change in behavior. Suggestion from Assaf Gordon --- NEWS | 6 +++ doc/coreutils.texi | 19 ++++++++--- src/join.c | 88 +++++++++++++++++++++++++++++++++------------------ tests/misc/join | 20 ++++++++++++ 4 files changed, 96 insertions(+), 37 deletions(-) diff --git a/NEWS b/NEWS index 9ccad63..a9d329a 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,12 @@ GNU coreutils NEWS -*- outline -*- rm -f no longer fails for EINVAL or EILSEQ on file systems that reject file names invalid for that file system. +** New features + + join now supports -o 'auto' which will automatically infer the + output format from the first line in each file, to ensure + the same number of fields are output for each line. + * Noteworthy changes in release 8.9 (2011-01-04) [stable] diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 85d5201..9397ab3 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5675,8 +5675,8 @@ Do not check that both input files are in sorted order. This is the default. @item -e @var{string} @opindex -e -Replace those output fields that are missing in the input with -@var{string}. +Replace those output fields that are missing in the input with @var{string}. +I.e., missing fields specified with the @option{-12jo} options.
@item --header @opindex --header @@ -5707,10 +5707,17 @@ Join on field @var{field} (a positive integer) of file 2. Equivalent to @option{-1 @var{field} -2 @var{field}}. @item -o @var{field-list} -Construct each output line according to the format in @var{field-list}. -Each element in @var{field-list} is either the single character @samp{0} or -has the form @var{m.n} where the file number, @var{m}, is @samp{1} or -@samp{2} and @var{n} is a positive field number. +@itemx -o auto +If the keyword @samp{auto} is specified, infer the output format from +the first line in each file. This is the same as the default output format +but also ensures the same number of fields are output for each line. +Missing fields are replaced with the string given by the @option{-e} option and extra fields +are discarded. + +Otherwise, construct each output line according to the format in +@var{field-list}. Each element in @var{field-list} is either the single +character @samp{0} or has the form @var{m.n} where the file number, @var{m}, +is @samp{1} or @samp{2} and @var{n} is a positive field number. A field specification of @samp{0} denotes the join field. In most cases, the functionality of the @samp{0} field spec diff --git a/src/join.c b/src/join.c index afda5a1..bf7e908 100644 --- a/src/join.c +++ b/src/join.c @@ -112,6 +112,13 @@ static bool issued_disorder_warning[2]; /* Empty output field filler. */ static char const *empty_filler; +/* Whether to ensure the same number of fields are output from each line. */ +static bool autoformat; +/* The number of fields to output for each line. + Only significant when autoformat is true. */ +static size_t autocount_1; +static size_t autocount_2; + /* Field to join on; SIZE_MAX means they haven't been determined yet. */ static size_t join_field_1 = SIZE_MAX; static size_t join_field_2 = SIZE_MAX; @@ -210,7 +217,8 @@ else fields are separated by CHAR. Any FIELD is a field number counted\n\ from 1.
FORMAT is one or more comma or blank separated specifications,\n\ each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\ the remaining fields from FILE1, the remaining fields from FILE2, all\n\ -separated by CHAR.\n\ +separated by CHAR. If FORMAT is the keyword 'auto', then the first\n\ +line of each file determines the number of fields output for each line.\n\ \n\ Important: FILE1 and FILE2 must be sorted on the join fields.\n\ E.g., use ` sort -k 1b,1 ' if `join' has no options,\n\ @@ -527,6 +535,27 @@ prfield (size_t n, struct line const *line) fputs (empty_filler, stdout); } +/* Output all the fields in LINE, other than the join field JOIN_FIELD, each preceded by the output separator. When autoformat is set, AUTOCOUNT is used as the number of fields in place of LINE->nfields, so every line emits the same number of fields. */ + +static void +prfields (struct line const *line, size_t join_field, size_t autocount) +{ + size_t i; + size_t nfields = autoformat ? autocount : line->nfields; + char output_separator = tab < 0 ? ' ' : tab; + + for (i = 0; i < join_field && i < nfields; ++i) + { + putchar (output_separator); + prfield (i, line); + } + for (i = join_field + 1; i < nfields; ++i) + { + putchar (output_separator); + prfield (i, line); + } +} + /* Print the join of LINE1 and LINE2. */ static void @@ -534,6 +563,8 @@ prjoin (struct line const *line1, struct line const *line2) { const struct outlist *outlist; char output_separator = tab < 0 ? 
' ' : tab; + size_t field; + struct line const *line; outlist = outlist_head.next; if (outlist) @@ -543,9 +574,6 @@ prjoin (struct line const *line1, struct line const *line2) o = outlist; while (1) { - size_t field; - struct line const *line; - if (o->file == 0) { if (line1 == &uni_blank) @@ -574,37 +602,24 @@ prjoin (struct line const *line1, struct line const *line2) } else { - size_t i; - if (line1 == &uni_blank) { - struct line const *t; - t = line1; - line1 = line2; - line2 = t; + line = line2; + field = join_field_2; } - prfield (join_field_1, line1); - for (i = 0; i < join_field_1 && i < line1->nfields; ++i) - { - putchar (output_separator); - prfield (i, line1); - } - for (i = join_field_1 + 1; i < line1->nfields; ++i) + else { - putchar (output_separator); - prfield (i, line1); + line = line1; + field = join_field_1; } - for (i = 0; i < join_field_2 && i < line2->nfields; ++i) - { - putchar (output_separator); - prfield (i, line2); - } - for (i = join_field_2 + 1; i < line2->nfields; ++i) - { - putchar (output_separator); - prfield (i, line2); - } + /* Output the join field. */ + prfield (field, line); + + /* Output other fields. */ + prfields (line1, join_field_1, autocount_1); + prfields (line2, join_field_2, autocount_2); + putchar ('\n'); } } @@ -627,6 +642,12 @@ join (FILE *fp1, FILE *fp2) initseq (&seq2); getseq (fp2, &seq2, 2); + if (autoformat) + { + autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0; + autocount_2 = seq2.count ? 
seq2.lines[0]->nfields : 0; + } + if (join_header_lines && seq1.count && seq2.count) { prjoin (seq1.lines[0], seq2.lines[0]); @@ -1037,8 +1058,13 @@ main (int argc, char **argv) break; case 'o': - add_field_list (optarg); - optc_status = MIGHT_BE_O_ARG; + if (STREQ (optarg, "auto")) + autoformat = true; + else + { + add_field_list (optarg); + optc_status = MIGHT_BE_O_ARG; + } break; case 't': diff --git a/tests/misc/join b/tests/misc/join index 3696a03..3cf278b 100755 --- a/tests/misc/join +++ b/tests/misc/join @@ -127,6 +127,26 @@ my @tv = ( # From David Dyck ['9a', '', [" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0], +# -o 'auto' +['10a', '-a1 -a2 -e . -o auto', + ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"], + "a 1 2 3 4\nb 1 . 3 4\nc . . 3 4\nd 1 2 . .\n", 0], +['10b', '-a1 -a2 -j3 -e . -o auto', + ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"], + "2 a 1 . .\n. b 1 . .\n2 d 1 . .\n4 . . a 3\n4 . . b 3\n4 . . c 3\n"], +['10c', '-a1 -1 1 -2 4 -e. -o auto', + ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"], + "a 1 2 . . .\nb 1 . . . .\nd 1 2 . . .\n"], +['10d', '-a2 -1 1 -2 4 -e. -o auto', + ["a 1 2\nb 1\nd 1 2\n", "a 3 4\nb 3 4\nc 3 4\n"], + ". . . a 3 4\n. . . b 3 4\n. . . c 3 4\n"], +['10e', '-o auto', + ["a 1 2\nb 1 2 discard\n", "a 3 4\nb 3 4 discard\n"], + "a 1 2 3 4\nb 1 2 3 4\n"], +['10f', '-t, -o auto', + ["a,1,,2\nb,1,2\n", "a,3,4\nb,3,4\n"], + "a,1,,2,3,4\nb,1,2,,3,4\n"], + # From Tim Smithers: fixed in 1.22l ['trailing-sp', '-t: -1 1 -2 1', ["a:x \n", "a:y \n"], "a:x :y \n", 0], -- 1.7.3.4