From 1a651ab6aedea0d0cc383f2e60c82fe7f0d395f0 Mon Sep 17 00:00:00 2001 From: Bo Borgerson Date: Sun, 20 Apr 2008 21:24:16 -0400 Subject: [PATCH] Make comm check order of input files * NEWS: List new behavior. * doc/coreutils.texi (checkOrderOption) New macro for describing `--check-order' and `--nocheck-order', used in both join and comm. * src/comm.c (main): Initialize new options. (usage): Describe new options. (compare_files): Keep an extra pair of buffers for the previous line from each file to check the internal order. (check_order): If an order-check is required, compare and handle the result appropriately. (copylinebuffer): Copy a linebuffer; used for copy before read. * tests/misc/Makefile.am: List new test. * tests/misc/comm: Tests for the comm program, including the new order-checking functionality and attendant command-line options. Signed-off-by: Bo Borgerson --- NEWS | 8 +++ doc/coreutils.texi | 39 ++++++++---- src/comm.c | 158 +++++++++++++++++++++++++++++++++++++++++++----- tests/misc/Makefile.am | 1 + tests/misc/comm | 121 ++++++++++++++++++++++++++++++++++++ 5 files changed, 300 insertions(+), 27 deletions(-) create mode 100755 tests/misc/comm diff --git a/NEWS b/NEWS index 04893c6..4038da2 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,13 @@ GNU coreutils NEWS -*- outline -*- +* Noteworthy changes in release ?? + +** New features + + comm now verifies that the inputs are in sorted order. This check can + be turned off with the --nocheck-order option. + + * Noteworthy changes in release 6.11 (2008-04-19) [stable] ** Bug fixes diff --git a/doc/coreutils.texi b/doc/coreutils.texi index f42e736..5ed7f43 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -4342,6 +4342,32 @@ status that does not depend on the result of the comparison. Upon normal completion @command{comm} produces an exit code of zero. If there is an error it exits with nonzero status. address@hidden checkOrderOption{cmd} +If the @option{--check-order} option is given, unsorted inputs will +cause a fatal error message. If the option @option{--nocheck-order} +is given, unsorted inputs will never cause an error message. If +neither of these options is given, wrongly sorted inputs are diagnosed +only if an input file is found to contain unpairable lines. If an +input file is diagnosed as being unsorted, the @command{\cmd\} command +will exit with a nonzero status (and the output should not be used). + +Forcing @command{\cmd\} to process wrongly sorted input files +containing unpairable lines by specifying @option{--nocheck-order} is +not guaranteed to produce any particular output. The output will +probably not correspond with whatever you hoped it would be. address@hidden macro address@hidden + address@hidden @samp + address@hidden --check-order +Fail with an error message if either input file is wrongly ordered. + address@hidden --nocheck-order +Do not check that both input files are in sorted order. + address@hidden table + @node tsort invocation @section @command{tsort}: Topological sort @@ -5183,18 +5209,7 @@ c c1 c2 b b1 b2 @end example -If the @option{--check-order} option is given, unsorted inputs will -cause a fatal error message. If the option @option{--nocheck-order} -is given, unsorted inputs will never cause an error message. If -neither of these options is given, wrongly sorted inputs are diagnosed -only if an input file is found to contain unpairable lines. If an -input file is diagnosed as being unsorted, the @command{join} command -will exit with a nonzero status (and the output should not be used). - -Forcing @command{join} to process wrongly sorted input files -containing unpairable lines by specifying @option{--nocheck-order} is -not guaranteed to produce any particular output. The output will -probably not correspond with whatever you hoped it would be. address@hidden The defaults are: @itemize diff --git a/src/comm.c b/src/comm.c index cbda362..5b1e5a2 100644 --- a/src/comm.c +++ b/src/comm.c @@ -52,8 +52,31 @@ static bool only_file_2; /* If true, print lines that are found in both files. */ static bool both; +/* If nonzero, we have seen at least one unpairable line. */ +static bool seen_unpairable; + +/* If nonzero, we have warned about disorder in that file. */ +static bool issued_disorder_warning[2]; + +/* If nonzero, check that the input is correctly ordered. */ +static enum + { + CHECK_ORDER_DEFAULT, + CHECK_ORDER_ENABLED, + CHECK_ORDER_DISABLED + } check_input_order; + +enum +{ + CHECK_ORDER_OPTION = CHAR_MAX + 1, + NOCHECK_ORDER_OPTION +}; + + static struct option const long_options[] = { + {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, + {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -88,6 +111,12 @@ and column three contains lines common to both files.\n\ -2 suppress lines unique to FILE2\n\ -3 suppress lines that appear in both files\n\ "), stdout); + fputs (_("\ +\n\ + --check-order check that the input is correctly sorted, even\n\ + if all input lines are pairable\n\ + --nocheck-order do not check that the input is correctly sorted\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); emit_bug_reporting_address (); @@ -133,6 +162,71 @@ writeline (const struct linebuffer *line, FILE *stream, int class) fwrite (line->buffer, sizeof (char), line->length, stream); } + +/* This just does a copy of the source linebuffer into the dest + linebuffer. + FIXME: Should this be a part of lib/linebuffer.c? */ + +static struct linebuffer * +copylinebuffer (struct linebuffer *dest, struct linebuffer *source) +{ + if (source->size > dest->size) + { + dest->buffer = xrealloc (dest->buffer, source->size); + dest->size = source->size; + } + memcpy (dest->buffer, source->buffer, source->length); + dest->length = source->length; + return dest; +} + +/* Check that successive input lines PREV and CURRENT from input file + WHATFILE are presented in order. + + If the user specified --nocheck-order, the check is not made. + If the user specified --check-order, the problem is fatal. + Otherwise (the default), the message is simply a warning. + + A message is printed at most once per input file. + + This funtion was copied (nearly) verbatim from `src/join.c'. */ + +static void +check_order (const struct linebuffer *prev, + const struct linebuffer *current, + int whatfile) +{ + + if (check_input_order != CHECK_ORDER_DISABLED + && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable)) + { + if (!issued_disorder_warning[whatfile-1]) + { + int order; + + if (hard_LC_COLLATE) + order = xmemcoll (prev->buffer, prev->length - 1, + current->buffer, current->length - 1); + else + { + size_t len = min (prev->length, current->length) - 1; + order = memcmp (prev->buffer, current->buffer, len); + } + + if (order > 0) + { + error ((check_input_order == CHECK_ORDER_ENABLED + ? EXIT_FAILURE : 0), + 0, _("file %d is not in sorted order"), whatfile); + + /* If we get to here, the message was just a warning, but we + want only to issue it once. */ + issued_disorder_warning[whatfile-1] = true; + } + } + } +} + /* Compare INFILES[0] and INFILES[1]. If either is "-", use the standard input for that file. Assume that each input file is sorted; @@ -141,13 +235,18 @@ writeline (const struct linebuffer *line, FILE *stream, int class) static void compare_files (char **infiles) { - /* For each file, we have one linebuffer in lb1. */ + /* For each file, we have one linebuffer in lb1, and one in lbp. */ struct linebuffer lb1[2]; + struct linebuffer lbp[2]; /* thisline[i] points to the linebuffer holding the next available line in file i, or is NULL if there are no lines left in that file. */ struct linebuffer *thisline[2]; + /* prevline[i] points to the linebuffer holding the previous line in + file i. */ + struct linebuffer *prevline[2]; + /* streams[i] holds the input stream for file i. */ FILE *streams[2]; @@ -157,7 +256,9 @@ compare_files (char **infiles) for (i = 0; i < 2; i++) { initbuffer (&lb1[i]); + initbuffer (&lbp[i]); thisline[i] = &lb1[i]; + prevline[i] = &lbp[i]; streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r")); if (!streams[i]) error (EXIT_FAILURE, errno, "%s", infiles[i]); @@ -170,6 +271,7 @@ compare_files (char **infiles) while (thisline[0] || thisline[1]) { int order; + bool fill_up[2] = {false, false}; /* Compare the next available lines of the two files. */ @@ -196,25 +298,36 @@ compare_files (char **infiles) /* Output the line that is lesser. */ if (order == 0) writeline (thisline[1], stdout, 3); - else if (order > 0) - writeline (thisline[1], stdout, 2); else - writeline (thisline[0], stdout, 1); + { + seen_unpairable = true; + if (order > 0) + writeline (thisline[1], stdout, 2); + else + writeline (thisline[0], stdout, 1); + } /* Step the file the line came from. If the files match, step both files. */ if (order >= 0) - { - thisline[1] = readlinebuffer (thisline[1], streams[1]); - if (ferror (streams[1])) - error (EXIT_FAILURE, errno, "%s", infiles[1]); - } + fill_up[1] = true; if (order <= 0) - { - thisline[0] = readlinebuffer (thisline[0], streams[0]); - if (ferror (streams[0])) - error (EXIT_FAILURE, errno, "%s", infiles[0]); - } + fill_up[0] = true; + + for (i = 0; i < 2; i++) + if (fill_up[i]) + { + prevline[i] = copylinebuffer (prevline[i], thisline[i]); + thisline[i] = readlinebuffer (thisline[i], streams[i]); + + if (thisline[i]) + check_order (prevline[i], thisline[i], i + 1); + + if (ferror (streams[i])) + error (EXIT_FAILURE, errno, "%s", infiles[i]); + + fill_up[i] = false; + } } for (i = 0; i < 2; i++) @@ -240,6 +353,10 @@ main (int argc, char **argv) only_file_2 = true; both = true; + seen_unpairable = false; + issued_disorder_warning[0] = issued_disorder_warning[1] = false; + check_input_order = CHECK_ORDER_DEFAULT; + while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1) switch (c) { @@ -255,6 +372,14 @@ main (int argc, char **argv) both = false; break; + case NOCHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_DISABLED; + break; + + case CHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_ENABLED; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); @@ -280,5 +405,8 @@ main (int argc, char **argv) compare_files (argv + optind); - exit (EXIT_SUCCESS); + if (issued_disorder_warning[0] || issued_disorder_warning[1]) + exit (EXIT_FAILURE); + else + exit (EXIT_SUCCESS); } diff --git a/tests/misc/Makefile.am b/tests/misc/Makefile.am index 17a0ec0..83e0262 100644 --- a/tests/misc/Makefile.am +++ b/tests/misc/Makefile.am @@ -49,6 +49,7 @@ TESTS = \ chcon \ chcon-fail \ selinux \ + comm \ cut \ wc-files0-from \ wc-files0 \ diff --git a/tests/misc/comm b/tests/misc/comm new file mode 100755 index 0000000..ea894e1 --- /dev/null +++ b/tests/misc/comm @@ -0,0 +1,121 @@ +#!/bin/sh +# -*- perl -*- +# Test comm + +# Copyright (C) 2008 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +: ${srcdir=.} +. $srcdir/../require-perl + +me=`echo $0|sed 's,.*/,,'` +exec $PERL -w -I$srcdir/.. -MCoreutils -M"CuTmpdir qw($me)" -- - <<\EOF +#/ +require 5.003; +use strict; + +(my $program_name = $0) =~ s|.*/||; + +my $prog = 'comm'; + +# Turn off localization of executable's ouput. address@hidden(LANGUAGE LANG LC_ALL)} = ('C') x 3; + +my @inputs = ({IN=>{a=>"1\n3"}}, {IN=>{b=>"2\n3"}}); + +my @Tests = + ( + # basic operation + ['basic', @inputs, {OUT=>"1\n\t2\n\t\t3\n"} ], + + # supress lines unique to file 1 + ['opt-1', '-1', @inputs, {OUT=>"2\n\t3\n"} ], + + # supress lines unique to file 2 + ['opt-2', '-2', @inputs, {OUT=>"1\n\t3\n"} ], + + # supress lines that appear in both files + ['opt-3', '-3', @inputs, {OUT=>"1\n\t2\n"} ], + + # supress lines unique to file 1 and lines unique to file 2 + ['opt-12', '-1', '-2', @inputs, {OUT=>"3\n"} ], + + # supress lines unique to file 1 and those that appear in both files + ['opt-13', '-1', '-3', @inputs, {OUT=>"2\n"} ], + + # supress lines unique to file 2 and those that appear in both files + ['opt-23', '-2', '-3', @inputs, {OUT=>"1\n"} ], + + # supress all output (really?) + ['opt-123', '-1', '-2', '-3', @inputs, {OUT=>""} ], + + # invalid missing command line argument (1) + ['missing-arg1', $inputs[0], {EXIT=>1}, + {ERR => "$prog: missing operand after `a'\n" + . "Try `$prog --help' for more information.\n"}], + + # invalid missing command line argument (both) + ['missing-arg2', {EXIT=>1}, + {ERR => "$prog: missing operand\n" + . "Try `$prog --help' for more information.\n"}], + + # invalid extra command line argument + ['extra-arg', @inputs, 'no-such', {EXIT=>1}, + {ERR => "$prog: extra operand `no-such'\n" + . "Try `$prog --help' for more information.\n"}], + + # out-of-order input + ['ooo', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1}, + {OUT => "1\n\t\t3\n\t2\n"}, + {ERR => "$prog: file 2 is not in sorted order\n"}], + + # out-of-order input, fatal + ['ooo2', '--check-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1}, + {OUT => "1\n\t\t3\n"}, + {ERR => "$prog: file 2 is not in sorted order\n"}], + + # out-of-order input, ignored + ['ooo3', '--nocheck-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, + {OUT => "1\n\t\t3\n\t2\n"}], + + # both inputs out-of-order + # We haven't seen the terminal pair yet when we do the last check_order. + ['ooo4', {IN=>{a=>"3\n1\n0"}}, {IN=>{b=>"3\n2\n0"}}, {EXIT=>1}, + {OUT => "\t\t3\n1\n0\n\t2\n\t0\n"}, + {ERR => "$prog: file 1 is not in sorted order\n". + "$prog: file 2 is not in sorted order\n" }], + + # FIXME? The nature of SEEN_UNPAIRABLE makes this pass. + # We haven't seen the terminal pair yet when we do the last check_order. + ['ooo5', {IN=>{a=>"3\n1"}}, {IN=>{b=>"3\n2"}}, {EXIT=>0}, + {OUT => "\t\t3\n1\n\t2\n"}], + + # both inputs out-of-order, but fully pairable + ['ooo6', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}}, {EXIT=>0}, + {OUT => "\t\t2\n\t\t1\n\t\t0\n"}], + + # both inputs out-of-order, fully pairable, but forced to fail + ['ooo7', '--check-order', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}}, + {EXIT=>1}, + {OUT => "\t\t2\n"}, + {ERR => "$prog: file 1 is not in sorted order\n"}], + ); + +my $save_temps = $ENV{DEBUG}; +my $verbose = $ENV{VERBOSE}; + +my $fail = run_tests ($program_name, $prog, address@hidden, $save_temps, $verbose); +exit $fail; +EOF -- 1.5.2.5