[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
(no subject)
From: |
James Youngman |
Subject: |
(no subject) |
Date: |
Sat, 16 Feb 2008 15:46:56 +0000 (GMT) |
>From d3ffc5547f1d77131ebdd4641c422072f2743283 Mon Sep 17 00:00:00 2001
From: James Youngman <address@hidden>
Date: Sat, 16 Feb 2008 15:43:56 +0000
Subject: [PATCH] Implement join --check-order.
2008-02-16 James Youngman <address@hidden>
* src/join.c (join): Support --check-order and --nocheck-order.
For --check-order, verify that the input files are in sorted
order.
(usage): Mention --check-order and --nocheck-order.
(dupline): Save a copy of the previously-read input line so that
we can detect disorder on the input.
(get_line): Temporarily save a copy of the previous line (by
calling dupline) and check relative ordering (by calling
checkorder) before returning the newly-read line.
(getseq, join): Tell get_line which file we are reading from.
(advance_seq): New function, factoring out some of the code
commonly surrounding calls to getseq.
(checkorder): New function. Verifies that a pair of consecutive
input lines are in sorted order.
* coreutils.texi (join invocation): Document the new options
--check-order and --nocheck-order.
---
doc/coreutils.texi | 19 ++++++-
src/join.c | 129 ++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 122 insertions(+), 26 deletions(-)
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 23d0ab4..0dd4587 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5149,9 +5149,16 @@ sort a file on its default join field, but if you select
a non-default
locale, join field, separator, or comparison options, then you should
do so consistently between @command{join} and @command{sort}.
-As a @acronym{GNU} extension, if the input has no unpairable lines the
-sort order can be any order that considers two fields to be equal if and
-only if the sort comparison described above considers them to be equal.
address@hidden Unsorted inputs are a common cause of FAQs, but we probably
address@hidden should not make --check-order the default, as we documented this
address@hidden extension and so should continue to allow it
+.
+If the @option{--check-order} option is given, unsorted inputs will
+cause a fatal error message. If the @option{--check-order} option is
+not given, a @acronym{GNU} extension is available: if the input has no
+unpairable lines the sort order can be any order that considers two
+fields to be equal if and only if the sort comparison described above
+considers them to be equal.
For example:
@example
@@ -5188,6 +5195,12 @@ The program accepts the following options. Also see
@ref{Common options}.
Print a line for each unpairable line in file @var{file-number} (either
@samp{1} or @samp{2}), in addition to the normal output.
address@hidden --check-order
+Check that both input files are in sorted order.
+
address@hidden --nocheck-order
+Do not check that both input files are in sorted order. This is the default.
+
@item -e @var{string}
@opindex -e
Replace those output fields that are missing in the input with
diff --git a/src/join.c b/src/join.c
index a6ca7e4..2a5147d 100644
--- a/src/join.c
+++ b/src/join.c
@@ -108,9 +108,21 @@ static struct outlist *outlist_end = &outlist_head;
tab character whose value (when cast to unsigned char) equals TAB. */
static int tab = -1;
+/* If nonzero, check that the input is correctly ordered. */
+static bool check_input_order = false;
+
+enum
+{
+ CHECK_ORDER_OPTION = CHAR_MAX + 1,
+ NOCHECK_ORDER_OPTION
+};
+
+
static struct option const longopts[] =
{
{"ignore-case", no_argument, NULL, 'i'},
+ {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
+ {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -122,6 +134,9 @@ static struct line uni_blank;
/* If nonzero, ignore case when comparing join fields. */
static bool ignore_case;
+
+static void checkorder (const struct line *, const struct line *, int);
+
void
usage (int status)
{
@@ -153,6 +168,8 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read
standard input.\n\
-v FILENUM like -a FILENUM, but suppress joined output lines\n\
-1 FIELD join on this FIELD of file 1\n\
-2 FIELD join on this FIELD of file 2\n\
+ --check-order check that the input is correctly sorted\n\
+ --nocheck-order do not check that the input is correctly sorted\n\
"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -228,12 +245,49 @@ xfields (struct line *line)
extract_field (line, ptr, lim - ptr);
}
+struct line*
+dupline (const struct line *old)
+{
+ struct line *newline = xmalloc (sizeof *newline);
+ size_t i;
+
+ /* Duplicate the buffer. */
+ initbuffer (&newline->buf);
+ newline->buf.buffer = xmalloc (old->buf.size);
+ newline->buf.size = old->buf.size;
+ memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length);
+ newline->buf.length = old->buf.length;
+
+ /* Duplicate the field positions. */
+ newline->fields = xmalloc (sizeof *newline->fields * old->nfields_allocated);
+ newline->nfields = old->nfields;
+ newline->nfields_allocated = old->nfields_allocated;
+
+ for (i=0; i<old->nfields; i++)
+ {
+ newline->fields[i].len = old->fields[i].len;
+ newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg
+ - old->buf.buffer);
+ }
+ return newline;
+}
+
+static void
+freeline (struct line *line)
+{
+ free (line->fields);
+ free (line->buf.buffer);
+ line->buf.buffer = NULL;
+}
+
/* Read a line from FP into LINE and split it into fields.
Return true if successful. */
static bool
-get_line (FILE *fp, struct line *line)
+get_line (FILE *fp, struct line *line, int which)
{
+ struct line *old = check_input_order ? dupline (line) : NULL;
+
initbuffer (&line->buf);
if (! readlinebuffer (&line->buf, fp))
@@ -242,6 +296,8 @@ get_line (FILE *fp, struct line *line)
error (EXIT_FAILURE, errno, _("read error"));
free (line->buf.buffer);
line->buf.buffer = NULL;
+ if (check_input_order)
+ freeline (old);
return false;
}
@@ -249,15 +305,13 @@ get_line (FILE *fp, struct line *line)
line->nfields = 0;
line->fields = NULL;
xfields (line);
- return true;
-}
-static void
-freeline (struct line *line)
-{
- free (line->fields);
- free (line->buf.buffer);
- line->buf.buffer = NULL;
+ if (check_input_order)
+ {
+ checkorder (old, line, which);
+ freeline (old);
+ }
+ return true;
}
static void
@@ -271,12 +325,12 @@ initseq (struct seq *seq)
/* Read a line from FP and add it to SEQ. Return true if successful. */
static bool
-getseq (FILE *fp, struct seq *seq)
+getseq (FILE *fp, struct seq *seq, int whichfile)
{
if (seq->count == seq->alloc)
seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
- if (get_line (fp, &seq->lines[seq->count]))
+ if (get_line (fp, &seq->lines[seq->count], whichfile))
{
++seq->count;
return true;
@@ -284,6 +338,20 @@ getseq (FILE *fp, struct seq *seq)
return false;
}
+/* Read a line from FP and add it to SEQ, as the first item if FIRST is
+ * true, else as the next.
+ */
+static bool
+advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
+{
+ if (first)
+ {
+ freeline (&seq->lines[0]);
+ seq->count = 0;
+ }
+ return getseq (fp, seq, whichfile);
+}
+
static void
delseq (struct seq *seq)
{
@@ -354,6 +422,17 @@ keycmp (struct line const *line1, struct line const *line2)
return len1 < len2 ? -1 : len1 != len2;
}
+static void
+checkorder (const struct line *prev,
+ const struct line *current,
+ int whatfile)
+{
+ if (keycmp (prev, current) > 0)
+ {
+ error (EXIT_FAILURE, 0, _("File %d is not in sorted order"), whatfile);
+ }
+}
+
/* Print field N of LINE if it exists and is nonempty, otherwise
`empty_filler' if it is nonempty. */
@@ -468,9 +547,9 @@ join (FILE *fp1, FILE *fp2)
/* Read the first line of each file. */
initseq (&seq1);
- getseq (fp1, &seq1);
+ getseq (fp1, &seq1, 1);
initseq (&seq2);
- getseq (fp2, &seq2);
+ getseq (fp2, &seq2, 2);
while (seq1.count && seq2.count)
{
@@ -480,18 +559,14 @@ join (FILE *fp1, FILE *fp2)
{
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
- freeline (&seq1.lines[0]);
- seq1.count = 0;
- getseq (fp1, &seq1);
+ advance_seq (fp1, &seq1, true, 1);
continue;
}
if (diff > 0)
{
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
- freeline (&seq2.lines[0]);
- seq2.count = 0;
- getseq (fp2, &seq2);
+ advance_seq (fp2, &seq2, true, 2);
continue;
}
@@ -499,7 +574,7 @@ join (FILE *fp1, FILE *fp2)
match the current line from file2. */
eof1 = false;
do
- if (!getseq (fp1, &seq1))
+ if (!advance_seq (fp1, &seq1, false, 1))
{
eof1 = true;
++seq1.count;
@@ -511,7 +586,7 @@ join (FILE *fp1, FILE *fp2)
match the current line from file1. */
eof2 = false;
do
- if (!getseq (fp2, &seq2))
+ if (!advance_seq (fp2, &seq2, false, 2))
{
eof2 = true;
++seq2.count;
@@ -554,7 +629,7 @@ join (FILE *fp1, FILE *fp2)
{
prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
- while (get_line (fp1, &line))
+ while (get_line (fp1, &line, 1))
{
prjoin (&line, &uni_blank);
freeline (&line);
@@ -565,7 +640,7 @@ join (FILE *fp1, FILE *fp2)
{
prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
- while (get_line (fp2, &line))
+ while (get_line (fp2, &line, 2))
{
prjoin (&uni_blank, &line);
freeline (&line);
@@ -875,6 +950,14 @@ main (int argc, char **argv)
}
break;
+ case NOCHECK_ORDER_OPTION:
+ check_input_order = false;
+ break;
+
+ case CHECK_ORDER_OPTION:
+ check_input_order = true;
+ break;
+
case 1: /* Non-option argument. */
add_file_name (optarg, names, operand_status, joption_count,
&nfiles, &prev_optc_status, &optc_status);
--
1.5.3.8
- (no subject),
James Youngman <=