bug-diffutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug-diffutils] bug#19835: RFC: diff: skip initial columns before comparing


From: Dave Gordon
Subject: [bug-diffutils] bug#19835: RFC: diff: skip initial columns before comparing
Date: Wed, 11 Feb 2015 14:33:21 +0000
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.4.0

When comparing certain types of files, notably timestamped logfiles
such as the output of dmesg(1), it's necessary to ignore the initial
characters on each line, otherwise every line is different. In the
simplest case, this can be done by applying 'cut(1)' to each input;
but then, important information about when the difference(s) occurred is
lost, and it can be difficult to find the relevant lines in the original
files, especially if they are highly repetitive (as logfiles often are).
What is needed in this situation is to ignore the timestamps for
purposes of comparison, but then include them in any lines copied to
the output.

So this patch adds a new option (long form only) "--ignore-initial=N" to
ignore the first N characters of each line. This is done by skipping the
first N characters of each line in find_and_hash_each_line(), and
likewise lines_differ(). The hashing or comparison of the remaining part
of the line then proceeds as usual.

One subtle point: if both of the lines have fewer than N characters, the
lines are considered equal iff they have the same length. Usually, the
type of file you would use this option with will have a fixed-format
prefix (which is the part to be ignored), and a line missing this prefix
is generally an indication of a formatting error. So a line with the
prefix but no further content should NOT match an empty line or a line
with a truncated prefix; but we still want two empty lines to match each
other.

For example, with --ignore-initial=10:

These lines match:
[22:47:25] hello
[23:17:24] hello

These lines don't match:
[22:47:25] hello
[23:17:24]

Nor do these:
[22:47:]
[23:17:24]

But these do:
[NOCLOCK]
[CLKFAIL]

Hope this looks useful!
.Dave.

-----------------------
diff --git a/src/diff.c b/src/diff.c
index 50d0365..eccce21 100644
--- a/src/diff.c
+++ b/src/diff.c
@@ -121,6 +121,7 @@ enum
   NO_IGNORE_FILE_NAME_CASE_OPTION,
   NORMAL_OPTION,
   SDIFF_MERGE_ASSIST_OPTION,
+  SKIP_INITIAL_OPTION,
   STRIP_TRAILING_CR_OPTION,
   SUPPRESS_BLANK_EMPTY_OPTION,
   SUPPRESS_COMMON_LINES_OPTION,
@@ -173,6 +174,7 @@ static struct option const longopts[] =
   {"ignore-blank-lines", 0, 0, 'B'},
   {"ignore-case", 0, 0, 'i'},
   {"ignore-file-name-case", 0, 0, IGNORE_FILE_NAME_CASE_OPTION},
+  {"ignore-initial", 1, 0, SKIP_INITIAL_OPTION},
   {"ignore-matching-lines", 1, 0, 'I'},
   {"ignore-space-change", 0, 0, 'b'},
   {"ignore-tab-expansion", 0, 0, 'E'},
@@ -580,6 +582,18 @@ main (int argc, char **argv)
          sdiff_merge_assist = true;
          break;

+       case SKIP_INITIAL_OPTION:
+         numval = strtoumax (optarg, &numend, 10);
+         if (! (0 < numval && numval <= SIZE_MAX) || *numend)
+           try_help ("invalid initial skip '%s'", optarg);
+         if (initial_skip != numval)
+           {
+             if (initial_skip)
+               fatal ("conflicting initial skip options");
+             initial_skip = numval;
+           }
+         break;
+
        case STRIP_TRAILING_CR_OPTION:
          strip_trailing_cr = true;
          break;
@@ -724,7 +738,8 @@ main (int argc, char **argv)
   files_can_be_treated_as_binary =
     (brief & binary
      & ~ (ignore_blank_lines | ignore_case | strip_trailing_cr
-         | (ignore_regexp_list.regexps || ignore_white_space)));
+         | (ignore_regexp_list.regexps || ignore_white_space
+                 || initial_skip)));

   switch_string = option_list (argv + 1, optind - 1);

@@ -895,6 +910,7 @@ static char const * const option_help_msgid[] = {
   N_("-w, --ignore-all-space          ignore all white space"),
   N_("-B, --ignore-blank-lines        ignore changes where lines are
all blank"),
   N_("-I, --ignore-matching-lines=RE  ignore changes where all lines
match RE"),
+  N_("    --ignore-initial=SKIP       ignore the initial SKIP
characters of each line"),
   "",
   N_("-a, --text                      treat all files as text"),
   N_("    --strip-trailing-cr         strip trailing carriage return on
input"),
diff --git a/src/diff.h b/src/diff.h
index e9f0471..b638a3f 100644
--- a/src/diff.h
+++ b/src/diff.h
@@ -125,6 +125,9 @@ XTERN enum DIFF_white_space ignore_white_space;
 /* Ignore changes that affect only blank lines (-B).  */

+/* Skip this many initial characters on each line */
+XTERN size_t initial_skip;
+
 /* Files can be compared byte-by-byte, as if they were binary.
    This depends on various options.  */
 XTERN bool files_can_be_treated_as_binary;
diff --git a/src/io.c b/src/io.c
index 463ee35..7e15996 100644
--- a/src/io.c
+++ b/src/io.c
@@ -232,13 +232,18 @@ find_and_hash_each_line (struct file_data *current)
   bool diff_length_compare_anyway =
     ig_white_space != IGNORE_NO_WHITE_SPACE;
   bool same_length_diff_contents_compare_anyway =
-    diff_length_compare_anyway | ig_case;
+    diff_length_compare_anyway | ig_case || initial_skip != 0;

   while (p < suffix_begin)
     {
       char const *ip = p;
       hash_value h = 0;
       unsigned char c;
+      size_t skip = initial_skip;
+
+      while (skip--)
+       if ((c = *p++) == '\n')
+         goto hashing_done;

       /* Hash this line until we find a newline.  */
       switch (ig_white_space)
diff --git a/src/util.c b/src/util.c
index 016057d..0acba06 100644
--- a/src/util.c
+++ b/src/util.c
@@ -413,6 +413,16 @@ lines_differ (char const *s1, char const *s2)
   register char const *t1 = s1;
   register char const *t2 = s2;
   size_t column = 0;
+  size_t skip = initial_skip;
+
+  while (skip--)
+    {
+      register unsigned char c1 = *t1++;
+      register unsigned char c2 = *t2++;
+
+      if (c1 == '\n' || c2 == '\n')
+       return c1 != c2;
+    }

   while (1)
     {






reply via email to

[Prev in Thread] Current Thread [Next in Thread]