bug-datamash
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] vnlog support


From: Dima Kogan
Subject: [PATCH] vnlog support
Date: Mon, 23 Dec 2019 06:32:15 -0000
User-agent: mu4e 1.2.0; emacs 27.0.50

Hi.

I maintain vnlog, a toolkit for manipulating tabular ASCII data:

  https://github.com/dkogan/vnlog

The cmdline tools are largely thin frontends around awk and GNU
coreutils. The capabilities are complementary with datamash, and it'd be
nice if datamash supported vnlog's data format. It already does 99% of
it, and I'm attaching a prototype patch (to the 1.4 stable release) that
adds the rest. The vnlog format:

- A whitespace-separated table of text

- Lines beginning with # are comments

- The first line that begins with a single # (not ## or #!) is a legend, naming each column

- Empty fields are reported as -

As you can see, this is very close to what datamash does already. Would
you be interested in adding this support to datamash? I imagine you would
want to rework the patch somewhat. It's not intrusive, so that shouldn't
take much effort. A trivial demo:


  $ (echo '## comment'; echo '# x y'; seq 5 | awk '{print $1, $1*$1}') | ./datamash -v sum y mean x
  # sum(y) mean(x)
  55 3


The patch handles almost everything except the "whitespace-delimited"
part. As far as I can tell, there currently isn't a way to make datamash
work with \s+ as a field separator: -t' ' treats a sequence of N spaces
as N field separators. I can write a patch for that if you're
interested. It would probably be good to support that regardless, since
that would match how awk does things.

Thanks.

Please Cc me when replying; I'm not subscribed to the list.

diff --git a/src/datamash.c b/src/datamash.c
index 4219c26..e2dbf2f 100644
--- a/src/datamash.c
+++ b/src/datamash.c
@@ -114,7 +114,7 @@ enum
   UNDOC_RMDUP_TEST
 };
 
-static char const short_options[] = "sfF:izg:t:HWR:C";
+static char const short_options[] = "sfF:izg:t:HWR:Cv";
 
 static struct option const long_options[] =
 {
@@ -127,6 +127,7 @@ static struct option const long_options[] =
   {"header-in", no_argument, NULL, INPUT_HEADER_OPTION},
   {"header-out", no_argument, NULL, OUTPUT_HEADER_OPTION},
   {"headers", no_argument, NULL, 'H'},
+  {"vnlog", no_argument, NULL, 'v'},
   {"full", no_argument, NULL, 'f'},
   {"filler", required_argument, NULL, 'F'},
   {"format", required_argument, NULL, CUSTOM_FORMAT_OPTION},
@@ -433,6 +434,9 @@ print_input_line (const struct line_record_t* lb)
 static void
 print_column_headers ()
 {
+  if ( vnlog )
+      printf ("# ");
+
   if (print_full_line)
     {
       /* Print the headers of all the input fields */
@@ -514,7 +518,9 @@ process_input_header (FILE *stream)
   struct line_record_t lr;
 
   line_record_init (&lr);
-  if (line_record_fread (&lr, stream, eolchar, skip_comments))
+
+  if ( (!vnlog && line_record_fread (&lr, stream, eolchar, skip_comments )) ||
+       ( vnlog && line_record_fread_vnlog_prologue (&lr, stream, eolchar )) )
     {
       build_input_line_headers (&lr, true);
       line_number++;
@@ -971,7 +977,8 @@ remove_dups_in_file ()
 
   if (input_header)
     {
-      if (line_record_fread (thisline, input_stream, eolchar, skip_comments))
+      if ( (!vnlog && line_record_fread (thisline, input_stream, eolchar, skip_comments )) ||
+           ( vnlog && line_record_fread_vnlog_prologue (thisline, input_stream, eolchar )) )
         {
           line_number++;
 
@@ -1157,6 +1164,15 @@ int main (int argc, char* argv[])
     {
       switch (optc)
         {
+        case 'v':
+          skip_comments        = true;
+          input_header         = output_header = true;
+          missing_field_filler = "-";
+          in_tab               = ' ';
+          out_tab              = ' ';
+          vnlog                = true;
+          break;
+
        case 'C':
          skip_comments = true;
          break;
diff --git a/src/text-lines.c b/src/text-lines.c
index dc9ea5f..c7aa2eb 100644
--- a/src/text-lines.c
+++ b/src/text-lines.c
@@ -34,6 +34,7 @@
 
 #include "text-options.h"
 #include "text-lines.h"
+#include "die.h"
 
 void
 line_record_init (struct line_record_t* lr)
@@ -91,12 +92,19 @@ line_record_reserve_fields (struct line_record_t* lr, const size_t n)
 }
 
 static void
-line_record_parse_fields (struct line_record_t *lr, int field_delim)
+line_record_parse_fields (/* The buffer. May or may not be the one in the
+                             following argument */
+                          const struct linebuffer* lbuf,
+
+                          /* Used ONLY for the fields. The buffer is picked up
+                             from the above argument */
+                          struct line_record_t *lr,
+                          int field_delim)
 {
   size_t num_fields = 0;
   size_t pos = 0;
-  const size_t buflen = line_record_length (lr);
-  const char* fptr = line_record_buffer (lr);
+  const size_t buflen = lbuf->length;
+  const char*  fptr   = lbuf->buffer;
 
   /* Move 'fptr' to point to the beginning of 'field' */
   if (field_delim != TAB_WHITESPACE)
@@ -157,33 +165,91 @@ line_record_parse_fields (struct line_record_t *lr, int field_delim)
 }
 
 
-static bool
-line_record_is_comment (const struct line_record_t* lr)
+// returns 0 if not a comment, 1 if a single comment, 2 if a double comment
+static int
+line_comment_count (const struct line_record_t* lr)
 {
   const char* pch = line_record_buffer (lr);
 
   /* Skip white space at beginning of line */
   size_t s = strspn (pch, " \t");
   /* First non-whitespace character */
-  char c = pch[s];
-  return (c=='#' || c==';');
+  const char* c = &pch[s];
+  if (!(c[0]=='#' || c[0]==';'))
+      // not any comment
+      return 0;
+  if(c[0] == '\0')
+      return 1;
+  if( c[0] == '#' && (c[1] == '#' || c[1] == '!') )
+      return 2;
+  if( c[0] == ';' && c[1] == ';')
+      return 2;
+  return 1;
 }
 
-bool
-line_record_fread (struct /* in/out */ line_record_t* lr,
-                  FILE *stream, char delimiter, bool skip_comments)
+static bool
+_line_record_fread (struct /* in/out */ line_record_t* lr,
+                    FILE *stream, char delimiter,
+                    bool skip_single_comments,
+                    bool vnlog_prologue)
 {
-  do {
+  while(1) {
     if (readlinebuffer_delim (&lr->lbuf, stream, delimiter) == 0)
       return false;
     linebuffer_nullify (&lr->lbuf);
-  } while (skip_comments && line_record_is_comment (lr));
+    int comment_count = line_comment_count (lr);
+    if( skip_single_comments && comment_count>=1)
+        continue;
+    if( vnlog_prologue )
+    {
+        // I skip double-comments
+        //
+        // I read single-commented lines that have anything following the single
+        // comment character. And I strip out the comment character
+        //
+        // I barf on anything else. No data before the header allowed
+        if( comment_count >= 2 )
+            continue;
+        if( comment_count == 1 )
+        {
+            // one comment. I need to strip the comment characters. Skip leading
+            // regex '^\s*#\s*'
+            const char* pch = line_record_buffer (lr);
+            size_t s = strspn (pch, " \t#");
+            struct linebuffer lbuf = lr->lbuf;
+            lbuf.buffer += s;
+            lbuf.length -= s;
+            if(lbuf.buffer[0] == '\0')
+                // empty comment line. ignore.
+                continue;
+            line_record_parse_fields (&lbuf, lr, in_tab);
+            return true;
+        }
+        // No comment. This is an illegal data line. Barf.
+        die (EXIT_FAILURE, 0, _("invalid vnlog data: received data line prior to the header: '%s'"),
+             line_record_buffer (lr));
 
+    }
+    break;
+  }
 
-  line_record_parse_fields (lr, in_tab);
+  line_record_parse_fields (&lr->lbuf, lr, in_tab);
   return true;
 }
 
+bool
+line_record_fread (struct /* in/out */ line_record_t* lr,
+                   FILE *stream, char delimiter, bool skip_comments)
+{
+    return _line_record_fread(lr, stream, delimiter, skip_comments, false);
+}
+bool
+line_record_fread_vnlog_prologue (struct /* in/out */ line_record_t* lr,
+                                  FILE *stream, char delimiter)
+{
+    return _line_record_fread(lr, stream, delimiter, false, true);
+}
+
 void
 line_record_free (struct line_record_t* lr)
 {
diff --git a/src/text-lines.h b/src/text-lines.h
index e318293..9d98c10 100644
--- a/src/text-lines.h
+++ b/src/text-lines.h
@@ -83,6 +83,9 @@ line_record_init (struct line_record_t* lr);
 bool
 line_record_fread (struct /* in/out */ line_record_t* lr,
                    FILE *stream, char delimiter, bool skip_comments);
+bool
+line_record_fread_vnlog_prologue (struct /* in/out */ line_record_t* lr,
+                                  FILE *stream, char delimiter);
 
 void
 line_record_free (struct line_record_t* lr);
diff --git a/src/text-options.c b/src/text-options.c
index 3e811b7..a775db1 100644
--- a/src/text-options.c
+++ b/src/text-options.c
@@ -68,6 +68,8 @@ char* missing_field_filler = "N/A";
    followed by '#' or ';'. See line_record_is_comment().  */
 bool skip_comments = false;
 
+bool vnlog = false;
+
 #define UCHAR_LIM (UCHAR_MAX + 1)
 bool blanks[UCHAR_LIM];
 
diff --git a/src/text-options.h b/src/text-options.h
index dc275cd..6cde4aa 100644
--- a/src/text-options.h
+++ b/src/text-options.h
@@ -66,6 +66,8 @@ extern char* missing_field_filler;
    followed by '#' or ';'. See line_record_is_comment().  */
 extern bool skip_comments;
 
+extern bool vnlog;
+
 #define UCHAR_LIM (UCHAR_MAX + 1)
 extern bool blanks[UCHAR_LIM];
 

reply via email to

[Prev in Thread] Current Thread [Next in Thread]