>From 59454b1b76be5947b3d3a2c10307cf7ac1ef3dc6 Mon Sep 17 00:00:00 2001 From: Dima Kogan Date: Sun, 15 May 2022 18:05:58 -0700 Subject: [PATCH] Fixed incomplete and incorrect treatment of comments and trailing whitespace Addresses two related issues: - Comments that didn't block out a whole line weren't being properly ignored by -C. Lines such as 'bar 5#xxx' didn't ignore the '#xxx' as they were supposed to - With -W, trailing whitespace was treated as its own field This patch adds tests for the problematic behaviors, and fixes them --- src/text-lines.c | 30 +++++++++++++++++++----------- tests/datamash-tests-2.pl | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/src/text-lines.c b/src/text-lines.c index 2d4544b..831d793 100644 --- a/src/text-lines.c +++ b/src/text-lines.c @@ -91,21 +91,23 @@ line_record_reserve_fields (struct line_record_t* lr, const size_t n) } static void -line_record_parse_fields (struct line_record_t *lr, int field_delim) +line_record_parse_fields (struct line_record_t *lr, int field_delim, bool skip_comments) { size_t num_fields = 0; size_t pos = 0; const size_t buflen = line_record_length (lr); const char* fptr = line_record_buffer (lr); +#define IS_COMMENT (skip_comments && (*fptr == '#' || *fptr == ';')) + /* Move 'fptr' to point to the beginning of 'field' */ if (field_delim != TAB_WHITESPACE) { - while (buflen && pos<=buflen) + while (buflen && pos<=buflen && !IS_COMMENT) { /* scan buffer until next delimiter */ const char* field_beg = fptr; - while ( (posfields[num_fields].len = fptr - field_beg; ++num_fields; + if(IS_COMMENT) + pos = buflen; + /* Skip the delimiter */ ++pos; ++fptr; @@ -127,10 +132,10 @@ line_record_parse_fields (struct line_record_t *lr, int field_delim) { /* delimiter is white-space transition (multiple whitespaces are one delimiter) */ - while (posfields[num_fields].buf = field_beg; - lr->fields[num_fields].len = flen; - ++num_fields; + if(flen > 0) + { + line_record_reserve_fields (lr, num_fields); + lr->fields[num_fields].buf = field_beg; + lr->fields[num_fields].len = flen; + ++num_fields; + } } lr->num_fields = num_fields; } @@ -180,7 +188,7 @@ line_record_fread (struct /* in/out */ line_record_t* lr, } while (skip_comments && line_record_is_comment (lr)); - line_record_parse_fields (lr, in_tab); + line_record_parse_fields (lr, in_tab, skip_comments); return true; } diff --git a/tests/datamash-tests-2.pl b/tests/datamash-tests-2.pl index 4356fa3..27017fa 100755 --- a/tests/datamash-tests-2.pl +++ b/tests/datamash-tests-2.pl @@ -299,6 +299,21 @@ bar 5 ;baz 7 EOF +# attached comment +my $in_comments2=<<'EOF'; + #foo 3 +bar 5#xxx +;baz 7 +EOF + +# trailing whitespace and attached comment +my $in_comments3=<<'EOF'; + #foo 3 +bar 5# xxx +bbb 4 +;baz 7 +EOF + my $in_esc_ident=<<'EOF'; A_Chlor_T1h_r1-metaG B 9C -bar 1 2 3 4 @@ -589,6 +604,28 @@ my @Tests = "7 ;baz\n"}], ['sc4', '-C reverse', {IN_PIPE=>$in_comments}, {OUT=>"5 bar\n"}], + # attached comment + ['sc5', 'sum 2', {IN_PIPE=>$in_comments2}, {EXIT=>1}, {ERR=>"datamash: invalid numeric value in line 2 field 2: '5#xxx'\n"}], + ['sc6', '-C sum 2', {IN_PIPE=>$in_comments2}, {OUT=>"5\n"}], + ['sc7', 'reverse', {IN_PIPE=>$in_comments2}, + {OUT=>"3 #foo\n" . + "5#xxx bar\n" . + "7 ;baz\n"}], + ['sc8', '-C reverse', {IN_PIPE=>$in_comments2}, {OUT=>"5 bar\n"}], + + # Test -C/--skip-comments option. Need to make sure we ignore full-line + # comments and partial comments. And that we treat trailing whitespace + # properly + ['sc9', '-C -W sum 2', {IN_PIPE=>$in_comments3}, {OUT=>"9\n"}], + ['sc10', '-C -W reverse', {IN_PIPE=>$in_comments3}, + {OUT=> "5 bar\n" . + "4 bbb\n" }], + ['sc11', '-C -W transpose', {IN_PIPE=>$in_comments3}, + {OUT=> <<'EOF' +bar bbb +5 4 +EOF + }], # Bug in mode/antimode in 1.4 and earlier ['bug_mode1', 'mode 1', {IN_PIPE=>"-1"}, {OUT=>"-1\n"}], -- 2.34.1