From fa5a086c05f9e943fcddff3cb0bd2528f0034f8c Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 12 Sep 2016 23:19:13 +0900 Subject: [PATCH] gawk: use dfa matcher for regex with anchor * awk.h (struct Regexp): Remove member has_anchor. All uses removed. * re.c (make_regexp, research): Use dfa matcher for regex with anchor. * test/Makefile.am: Add new test. * test/anchor.awk: New test-related files. * test/anchor.in: * test/anchor.ok: --- awk.h | 1 - re.c | 10 ++-------- test/Makefile.am | 3 +++ test/anchor.awk | 33 +++++++++++++++++++++++++++++++++ test/anchor.in | 3 +++ test/anchor.ok | 6 ++++++ 6 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 test/anchor.awk create mode 100644 test/anchor.in create mode 100644 test/anchor.ok diff --git a/awk.h b/awk.h index 2c40163..2353700 100644 --- a/awk.h +++ b/awk.h @@ -210,7 +210,6 @@ typedef struct Regexp { struct re_pattern_buffer pat; struct re_registers regs; struct dfa *dfareg; - bool has_anchor; /* re has anchors which dfa avoids */ bool non_empty; /* for use in fpat_parse_field */ bool has_meta; /* re has meta chars so (probably) isn't simple string */ bool maybe_long; /* re has meta chars that can match long text */ diff --git a/re.c b/re.c index 69cc50e..167a265 100644 --- a/re.c +++ b/re.c @@ -49,7 +49,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) int c, c2; static bool first = true; static bool no_dfa = false; - bool has_anchor = false; reg_syntax_t dfa_syn; int i; @@ -160,9 +159,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) } /* switch */ } else { c = *src; - if (c == '^' || c == '$') - has_anchor = true; - *dest++ = *src++; /* not '\\' */ } if (gawk_mb_cur_max > 1 && is_multibyte) @@ -228,11 +224,10 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) if (dfa && ! no_dfa) { rp->dfareg = dfaalloc(); dfasyntax(rp->dfareg, & localeinfo, dfa_syn, - ignorecase ? 
DFA_CASE_FOLD : 0); + (ignorecase ? DFA_CASE_FOLD : 0) | DFA_ANCHOR); dfacomp(buf, len, rp->dfareg, true); } else rp->dfareg = NULL; - rp->has_anchor = has_anchor; /* Additional flags that help with RS as regexp. */ for (i = 0; i < len; i++) { @@ -287,7 +282,7 @@ research(Regexp *rp, char *str, int start, ret = dfaexec(superset, str+start, str+start+len, true, NULL, NULL); - if (ret && ((! need_start && ! rp->has_anchor) + if (ret && (! need_start || (! superset && dfaisfast(rp->dfareg)))) ret = dfaexec(rp->dfareg, str+start, str+start+len, true, NULL, &try_backref); @@ -298,7 +293,6 @@ research(Regexp *rp, char *str, int start, || start != 0 || no_bol || need_start - || rp->has_anchor || try_backref) { /* * Passing NULL as last arg speeds up search for cases diff --git a/test/Makefile.am b/test/Makefile.am index a0b4f52..17e1bfc 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -47,6 +47,9 @@ EXTRA_DIST = \ anchgsub.awk \ anchgsub.in \ anchgsub.ok \ + anchor.awk \ + anchor.in \ + anchor.ok \ argarray.awk \ argarray.in \ argarray.ok \ diff --git a/test/anchor.awk b/test/anchor.awk new file mode 100644 index 0000000..56f4756 --- /dev/null +++ b/test/anchor.awk @@ -0,0 +1,33 @@ +BEGIN { RS = "" } + +{ + if (/^A/) + print "ok" + else + print "not ok" + + if (/B$/) + print "not ok" + else + print "ok" + + if (/^C/) + print "not ok" + else + print "ok" + + if (/D$/) + print "not ok" + else + print "ok" + + if (/^E/) + print "not ok" + else + print "ok" + + if (/F$/) + print "ok" + else + print "not ok" +} diff --git a/test/anchor.in b/test/anchor.in new file mode 100644 index 0000000..a97e985 --- /dev/null +++ b/test/anchor.in @@ -0,0 +1,3 @@ +A line1 B +C line2 D +E line3 F diff --git a/test/anchor.ok b/test/anchor.ok new file mode 100644 index 0000000..7780b88 --- /dev/null +++ b/test/anchor.ok @@ -0,0 +1,6 @@ +ok +ok +ok +ok +ok +ok -- 1.7.1