From 2ba4bc63bb7cf5f573b9aff929cf1e5cb045d683 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Fri, 2 Sep 2016 15:27:12 -0700
Subject: [PATCH] dfa: new option for anchored searches

This follows up on a suggestion by Norihiro Tanaka (Bug#24262).
* src/dfa.c (struct regex_syntax): New member 'anchor'.
(char_context): Use it.
(dfasyntax): Change signature to specify it, along with the old
FOLD and EOL args, as a single DFAOPTS arg. All uses changed.
* src/dfa.h (DFA_ANCHOR, DFA_CASE_FOLD, DFA_EOL_NUL):
New constants for dfasyntax new last arg.
---
 src/dfa.c             | 13 +++++++++----
 src/dfa.h             | 22 ++++++++++++++++++----
 src/dfasearch.c       |  4 +++-
 tests/dfa-match-aux.c |  2 +-
 4 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 4cbaa75..ff3721c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -335,6 +335,10 @@ struct regex_syntax
   /* Flag for case-folding letters into sets. */
   bool case_fold;
 
+  /* True if ^ and $ match only the start and end of data, and do not match
+     end-of-line within data. */
+  bool anchor;
+
   /* End-of-line byte in data. */
   unsigned char eolbyte;
 
@@ -754,7 +758,7 @@ unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 static int
 char_context (struct dfa const *dfa, unsigned char c)
 {
-  if (c == dfa->syntax.eolbyte)
+  if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
     return CTX_NEWLINE;
   if (unibyte_word_constituent (dfa, c))
     return CTX_LETTER;
@@ -3987,7 +3991,7 @@ dfaalloc (void)
 /* Initialize DFA. */
 void
 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
-           reg_syntax_t bits, bool fold, unsigned char eol)
+           reg_syntax_t bits, int dfaopts)
 {
   int i;
   memset (dfa, 0, offsetof (struct dfa, dfaexec));
@@ -4000,9 +4004,10 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
   dfa->canychar = -1;
   dfa->lex.cur_mb_len = 1;
   dfa->syntax.syntax_bits_set = true;
+  dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0;
+  dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
+  dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
   dfa->syntax.syntax_bits = bits;
-  dfa->syntax.case_fold = fold;
-  dfa->syntax.eolbyte = eol;
 
   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
     {
diff --git a/src/dfa.h b/src/dfa.h
index 31baf7a..b8c44cc 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -46,15 +46,29 @@ struct dfa;
    calling dfafree() on it. */
 extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
 
+/* DFA options that can be ORed together, for dfasyntax's 4th arg. */
+enum
+  {
+    /* ^ and $ match only the start and end of data, and do not match
+       end-of-line within data. This is always false for grep, but
+       possibly true for other apps. */
+    DFA_ANCHOR = 1 << 0,
+
+    /* Ignore case while matching. */
+    DFA_CASE_FOLD = 1 << 1,
+
+    /* '\0' in data is end-of-line, instead of the traditional '\n'. */
+    DFA_EOL_NUL = 1 << 2
+  };
+
 /* Initialize or reinitialize a DFA. This must be called before
    any of the routines below. The arguments are:
    1. The DFA to operate on.
    2. Information about the current locale.
-   3. The syntax bits described earlier in this file.
-   4. The case-folding flag.
-   5. The line terminator. */
+   3. Syntax bits described in regex.h.
+   4. Additional DFA options described above. */
 extern void dfasyntax (struct dfa *, struct localeinfo const *,
-                       reg_syntax_t, bool, unsigned char);
+                       reg_syntax_t, int);
 
 /* Build and return the struct dfamust from the given struct dfa. */
 extern struct dfamust *dfamust (struct dfa const *);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 0838e1f..96be58f 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -123,7 +123,9 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
   if (match_icase)
     syntax_bits |= RE_ICASE;
   re_set_syntax (syntax_bits);
-  dfasyntax (dfa, &localeinfo, syntax_bits, match_icase, eolbyte);
+  int dfaopts = ((match_icase ? DFA_CASE_FOLD : 0)
+                 | (eolbyte ? 0 : DFA_EOL_NUL));
+  dfasyntax (dfa, &localeinfo, syntax_bits, dfaopts);
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c
index e001b7d..070089c 100644
--- a/tests/dfa-match-aux.c
+++ b/tests/dfa-match-aux.c
@@ -58,7 +58,7 @@ main (int argc, char **argv)
   init_localeinfo (&localeinfo);
 
   dfa = dfaalloc ();
-  dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
+  dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0);
   dfacomp (argv[1], strlen (argv[1]), dfa, 0);
 
   beg = argv[2];
-- 
2.7.4