From ec2fcd6e4386c7e1ce34883f643cdff71552a9d6 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 11 Aug 2016 11:53:24 +0900 Subject: [PATCH 2/2] dfa: support not newline_anchor of regex * src/dfa.c (struct dfa): Add member sbit, letters and newline. (sbit, letters, newline): Remove global vars. (newline_anchor): Add new global var. (char_context): Define context for not newline_anchor. (dfasyntax): Add argument newline_anchor. Update prototype and all callers. And remove checking context. (lex): Use cached values to check whether each character is letter or not. (charclass_context): Avoid context from hard-coded for EOL byte (dfastate): Use cached values to check whether each character is newline, letter or none. letter or not. (dfaexec_main): Define transition after found newline in input and accepted condition for not newline_anchor. (dfainit): Add checking context. --- src/dfa.c | 118 +++++++++++++++++++++++++++++-------------------- src/dfa.h | 2 +- src/dfasearch.c | 2 +- tests/dfa-match-aux.c | 2 +- 4 files changed, 73 insertions(+), 51 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 59bb3bc..7db96e1 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -316,6 +316,11 @@ struct mb_char_classes /* A compiled regular expression. */ struct dfa { + /* Fields filled by the initializer. */ + int sbit[NOTCHAR]; /* Cache of char-context values. */ + charclass letters; /* Set of characters considered letters. */ + charclass newline; /* Set of characters that are newline. */ + /* Fields filled by the scanner. */ charclass *charclasses; /* Array of character sets for CSET tokens. */ size_t cindex; /* Index for adding new charclasses. */ @@ -661,18 +666,12 @@ static bool case_fold; /* End-of-line byte in data. */ static unsigned char eolbyte; -/* Cache of char-context values. */ -static int sbit[NOTCHAR]; - /* If never_trail[B], the byte B cannot be a non-initial byte in a multibyte character. */ static bool never_trail[NOTCHAR]; -/* Set of characters considered letters. */ -static charclass letters; - -/* Set of characters that are newline. */ -static charclass newline; +/* Whether ^ and $ match newline or not. */ +static bool newline_anchor; static bool unibyte_word_constituent (unsigned char c) @@ -683,7 +682,7 @@ unibyte_word_constituent (unsigned char c) static int char_context (unsigned char c) { - if (c == eolbyte) + if (c == eolbyte && newline_anchor) return CTX_NEWLINE; if (unibyte_word_constituent (c)) return CTX_LETTER; @@ -692,13 +691,14 @@ char_context (unsigned char c) /* Entry point to set syntax options. */ void -dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol) +dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol, bool anchor) { int i; syntax_bits_set = true; syntax_bits = bits; case_fold = fold; eolbyte = eol; + newline_anchor = anchor; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { @@ -707,23 +707,6 @@ dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol) mbstate_t s = { 0 }; wchar_t wc; mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - - /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */ - sbit[uc] = char_context (uc); - switch (sbit[uc]) - { - case CTX_LETTER: - setbit (uc, letters); - break; - case CTX_NEWLINE: - setbit (uc, newline); - break; - } - - /* POSIX requires that the five bytes in "\n\r./" (including the - terminating NUL) cannot occur inside a multibyte character. */ - never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80 - : strchr ("\n\r./", uc) != NULL); } } @@ -1486,7 +1469,7 @@ lex (void) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (dfa->sbit[c2] == CTX_LETTER) setbit (c2, ccl); if (c == 'W') notset (ccl); @@ -2216,19 +2199,18 @@ epsclosure (position_set *s, struct dfa const *d, char *visited) character included in C. */ static int -charclass_context (charclass c) +charclass_context (struct dfa *d, charclass c) { int context = 0; unsigned int j; - if (tstbit (eolbyte, c)) - context |= CTX_NEWLINE; - for (j = 0; j < CHARCLASS_WORDS; ++j) { - if (c[j] & letters[j]) + if (c[j] & d->newline[j]) + context |= CTX_NEWLINE; + if (c[j] & d->letters[j]) context |= CTX_LETTER; - if (c[j] & ~(letters[j] | newline[j])) + if (c[j] & ~(d->letters[j] | d->newline[j])) context |= CTX_NONE; } @@ -2623,15 +2605,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, CTX_NEWLINE)) for (j = 0; j < CHARCLASS_WORDS; ++j) - matches[j] &= ~newline[j]; + matches[j] &= ~d->newline[j]; if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, CTX_LETTER)) for (j = 0; j < CHARCLASS_WORDS; ++j) - matches[j] &= ~letters[j]; + matches[j] &= ~d->letters[j]; if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, CTX_NONE)) for (j = 0; j < CHARCLASS_WORDS; ++j) - matches[j] &= letters[j] | newline[j]; + matches[j] &= d->letters[j] | d->newline[j]; /* If there are no characters left, there's no point in going on. */ for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j) @@ -2736,8 +2718,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state; for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; - trans[eolbyte] = state_newline; + trans[i] = d->sbit[i] == CTX_LETTER ? state_letter : state; + if (d->sbit[eolbyte] == CTX_NEWLINE) + trans[eolbyte] = state_newline; } else for (i = 0; i < NOTCHAR; ++i) @@ -2793,7 +2776,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) } /* Find out if the new state will want any context information. */ - possible_contexts = charclass_context (labels[i]); + possible_contexts = charclass_context (d, labels[i]); separate_contexts = state_separate_contexts (&follows); /* Find the state(s) corresponding to the union of the follows. */ @@ -2840,12 +2823,21 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) { int c = j * CHARCLASS_WORD_BITS + k; - if (c == eolbyte) - trans[c] = state_newline; - else if (unibyte_word_constituent (c)) - trans[c] = state_letter; - else if (c < NOTCHAR) - trans[c] = state; + if (c >= NOTCHAR) + break; + + switch (d->sbit[c]) + { + case CTX_NEWLINE: + trans[c] = state_newline; + break; + case CTX_LETTER: + trans[c] = state_letter; + break; + default: + trans[c] = state; + break; + } } } @@ -3276,11 +3268,17 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, nlcount++; mbp = p; - s = allow_nl ? d->newlines[s1] : 0; + s = (allow_nl ? d->newlines[s1] + : (d->sbit[eol] == CTX_NEWLINE ? 0 + : (d->sbit[eol] == CTX_LETTER ? d->min_trcount - 1 + : d->initstate_notbol))); } else if (d->fails[s]) { - if (d->success[s] & sbit[*p]) + if (d->success[s] & d->sbit[*p] + || ((char *) p == end + && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, + *d))) goto done; if (multibyte && s < d->min_trcount) @@ -3397,10 +3395,34 @@ free_mbdata (struct dfa *d) static void dfainit (struct dfa *d) { + int i; + memset (d, 0, sizeof *d); d->multibyte = MB_CUR_MAX > 1; d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; d->fast = !d->multibyte; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + + /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */ + d->sbit[uc] = char_context (uc); + switch (d->sbit[uc]) + { + case CTX_LETTER: + setbit (uc, d->letters); + break; + case CTX_NEWLINE: + setbit (uc, d->newline); + break; + } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. */ + never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); + } } /* Return true if every construct in D is supported by this DFA matcher. */ diff --git a/src/dfa.h b/src/dfa.h index 60da0e4..0e259bf 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -53,7 +53,7 @@ extern void dfamustfree (struct dfamust *); /* dfasyntax() takes three arguments; the first sets the syntax bits described earlier in this file, the second sets the case-folding flag, and the third specifies the line terminator. */ -extern void dfasyntax (reg_syntax_t, bool, unsigned char); +extern void dfasyntax (reg_syntax_t, bool, unsigned char, bool); /* Compile the given string of the given length into the given struct dfa. Final argument is a flag specifying whether to build a searching or an diff --git a/src/dfasearch.c b/src/dfasearch.c index 9a523c8..17d6a74 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -128,7 +128,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) if (match_icase) syntax_bits |= RE_ICASE; re_set_syntax (syntax_bits); - dfasyntax (syntax_bits, match_icase, eolbyte); + dfasyntax (syntax_bits, match_icase, eolbyte, true); /* For GNU regex, pass the patterns separately to detect errors like "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c index af933ff..f8db72c 100644 --- a/tests/dfa-match-aux.c +++ b/tests/dfa-match-aux.c @@ -54,7 +54,7 @@ main (int argc, char **argv) setlocale (LC_ALL, ""); - dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n'); + dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n', 1); dfa = dfaalloc (); dfacomp (argv[1], strlen (argv[1]), dfa, 0); -- 1.7.1