From b94e2adefb22a5394324ec3681521214c51dc638 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 10 Aug 2015 20:18:23 +0900 Subject: [PATCH] dfa: simplify for non-POSIX locales Now dfa is not support range, collating element, equivalent class in non-POSIX locales. We can do dfa more simply by removing codes for them. * src/dfa.c (enum status_transit_state): Remove function. (transit_state_singlebyte): Update arguments and now returns next state instead of status_transit_state. (match_anychar): Remove function. (check_matching_with_multibyte_ops): Remove function. (transit_state_consume_1char): Remove function. (transit_state): Simplify it. (dfaexec_main): Now transit_state is called only when next character matches with ANYCHAR. (State_transition): Remove macro. --- src/dfa.c | 303 +++++++++++++++----------------------------------------------- 1 file changed, 73 insertions(+), 230 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index ac5129b..b5c31f7 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -2934,132 +2934,63 @@ build_state (state_num s, struct dfa *d) /* Multibyte character handling sub-routines for dfaexec. */ -/* Return values of transit_state_singlebyte, and - transit_state_consume_1char. */ -typedef enum -{ - TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ - TRANSIT_STATE_DONE, /* State transition has finished. */ - TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ -} status_transit_state; - /* Consume a single byte and transit state from 's' to '*next_state'. This function is almost same as the state transition routin in dfaexec. But state transition is done just once, otherwise matching succeed or reach the end of the buffer. */ -static status_transit_state -transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p, - state_num * next_state) +static state_num +transit_state_singlebyte (struct dfa *d, state_num const s, + unsigned char const **pp) { state_num *t; - state_num works = s; - status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; - - while (rval == TRANSIT_STATE_IN_PROGRESS) + if (**pp == eolbyte) { - if ((t = d->trans[works]) != NULL) - { - works = t[*p]; - rval = TRANSIT_STATE_DONE; - if (works < 0) - works = 0; - } - else if (works < 0) - works = 0; - else if (d->fails[works]) - { - works = d->fails[works][*p]; - rval = TRANSIT_STATE_DONE; - } - else - { - build_state (works, d); - } - } - *next_state = works; - return rval; -} - -/* Match a "." against the current context. Return the length of the - match, in bytes. POS is the position of the ".". */ -static int -match_anychar (struct dfa *d, state_num s, position pos, - wint_t wc, size_t mbclen) -{ - int context; + /* S is always an initial state in transit_state in order that the + newline is the single. When transit_state is called, the + transition table for the state must have been built already. */ + assert (d->trans[s] != NULL || d->fails[s] != NULL); - /* Check syntax bits. */ - if (wc == (wchar_t) '\n') - { - if (!(syntax_bits & RE_DOT_NEWLINE)) - return 0; - } - else if (wc == (wchar_t) '\0') - { - if (syntax_bits & RE_DOT_NOT_NULL) - return 0; + ++*pp; + return d->newlines[s]; } - else if (wc == WEOF) - return 0; - - context = wchar_context (wc); - if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context)) - return 0; - - return mbclen; -} - -/* Check whether each of 'd->states[s].mbps.elem' can match. Then return the - array which corresponds to 'd->states[s].mbps.elem'; each element of the - array contains the number of bytes with which the element can match. - - The caller MUST free the array which this function return. */ -static int * -check_matching_with_multibyte_ops (struct dfa *d, state_num s, - char const *p, wint_t wc, size_t mbclen) -{ - size_t i; - int *rarray; - rarray = d->mb_match_lens; - for (i = 0; i < d->states[s].mbps.nelem; ++i) + if (d->trans[s] != NULL) + t = d->trans[s]; + else if (d->fails[s] != NULL) + t = d->fails[s]; + else { - position pos = d->states[s].mbps.elems[i]; - switch (d->tokens[pos.index]) - { - case ANYCHAR: - rarray[i] = match_anychar (d, s, pos, wc, mbclen); - break; - default: - break; /* cannot happen. */ - } + build_state (s, d); + if (d->trans[s]) + t = d->trans[s]; + else if (d->fails[s]) + t = d->fails[s]; + else + abort (); } - return rarray; -} - -/* Consume a single character and enumerate all of the positions which can - be the next position from the state 's'. - 'match_lens' is the input. It can be NULL, but it can also be the output - of check_matching_with_multibyte_ops for optimization. + return t[*(*pp)++]; +} - 'mbclen' and 'pps' are the output. 'mbclen' is the length of the - character consumed, and 'pps' is the set this function enumerates. */ -static status_transit_state -transit_state_consume_1char (struct dfa *d, state_num s, - unsigned char const **pp, - wint_t wc, size_t mbclen, - int *match_lens) +/* Transit state from s, then return new state and update the pointer of the + buffer. This function is for some operator which can match with a multi- + byte character or a collating element (which may be multi characters). */ +static state_num +transit_state (struct dfa *d, state_num s, unsigned char const **pp, + unsigned char const *end) { + state_num s1, s2; + int mbclen; /* The length of current input multibyte character. */ + wint_t wc; size_t i, j; int k; - state_num s1, s2; - status_transit_state rs = TRANSIT_STATE_DONE; - if (! match_lens && d->states[s].mbps.nelem != 0) - match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, - wc, mbclen); + /* Note: caller must free the return value of this function. */ + mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); + + /* This state has some operators which can match a multibyte character. */ + d->mb_follows.nelem = 0; /* Calculate the state which can be reached from the state 's' by consuming 'mbclen' single bytes from the buffer. */ @@ -3067,7 +2998,7 @@ transit_state_consume_1char (struct dfa *d, state_num s, for (k = 0; k < mbclen; k++) { s2 = s1; - rs = transit_state_singlebyte (d, s2, (*pp)++, &s1); + s1 = transit_state_singlebyte (d, s2, pp); } copy (&d->states[s1].elems, &d->mb_follows); @@ -3075,94 +3006,15 @@ transit_state_consume_1char (struct dfa *d, state_num s, a single character. */ for (i = 0; i < d->states[s].mbps.nelem; i++) { - if (match_lens[i] == mbclen) - for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; - j++) - insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], - &d->mb_follows); - } - - /* FIXME: this return value is always ignored. */ - return rs; -} - -/* Transit state from s, then return new state and update the pointer of the - buffer. This function is for some operator which can match with a multi- - byte character or a collating element (which may be multi characters). */ -static state_num -transit_state (struct dfa *d, state_num s, unsigned char const **pp, - unsigned char const *end) -{ - state_num s1; - int mbclen; /* The length of current input multibyte character. */ - int maxlen = 0; - size_t i, j; - int *match_lens = NULL; - size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */ - unsigned char const *p1 = *pp; - wint_t wc; - - if (nelem > 0) - /* This state has (a) multibyte operator(s). - We check whether each of them can match or not. */ - { - /* Note: caller must free the return value of this function. */ - mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, - wc, mbclen); - - for (i = 0; i < nelem; i++) - /* Search the operator which match the longest string, - in this state. */ - { - if (match_lens[i] > maxlen) - maxlen = match_lens[i]; - } - } - - if (nelem == 0 || maxlen == 0) - /* This state has no multibyte operator which can match. - We need to check only one single byte character. */ - { - status_transit_state rs; - rs = transit_state_singlebyte (d, s, *pp, &s1); - - /* We must update the pointer if state transition succeeded. */ - if (rs == TRANSIT_STATE_DONE) - ++*pp; - - return s1; + for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; + j++) + insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], + &d->mb_follows); } - /* This state has some operators which can match a multibyte character. */ - d->mb_follows.nelem = 0; - - /* 'maxlen' may be longer than the length of a character, because it may - not be a character but a (multi character) collating element. - We enumerate all of the positions which 's' can reach by consuming - 'maxlen' bytes. */ - transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens); - s1 = state_index (d, &d->mb_follows, wchar_context (wc)); realloc_trans_if_necessary (d, s1); - while (*pp - p1 < maxlen) - { - mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL); - - for (i = 0; i < nelem; i++) - { - if (match_lens[i] == *pp - p1) - for (j = 0; - j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) - insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j], - &d->mb_follows); - } - - s1 = state_index (d, &d->mb_follows, wchar_context (wc)); - realloc_trans_if_necessary (d, s1); - } return s1; } @@ -3292,44 +3144,22 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, } } - if (d->states[s].mbps.nelem == 0) + if (d->states[s].mbps.nelem == 0 || + (*p == eol && !(allow_nl && (syntax_bits & RE_DOT_NEWLINE))) + || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) + || (char *) p >= end) + /* If a input character does not match ANYCHAR, do it + like a single-byte character. */ + s = t[*p++]; + else { - s = t[*p++]; - continue; - } - - /* The following code is used twice. - Use a macro to avoid the risk that they diverge. */ -#define State_transition() \ - do { \ - /* Can match with a multibyte character (and multi-character \ - collating element). Transition table might be updated. */ \ - s = transit_state (d, s, &p, (unsigned char *) end); \ - \ - /* If previous character is newline after a transition \ - for ANYCHAR or MBCSET in non-UTF8 multibyte locales, \ - check whether current position is beyond the end of \ - the input buffer. Also, transit to initial state if \ - !ALLOW_NL, even if RE_DOT_NEWLINE is set. */ \ - if (p[-1] == eol) \ - { \ - if ((char *) p > end) \ - { \ - p = NULL; \ - goto done; \ - } \ - \ - nlcount++; \ - \ - if (!allow_nl) \ - s = 0; \ - } \ - \ - mbp = p; \ - trans = d->trans; \ - } while (0) + s = transit_state (d, s, &p, (unsigned char *) end); - State_transition(); + if (s >= 0 && p[-1] == eol) + nlcount++; + mbp = p; + trans = d->trans; + } } } else @@ -3378,10 +3208,23 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, goto done; s1 = s; - if (multibyte) - State_transition(); - else + if (!multibyte || d->states[s].mbps.nelem == 0 || + (*p == eol && !(allow_nl && (syntax_bits & RE_DOT_NEWLINE))) + || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL)) + || (char *) p >= end) + /* If a input character does not match ANYCHAR, do it + like a single-byte character. */ s = d->fails[s][*p++]; + else + { + s = transit_state (d, s, &p, (unsigned char *) end); + + if (s >= 0 && p[-1] == eol) + nlcount++; + + mbp = p; + trans = d->trans; + } } else { -- 2.4.6