>From 5eae16cfb8d40ca75691605e8f2f4a40eab70c7a Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 18 Sep 2018 15:25:51 -0700 Subject: [PATCH 1/4] dfa: reorder enum for efficiency * lib/dfa.c (END): Now -1 again. Reorder other elements of the enumeration to make it easier for GCC to generate efficient code by using fewer comparisons to check for ranges of values. (atom): Take advantage of the reordering. --- ChangeLog | 9 ++++ lib/dfa.c | 130 +++++++++++++++++++++++++++++------------------------- 2 files changed, 78 insertions(+), 61 deletions(-) diff --git a/ChangeLog b/ChangeLog index 59581e3c5..64926503a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2018-09-18 Paul Eggert + + dfa: reorder enum for efficiency + * lib/dfa.c (END): Now -1 again. Reorder other elements + of the enumeration to make it easier for GCC to generate + efficient code by using fewer comparisons to check for + ranges of values. + (atom): Take advantage of the reordering. + 2018-09-18 Norihiro Tanaka dfa: optimize alternation in NFA diff --git a/lib/dfa.c b/lib/dfa.c index 3991112ef..849645154 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -223,50 +223,24 @@ typedef ptrdiff_t state_num; /* Predefined token values. */ enum { - END = -2, /* END is a terminal symbol that matches the + END = -1, /* END is a terminal symbol that matches the end of input; any value of END or less in the parse tree is such a symbol. Accepting states of the DFA are those that would have - a transition on END. */ - - BEG = -1, /* BEG is a beginning symbol that matches the - biginning of input. */ + a transition on END. This is -1, not some + more-negative value, to tweak the speed of + comparisons to END. */ /* Ordinary character values are terminal symbols that match themselves. */ + /* CSET must come last in the following list of special tokens. Otherwise, + the list order matters only for performance. Related special tokens + should have nearby values so that code like (t == ANYCHAR || t == MBCSET + || CSET <= t) can be done with a single machine-level comparison. */ + EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches the empty string. */ - BACKREF, /* BACKREF is generated by \ - or by any other construct that - is not completely handled. If the scanner - detects a transition on backref, it returns - a kind of "semi-success" indicating that - the match will have to be verified with - a backtracking matcher. */ - - BEGLINE, /* BEGLINE is a terminal symbol that matches - the empty string at the beginning of a - line. */ - - ENDLINE, /* ENDLINE is a terminal symbol that matches - the empty string at the end of a line. */ - - BEGWORD, /* BEGWORD is a terminal symbol that matches - the empty string at the beginning of a - word. */ - - ENDWORD, /* ENDWORD is a terminal symbol that matches - the empty string at the end of a word. */ - - LIMWORD, /* LIMWORD is a terminal symbol that matches - the empty string at the beginning or the - end of a word. */ - - NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that - matches the empty string not at - the beginning or end of a word. */ - QMARK, /* QMARK is an operator of one argument that matches zero or one occurrences of its argument. */ @@ -296,16 +270,49 @@ enum RPAREN, /* RPAREN never appears in the parse tree. */ + WCHAR, /* Only returned by lex. wctok contains + the wide character representation. */ + ANYCHAR, /* ANYCHAR is a terminal symbol that matches a valid multibyte (or single byte) character. It is used only if MB_CUR_MAX > 1. */ + BEG, /* BEG is an initial symbol that matches the + beginning of input. */ + + BEGLINE, /* BEGLINE is a terminal symbol that matches + the empty string at the beginning of a + line. */ + + ENDLINE, /* ENDLINE is a terminal symbol that matches + the empty string at the end of a line. */ + + BEGWORD, /* BEGWORD is a terminal symbol that matches + the empty string at the beginning of a + word. */ + + ENDWORD, /* ENDWORD is a terminal symbol that matches + the empty string at the end of a word. */ + + LIMWORD, /* LIMWORD is a terminal symbol that matches + the empty string at the beginning or the + end of a word. */ + + NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that + matches the empty string not at + the beginning or end of a word. */ + + BACKREF, /* BACKREF is generated by \ + or by any other construct that + is not completely handled. If the scanner + detects a transition on backref, it returns + a kind of "semi-success" indicating that + the match will have to be verified with + a backtracking matcher. */ + MBCSET, /* MBCSET is similar to CSET, but for multibyte characters. */ - WCHAR, /* Only returned by lex. wctok contains - the wide character representation. */ - CSET /* CSET and (and any value greater) is a terminal symbol that matches any of a class of characters. */ @@ -1803,7 +1810,30 @@ add_utf8_anychar (struct dfa *dfa) static void atom (struct dfa *dfa) { - if (dfa->parse.tok == WCHAR) + if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) + || dfa->parse.tok >= CSET + || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF + || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE + || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD + || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD + || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET) + { + if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) + { + /* For UTF-8 expand the period to a series of CSETs that define a + valid UTF-8 character. This avoids using the slow multibyte + path. I'm pretty sure it would be both profitable and correct to + do it for any encoding; however, the optimization must be done + manually as it is done above in add_utf8_anychar. So, let's + start with UTF-8: it is the most used, and the structure of the + encoding makes the correctness more obvious. */ + add_utf8_anychar (dfa); + } + else + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); + } + else if (dfa->parse.tok == WCHAR) { if (dfa->lex.wctok == WEOF) addtok (dfa, BACKREF); @@ -1826,28 +1856,6 @@ atom (struct dfa *dfa) dfa->parse.tok = lex (dfa); } - else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) - { - /* For UTF-8 expand the period to a series of CSETs that define a valid - UTF-8 character. This avoids using the slow multibyte path. I'm - pretty sure it would be both profitable and correct to do it for - any encoding; however, the optimization must be done manually as - it is done above in add_utf8_anychar. So, let's start with - UTF-8: it is the most used, and the structure of the encoding - makes the correctness more obvious. */ - add_utf8_anychar (dfa); - dfa->parse.tok = lex (dfa); - } - else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) - || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF - || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE - || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR - || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD - || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD) - { - addtok (dfa, dfa->parse.tok); - dfa->parse.tok = lex (dfa); - } else if (dfa->parse.tok == LPAREN) { dfa->parse.tok = lex (dfa); -- 2.17.1