[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 3/6] dfa: reorder enum for efficiency
From: |
Paul Eggert |
Subject: |
[PATCH 3/6] dfa: reorder enum for efficiency |
Date: |
Tue, 18 Sep 2018 19:17:32 -0700 |
* lib/dfa.c (END): Now -1 again. Reorder other elements
of the enumeration to make it easier for GCC to generate
efficient code by using fewer comparisons to check for
ranges of values.
(atom): Take advantage of the reordering.
---
ChangeLog | 9 ++++
lib/dfa.c | 130 +++++++++++++++++++++++++++++-------------------------
2 files changed, 78 insertions(+), 61 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 59581e3c5..64926503a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2018-09-18 Paul Eggert <address@hidden>
+
+ dfa: reorder enum for efficiency
+ * lib/dfa.c (END): Now -1 again. Reorder other elements
+ of the enumeration to make it easier for GCC to generate
+ efficient code by using fewer comparisons to check for
+ ranges of values.
+ (atom): Take advantage of the reordering.
+
2018-09-18 Norihiro Tanaka <address@hidden>
dfa: optimize alternation in NFA
diff --git a/lib/dfa.c b/lib/dfa.c
index 3991112ef..849645154 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -223,50 +223,24 @@ typedef ptrdiff_t state_num;
/* Predefined token values. */
enum
{
- END = -2, /* END is a terminal symbol that matches the
+ END = -1, /* END is a terminal symbol that matches the
end of input; any value of END or less in
the parse tree is such a symbol. Accepting
states of the DFA are those that would have
- a transition on END. */
-
- BEG = -1, /* BEG is a beginning symbol that matches the
- biginning of input. */
+ a transition on END. This is -1, not some
+ more-negative value, to tweak the speed of
+ comparisons to END. */
/* Ordinary character values are terminal symbols that match themselves. */
+ /* CSET must come last in the following list of special tokens. Otherwise,
+ the list order matters only for performance. Related special tokens
+ should have nearby values so that code like (t == ANYCHAR || t == MBCSET
+ || CSET <= t) can be done with a single machine-level comparison. */
+
EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
the empty string. */
- BACKREF, /* BACKREF is generated by \<digit>
- or by any other construct that
- is not completely handled. If the scanner
- detects a transition on backref, it returns
- a kind of "semi-success" indicating that
- the match will have to be verified with
- a backtracking matcher. */
-
- BEGLINE, /* BEGLINE is a terminal symbol that matches
- the empty string at the beginning of a
- line. */
-
- ENDLINE, /* ENDLINE is a terminal symbol that matches
- the empty string at the end of a line. */
-
- BEGWORD, /* BEGWORD is a terminal symbol that matches
- the empty string at the beginning of a
- word. */
-
- ENDWORD, /* ENDWORD is a terminal symbol that matches
- the empty string at the end of a word. */
-
- LIMWORD, /* LIMWORD is a terminal symbol that matches
- the empty string at the beginning or the
- end of a word. */
-
- NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
- matches the empty string not at
- the beginning or end of a word. */
-
QMARK, /* QMARK is an operator of one argument that
matches zero or one occurrences of its
argument. */
@@ -296,16 +270,49 @@ enum
RPAREN, /* RPAREN never appears in the parse tree. */
+ WCHAR, /* Only returned by lex. wctok contains
+ the wide character representation. */
+
ANYCHAR, /* ANYCHAR is a terminal symbol that matches
a valid multibyte (or single byte)
character.
It is used only if MB_CUR_MAX > 1. */
+ BEG, /* BEG is an initial symbol that matches the
+ beginning of input. */
+
+ BEGLINE, /* BEGLINE is a terminal symbol that matches
+ the empty string at the beginning of a
+ line. */
+
+ ENDLINE, /* ENDLINE is a terminal symbol that matches
+ the empty string at the end of a line. */
+
+ BEGWORD, /* BEGWORD is a terminal symbol that matches
+ the empty string at the beginning of a
+ word. */
+
+ ENDWORD, /* ENDWORD is a terminal symbol that matches
+ the empty string at the end of a word. */
+
+ LIMWORD, /* LIMWORD is a terminal symbol that matches
+ the empty string at the beginning or the
+ end of a word. */
+
+ NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
+ matches the empty string not at
+ the beginning or end of a word. */
+
+ BACKREF, /* BACKREF is generated by \<digit>
+ or by any other construct that
+ is not completely handled. If the scanner
+ detects a transition on backref, it returns
+ a kind of "semi-success" indicating that
+ the match will have to be verified with
+ a backtracking matcher. */
+
MBCSET, /* MBCSET is similar to CSET, but for
multibyte characters. */
- WCHAR, /* Only returned by lex. wctok contains
- the wide character representation. */
-
CSET /* CSET and (and any value greater) is a
terminal symbol that matches any of a
class of characters. */
@@ -1803,7 +1810,30 @@ add_utf8_anychar (struct dfa *dfa)
static void
atom (struct dfa *dfa)
{
- if (dfa->parse.tok == WCHAR)
+ if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+ || dfa->parse.tok >= CSET
+ || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
+ || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+ || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
+ || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
+ || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
+ {
+ if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
+ {
+ /* For UTF-8 expand the period to a series of CSETs that define a
+ valid UTF-8 character. This avoids using the slow multibyte
+ path. I'm pretty sure it would be both profitable and correct to
+ do it for any encoding; however, the optimization must be done
+ manually as it is done above in add_utf8_anychar. So, let's
+ start with UTF-8: it is the most used, and the structure of the
+ encoding makes the correctness more obvious. */
+ add_utf8_anychar (dfa);
+ }
+ else
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
+ }
+ else if (dfa->parse.tok == WCHAR)
{
if (dfa->lex.wctok == WEOF)
addtok (dfa, BACKREF);
@@ -1826,28 +1856,6 @@ atom (struct dfa *dfa)
dfa->parse.tok = lex (dfa);
}
- else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
- {
- /* For UTF-8 expand the period to a series of CSETs that define a valid
- UTF-8 character. This avoids using the slow multibyte path. I'm
- pretty sure it would be both profitable and correct to do it for
- any encoding; however, the optimization must be done manually as
- it is done above in add_utf8_anychar. So, let's start with
- UTF-8: it is the most used, and the structure of the encoding
- makes the correctness more obvious. */
- add_utf8_anychar (dfa);
- dfa->parse.tok = lex (dfa);
- }
- else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
- || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
- || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
- || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
- || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
- || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
- {
- addtok (dfa, dfa->parse.tok);
- dfa->parse.tok = lex (dfa);
- }
else if (dfa->parse.tok == LPAREN)
{
dfa->parse.tok = lex (dfa);
--
2.17.1