>From 5eae16cfb8d40ca75691605e8f2f4a40eab70c7a Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Tue, 18 Sep 2018 15:25:51 -0700
Subject: [PATCH 1/4] dfa: reorder enum for efficiency

* lib/dfa.c (END): Now -1 again.  Reorder other elements
of the enumeration to make it easier for GCC to generate
efficient code by using fewer comparisons to check for
ranges of values.
(atom): Take advantage of the reordering.
---
 ChangeLog |   9 ++++
 lib/dfa.c | 130 +++++++++++++++++++++++++++++-------------------------
 2 files changed, 78 insertions(+), 61 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 59581e3c5..64926503a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2018-09-18  Paul Eggert  <address@hidden>
+
+	dfa: reorder enum for efficiency
+	* lib/dfa.c (END): Now -1 again.  Reorder other elements
+	of the enumeration to make it easier for GCC to generate
+	efficient code by using fewer comparisons to check for
+	ranges of values.
+	(atom): Take advantage of the reordering.
+
 2018-09-18  Norihiro Tanaka  <address@hidden>
 
 	dfa: optimize alternation in NFA
diff --git a/lib/dfa.c b/lib/dfa.c
index 3991112ef..849645154 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -223,50 +223,24 @@ typedef ptrdiff_t state_num;
 /* Predefined token values.  */
 enum
 {
-  END = -2,                     /* END is a terminal symbol that matches the
+  END = -1,                     /* END is a terminal symbol that matches the
                                    end of input; any value of END or less in
                                    the parse tree is such a symbol.  Accepting
                                    states of the DFA are those that would have
-                                   a transition on END.  */
-
-  BEG = -1,                     /* BEG is a beginning symbol that matches the
-                                   biginning of input.  */
+                                   a transition on END.  This is -1, not some
+                                   more-negative value, to tweak the speed of
+                                   comparisons to END.  */
 
   /* Ordinary character values are terminal symbols that match themselves.  */
 
+  /* CSET must come last in the following list of special tokens.  Otherwise,
+     the list order matters only for performance.  Related special tokens
+     should have nearby values so that code like (t == ANYCHAR || t == MBCSET
+     || CSET <= t) can be done with a single machine-level comparison.  */
+
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>
-                                   or by any other construct that
-                                   is not completely handled.  If the scanner
-                                   detects a transition on backref, it returns
-                                   a kind of "semi-success" indicating that
-                                   the match will have to be verified with
-                                   a backtracking matcher.  */
-
-  BEGLINE,                      /* BEGLINE is a terminal symbol that matches
-                                   the empty string at the beginning of a
-                                   line.  */
-
-  ENDLINE,                      /* ENDLINE is a terminal symbol that matches
-                                   the empty string at the end of a line.  */
-
-  BEGWORD,                      /* BEGWORD is a terminal symbol that matches
-                                   the empty string at the beginning of a
-                                   word.  */
-
-  ENDWORD,                      /* ENDWORD is a terminal symbol that matches
-                                   the empty string at the end of a word.  */
-
-  LIMWORD,                      /* LIMWORD is a terminal symbol that matches
-                                   the empty string at the beginning or the
-                                   end of a word.  */
-
-  NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
-                                   matches the empty string not at
-                                   the beginning or end of a word.  */
-
   QMARK,                        /* QMARK is an operator of one argument that
                                    matches zero or one occurrences of its
                                    argument.  */
@@ -296,16 +270,49 @@ enum
 
   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 
+  WCHAR,                        /* Only returned by lex.  wctok contains
+                                   the wide character representation.  */
+
   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
                                    a valid multibyte (or single byte) character.
                                    It is used only if MB_CUR_MAX > 1.  */
 
+  BEG,                          /* BEG is an initial symbol that matches the
+                                   beginning of input.  */
+
+  BEGLINE,                      /* BEGLINE is a terminal symbol that matches
+                                   the empty string at the beginning of a
+                                   line.  */
+
+  ENDLINE,                      /* ENDLINE is a terminal symbol that matches
+                                   the empty string at the end of a line.  */
+
+  BEGWORD,                      /* BEGWORD is a terminal symbol that matches
+                                   the empty string at the beginning of a
+                                   word.  */
+
+  ENDWORD,                      /* ENDWORD is a terminal symbol that matches
+                                   the empty string at the end of a word.  */
+
+  LIMWORD,                      /* LIMWORD is a terminal symbol that matches
+                                   the empty string at the beginning or the
+                                   end of a word.  */
+
+  NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
+                                   matches the empty string not at
+                                   the beginning or end of a word.  */
+
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
+                                   is not completely handled.  If the scanner
+                                   detects a transition on backref, it returns
+                                   a kind of "semi-success" indicating that
+                                   the match will have to be verified with
+                                   a backtracking matcher.  */
+
   MBCSET,                       /* MBCSET is similar to CSET, but for
                                    multibyte characters.  */
 
-  WCHAR,                        /* Only returned by lex.  wctok contains
-                                   the wide character representation.  */
-
   CSET                          /* CSET and (and any value greater) is a
                                    terminal symbol that matches any of a
                                    class of characters.  */
@@ -1803,7 +1810,30 @@ add_utf8_anychar (struct dfa *dfa)
 static void
 atom (struct dfa *dfa)
 {
-  if (dfa->parse.tok == WCHAR)
+  if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+      || dfa->parse.tok >= CSET
+      || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
+      || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+      || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
+      || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
+      || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
+    {
+      if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
+        {
+          /* For UTF-8 expand the period to a series of CSETs that define a
+             valid UTF-8 character.  This avoids using the slow multibyte
+             path.  I'm pretty sure it would be both profitable and correct to
+             do it for any encoding; however, the optimization must be done
+             manually as it is done above in add_utf8_anychar.  So, let's
+             start with UTF-8: it is the most used, and the structure of the
+             encoding makes the correctness more obvious.  */
+          add_utf8_anychar (dfa);
+        }
+      else
+        addtok (dfa, dfa->parse.tok);
+      dfa->parse.tok = lex (dfa);
+    }
+  else if (dfa->parse.tok == WCHAR)
     {
       if (dfa->lex.wctok == WEOF)
         addtok (dfa, BACKREF);
@@ -1826,28 +1856,6 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
-    {
-      /* For UTF-8 expand the period to a series of CSETs that define a valid
-         UTF-8 character.  This avoids using the slow multibyte path.  I'm
-         pretty sure it would be both profitable and correct to do it for
-         any encoding; however, the optimization must be done manually as
-         it is done above in add_utf8_anychar.  So, let's start with
-         UTF-8: it is the most used, and the structure of the encoding
-         makes the correctness more obvious.  */
-      add_utf8_anychar (dfa);
-      dfa->parse.tok = lex (dfa);
-    }
-  else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
-           || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
-           || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
-           || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
-           || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
-           || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
-    {
-      addtok (dfa, dfa->parse.tok);
-      dfa->parse.tok = lex (dfa);
-    }
   else if (dfa->parse.tok == LPAREN)
     {
       dfa->parse.tok = lex (dfa);
-- 
2.17.1