bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 15/17] dfa: run simple UTF-8 regexps as a single-byte character set


From: Paolo Bonzini
Subject: [PATCH 15/17] dfa: run simple UTF-8 regexps as a single-byte character set
Date: Fri, 12 Mar 2010 18:49:16 +0100

This partially works around https://savannah.gnu.org/bugs/?29117,
but in general provides a speedup whenever fgrep is "almost" sufficient
but not quite (e.g. grep ^abc).  Speedup is too good to be true :-)
(can get to 1000x on some not-too-contrived testcases).

* src/dfa.c (dfaoptimize): New.
(dfacomp): Call it.
---
 src/dfa.c |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 9703c4f..526eb4e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2975,6 +2975,30 @@ dfainit (struct dfa *d)
 #endif
 }
 
+static void
+dfaoptimize (struct dfa *d)
+{
+  int i;
+  if (!using_utf8)
+    return; /* Nothing to do unless using_utf8 is set.  */
+
+  for (i = 0; i < d->tindex; ++i)
+    {
+      switch(d->tokens[i])
+       {
+       case ANYCHAR:
+         return; /* Bail out; D is left untouched.  */
+       case MBCSET:
+         return; /* Bail out; D is left untouched.  */
+       default:
+         break; /* Any other token: keep scanning.  */
+       }
+    }
+
+  free_mbdata (d); /* Reached only if no ANYCHAR/MBCSET token was seen.  */
+  d->mb_cur_max = 1; /* Presumably makes D run as single-byte; confirm.  */
+}
+
 /* Parse and analyze a single string of the given length. */
 void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
@@ -2982,6 +3006,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
   check_utf8();
   dfainit(d);
   dfaparse(s, len, d);
+  dfaoptimize(d);
   dfamust(d);
   dfaanalyze(d, searchflag);
 }
-- 
1.6.6






reply via email to

[Prev in Thread] Current Thread [Next in Thread]