diff --git a/src/dfa.c b/src/dfa.c
index 42a9736..65fc03d 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -339,9 +339,8 @@ struct dfa
                                    with dfaparse.  */
   unsigned int mb_cur_max;      /* Cached value of MB_CUR_MAX.  */
   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
-  mbstate_t mbs;		/* Multibyte conversion state.  */
 
-  /* The following are valid only if mb_cur_max > 1.  */
+  /* The following are used only if MB_CUR_MAX > 1.  */
 
   /* The value of multibyte_prop[i] is defined by following rule.
      if tokens[i] < NOTCHAR
@@ -423,7 +422,7 @@ struct dfa
   struct dfamust *musts;        /* List of strings, at least one of which
                                    is known to appear in any r.e. matching
                                    the dfa.  */
-  position_set mb_follows;	/* Follow set added by ANYCHAR and/or MBCSET
+  position_set *mb_follows;     /* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
   int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
                                    MBCSET.  */
@@ -463,34 +462,33 @@ dfambcache (struct dfa *d)
     }
 }
 
-/* Store into *PWC the result of converting the leading bytes of the
-   multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
-   and updating the conversion state in *D.  On conversion error,
-   convert just a single byte as-is.  Return the number of bytes
-   converted.
+/* Given the dfa D, store into *PWC the result of converting the
+   leading bytes of the multibyte buffer S of length N bytes, updating
+   the conversion state in *MBS.  On conversion error, convert just a
+   single byte as-is.  Return the number of bytes converted.
 
-   This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
+   This differs from mbrtowc (PWC, S, N, MBS) as follows:
 
-   * The last arg is a dfa *D instead of merely a multibyte conversion
-     state D->mbs.  D also contains an mbrtowc_cache for speed.
+   * Extra arg D, containing an mbrtowc_cache for speed.
    * N must be at least 1.
    * S[N - 1] must be a sentinel byte.
    * Shift encodings are not supported.
    * The return value is always in the range 1..N.
-   * D->mbs is always valid afterwards.
+   * *MBS is always valid afterwards.
    * *PWC is always set to something.  */
 static size_t
-mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d)
+mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
+              mbstate_t *mbs)
 {
   unsigned char uc = s[0];
   wint_t wc = d->mbrtowc_cache[uc];
 
   if (wc == WEOF)
     {
-      size_t nbytes = mbrtowc (pwc, s, n, &d->mbs);
+      size_t nbytes = mbrtowc (pwc, s, n, mbs);
       if (0 < nbytes && nbytes < (size_t) -2)
         return nbytes;
-      memset (&d->mbs, 0, sizeof d->mbs);
+      memset (mbs, 0, sizeof *mbs);
       wc = uc;
     }
 
@@ -840,6 +838,7 @@ static int minrep, maxrep;      /* Repeat counts for {m,n}.  */
 static int cur_mb_len = 1;      /* Length of the multibyte representation of
                                    wctok.  */
 /* These variables are used only if (MB_CUR_MAX > 1).  */
+static mbstate_t mbs;           /* mbstate for mbrtowc.  */
 static wchar_t wctok;           /* Wide character representation of the current
                                    multibyte character.  */
 
@@ -857,7 +856,7 @@ static wchar_t wctok;           /* Wide character representation of the current
     else					\
       {						\
         wchar_t _wc;				\
-        size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \
+        size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
         cur_mb_len = nbytes;			\
         (wc) = _wc;				\
         (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF;    \
@@ -1933,7 +1932,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   if (MB_CUR_MAX > 1)
     {
       cur_mb_len = 0;
-      memset (&d->mbs, 0, sizeof d->mbs);
+      memset (&mbs, 0, sizeof mbs);
     }
 
   if (!syntax_bits_set)
@@ -3113,7 +3112,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
       s2 = s1;
       rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
     }
-  copy (&d->states[s1].elems, &d->mb_follows);
+  /* Copy the positions contained by 's1' to the set 'd->mb_follows'.  */
+  copy (&(d->states[s1].elems), d->mb_follows);
 
   /* Add all of the positions which can be reached from 's' by consuming
      a single character.  */
@@ -3123,7 +3123,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
              j++)
           insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
-                  &d->mb_follows);
+                  d->mb_follows);
     }
 
   /* FIXME: this return value is always ignored.  */
@@ -3151,7 +3151,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
        We check whether each of them can match or not.  */
     {
       /* Note: caller must free the return value of this function.  */
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+      mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs);
       match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
                                                       wc, mbclen);
 
@@ -3179,7 +3179,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     }
 
   /* This state has some operators which can match a multibyte character.  */
-  d->mb_follows.nelem = 0;
+  d->mb_follows->nelem = 0;
 
   /* 'maxlen' may be longer than the length of a character, because it may
      not be a character but a (multi character) collating element.
@@ -3187,12 +3187,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
      'maxlen' bytes.  */
   transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
 
-  s1 = state_index (d, &d->mb_follows, wchar_context (wc));
+  s1 = state_index (d, d->mb_follows, wchar_context (wc));
   realloc_trans_if_necessary (d, s1);
 
   while (*pp - p1 < maxlen)
     {
-      mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+      mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs);
       transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
 
       for (i = 0; i < nelem; i++)
@@ -3201,10 +3201,10 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
             for (j = 0;
                  j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
               insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                      &d->mb_follows);
+                      d->mb_follows);
         }
 
-      s1 = state_index (d, &d->mb_follows, wchar_context (wc));
+      s1 = state_index (d, d->mb_follows, wchar_context (wc));
       realloc_trans_if_necessary (d, s1);
     }
   return s1;
@@ -3244,9 +3244,15 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 
   if (d->mb_cur_max > 1)
     {
-      memset (&d->mbs, 0, sizeof d->mbs);
-      d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
-      alloc_position_set (&d->mb_follows, d->nleaves);
+      static bool mb_alloc = false;
+      memset (&mbs, 0, sizeof (mbstate_t));
+      if (!mb_alloc)
+        {
+          d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
+          d->mb_follows = xmalloc (sizeof *d->mb_follows);
+          alloc_position_set (d->mb_follows, d->nleaves);
+          mb_alloc = true;
+        }
     }
 
   for (;;)
@@ -3271,8 +3277,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
                      character.  */
                   wchar_t wc;
                   while (mbp < p)
-                    mbp += mbs_to_wchar (&wc, (char const *) mbp,
-                                         end - (char const *) mbp, d);
+                    mbp += mbs_to_wchar (d, &wc, (char const *) mbp,
+                                         end - (char const *) mbp, &mbs);
                   p = mbp;
 
                   if ((char *) p >= end)
@@ -3401,6 +3407,7 @@ free_mbdata (struct dfa *d)
   size_t i;
 
   free (d->multibyte_prop);
+  d->multibyte_prop = NULL;
 
   for (i = 0; i < d->nmbcsets; ++i)
     {
@@ -3420,7 +3427,14 @@ free_mbdata (struct dfa *d)
     }
 
   free (d->mbcsets);
-  free (d->mb_follows.elems);
+  d->mbcsets = NULL;
+  d->nmbcsets = 0;
+
+  if (d->mb_follows)
+    {
+      free (d->mb_follows->elems);
+      free (d->mb_follows);
+    }
   free (d->mb_match_lens);
 }
 
diff --git a/src/kwset.c b/src/kwset.c
index f86ee03..ce127e2 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -55,6 +55,35 @@
 
 #define U(c) (to_uchar (c))
 
+#if defined(__i386__) || defined(__x86_64__)
+#define SHIFT(p, d)	\
+  switch (d)		\
+    {			\
+    default:		\
+      p += d;		\
+      break;		\
+    case 8:		\
+      ++p;		\
+    case 7:		\
+      ++p;		\
+    case 6:		\
+      ++p;		\
+    case 5:		\
+      ++p;		\
+    case 4:		\
+      ++p;		\
+    case 3:		\
+      ++p;		\
+    case 2:		\
+      ++p;		\
+    case 1:		\
+      ++p;		\
+    }
+#else
+#define SHIFT(p, d)	\
+  p += d;
+#endif
+
 /* Balanced tree of edges and labels leaving a given trie node. */
 struct tree
 {
@@ -508,13 +537,14 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
             }
         }
 
-      tp += d = kwset->shift[i - 2];
+      d = kwset->shift[i - 2];
+      SHIFT(tp, d);
       if (tp > ep)
         break;
       if (tr (trans, tp[-1]) != gc1)
         {
           if (d1)
-            tp += d1[U(tp[-1])];
+            SHIFT(tp, d1[U(tp[-1])]);
           break;
         }
       skip = i - 1;
@@ -524,20 +554,6 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
   return false;
 }
 
-/* Return the address of the first byte in the buffer S that equals C.
-   S contains N bytes.  If TRANS is nonnull, use it to transliterate
-   S's bytes before comparing them.  */
-static char const *
-memchr_trans (char const *s, char c, size_t n, char const *trans)
-{
-  if (! trans)
-    return memchr (s, c, n);
-  char const *slim = s + n;
-  for (; s < slim; s++)
-    if (trans[U(*s)] == c)
-      return s;
-  return NULL;
-}
 
 /* Fast boyer-moore search. */
 static size_t _GL_ATTRIBUTE_PURE
@@ -555,8 +571,18 @@ bmexec (kwset_t kwset, char const *text, size_t size)
     return -1;
   if (len == 1)
     {
-      tp = memchr_trans (text, kwset->target[0], size, trans);
-      return tp ? tp - text : -1;
+      if (trans)
+        {
+          for (tp = text; tp < text + size; tp++)
+            if (trans[U(*tp)] == kwset->target[0])
+              return tp - text;
+          return -1;
+        }
+      else
+        {
+          tp = memchr (text, kwset->target[0], size);
+          return tp ? tp - text : -1;
+        }
     }
 
   d1 = kwset->delta;
@@ -568,33 +594,48 @@ bmexec (kwset_t kwset, char const *text, size_t size)
   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
   if (size > 12 * len)
     /* 11 is not a bug, the initial offset happens only once. */
-    for (ep = text + size - 11 * len; tp <= ep; )
+    for (ep = text + size - 11 * len;;)
       {
-        d = d1[U(tp[-1])], tp += d;
-        d = d1[U(tp[-1])], tp += d;
-        if (d != 0)
+        while (tp <= ep)
           {
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            if (d != 0)
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            if (d == 0)
+              goto found;
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            if (d == 0)
+              goto found;
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            if (d == 0)
+              goto found;
+            /* memchar() of glibc is faster than seeking by delta1 on
+               some platforms.  When there is no chance to match for a
+               while, use it on them.  */
+#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__))
+            if (!trans)
               {
-                d = d1[U(tp[-1])], tp += d;
-                d = d1[U(tp[-1])], tp += d;
-                d = d1[U(tp[-1])], tp += d;
-                if (d != 0)
+                tp = memchr (tp - 1, gc1, size + text - tp + 1);
+                if (tp)
                   {
-                    /* Typically memchr is faster than seeking by
-                       delta1 when there is no chance to match for
-                       a while.  */
-                    tp--;
-                    tp = memchr_trans (tp, gc1, text + size - tp, trans);
-                    if (! tp)
-                      return -1;
-                    tp++;
+                    ++tp;
+                    goto found;
                   }
+                else
+                  return -1;
+              }
+            else
+#endif
+              {
+                d = d1[U(tp[-1])]; SHIFT(tp, d);
+                d = d1[U(tp[-1])]; SHIFT(tp, d);
               }
           }
+        break;
+      found:
         if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
           return tp - text;
       }
@@ -605,7 +646,8 @@ bmexec (kwset_t kwset, char const *text, size_t size)
   d = d1[U(tp[-1])];
   while (d <= ep - tp)
     {
-      d = d1[U((tp += d)[-1])];
+      SHIFT(tp, d);
+      d = d1[U(tp[-1])];
       if (d != 0)
         continue;
       if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
@@ -659,17 +701,18 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
     {
       if (qlim && end <= qlim)
         {
-          end += d - 1;
           while ((d = delta[c = *end]) && end < qlim)
             {
-              end += d;
-              end += delta[U(*end)];
-              end += delta[U(*end)];
+              SHIFT(end, d);
+              d = delta[U(end[-1])]; SHIFT(end, d);
+              d = delta[U(end[-1])]; SHIFT(end, d);
             }
-          ++end;
         }
       else
-        d = delta[c = (end += d)[-1]];
+        {
+          SHIFT(end, d);
+          d = delta[c = end[-1]];
+        }
       if (d)
         continue;
       beg = end - 1;
@@ -718,7 +761,8 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
   d = 1;
   while (lim - end >= d)
     {
-      if ((d = delta[c = (end += d)[-1]]) != 0)
+      SHIFT(end, d);
+      if ((d = delta[c = end[-1]]) != 0)
         continue;
       beg = end - 1;
       if (!(trie = next[c]))