bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#17156: [PATCH 4/5] grep: optimization of DFA by reuse of multi-byte


From: Paolo Bonzini
Subject: bug#17156: [PATCH 4/5] grep: optimization of DFA by reuse of multi-byte buffers in non-UTF8 locales
Date: Tue, 1 Apr 2014 11:18:45 +0200

From: Norihiro Tanaka <address@hidden>

* src/dfa.c (struct dfa): New members `mblen_buf', `nmblen_buf',
`inputwcs', `ninputwcs', `mb_follows' and `mb_match_lens'.
(SKIP_REMAINS_MB_IF_INITIAL_STATE): Use them instead of global variables.
(match_anychar): Use them instead of global variables.
(match_mb_charset): Use them instead of global variables.
(check_matching_with_multibyte_ops): Use it instead of new allocation.
(transit_state_consume_1char): Use them instead of global variables.
(transit_state): Use them instead of global variables.
(prepare_wc_buf): Use them instead of global variables.
(dfaexec): Initializations of new members.
(free_mbdata): Memory Deallocations for new members.
---
 src/dfa.c | 133 ++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 68 insertions(+), 65 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index c06c922..e9cf7cd 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -432,6 +432,30 @@ struct dfa
   struct dfamust *musts;        /* List of strings, at least one of which
                                    is known to appear in any r.e. matching
                                    the dfa.  */
+  unsigned char *mblen_buf;     /* Correspond to the input buffer in dfaexec.
+                                   Each element stores the number of remaining
+                                   bytes of the corresponding multibyte
+                                   character in the input string.  A element's
+                                   value is 0 if the corresponding character is
+                                   single-byte.
+                                   e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
+                                   mblen_buf   :  0,       3,       2,       1
+                                 */
+  size_t nmblen_buf;            /* Length of the mblen buffer currently
+                                   allocated.  */
+  wchar_t *inputwcs;            /* Wide character representation of the input
+                                   string in dfaexec.
+                                   The length of this array is the same as
+                                   the length of input string (char array).
+                                   inputstring[i] is a single-byte char,
+                                   or the first byte of a multibyte char;
+                                   inputwcs[i] is the codepoint.  */
+  size_t ninputwcs;             /* Length of the input wide characters
+                                   currently allocated.  */
+  position_set *mb_follows;     /* Follow set added by ANYCHAR and/or MBCSET
+                                   on demand.  */
+  int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
+                                   MBCSET.  */
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -873,22 +897,6 @@ static int cur_mb_len = 1;      /* Length of the multibyte 
representation of
 static mbstate_t mbs;           /* mbstate for mbrtowc.  */
 static wchar_t wctok;           /* Wide character representation of the current
                                    multibyte character.  */
-static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
-                                   Each element stores the number of remaining
-                                   bytes of the corresponding multibyte
-                                   character in the input string.  A element's
-                                   value is 0 if the corresponding character is
-                                   single-byte.
-                                   e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
-                                   mblen_buf   :  0,       3,       2,       1
-                                 */
-static wchar_t *inputwcs;       /* Wide character representation of the input
-                                   string in dfaexec.
-                                   The length of this array is the same as
-                                   the length of input string (char array).
-                                   inputstring[i] is a single-byte char,
-                                   or the first byte of a multibyte char;
-                                   inputwcs[i] is the codepoint.  */
 static unsigned char const *buf_begin;  /* reference to begin in dfaexec.  */
 static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
 
@@ -2956,14 +2964,12 @@ build_state_zero (struct dfa *d)
 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)         \
   if (s == 0)                                          \
     {                                                  \
-      while (inputwcs[p - buf_begin] == 0              \
-            && mblen_buf[p - buf_begin] > 0            \
+      while (d->inputwcs[p - buf_begin] == 0           \
+            && d->mblen_buf[p - buf_begin] > 0         \
             && (unsigned char const *) p < buf_end)    \
         ++p;                                           \
       if ((char *) p >= end)                           \
         {                                              \
-          free (mblen_buf);                            \
-          free (inputwcs);                             \
           *end = saved_end;                            \
           return NULL;                                 \
         }                                              \
@@ -3057,8 +3063,8 @@ match_anychar (struct dfa *d, state_num s, position pos, 
size_t idx)
   wchar_t wc;
   int mbclen;
 
-  wc = inputwcs[idx];
-  mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
+  wc = d->inputwcs[idx];
+  mbclen = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
 
   /* Check syntax bits.  */
   if (wc == (wchar_t) eolbyte)
@@ -3099,7 +3105,7 @@ match_mb_charset (struct dfa *d, state_num s, position 
pos, size_t idx)
   int context;
   wchar_t wc;                   /* Current referring character.  */
 
-  wc = inputwcs[idx];
+  wc = d->inputwcs[idx];
 
   /* Check syntax bits.  */
   if (wc == (wchar_t) eolbyte)
@@ -3120,7 +3126,7 @@ match_mb_charset (struct dfa *d, state_num s, position 
pos, size_t idx)
   /* Assign the current referring operator to work_mbc.  */
   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
   match = !work_mbc->invert;
-  match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
+  match_len = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
 
   /* Match in range 0-255?  */
   if (wc < NOTCHAR && work_mbc->cset != -1
@@ -3198,7 +3204,7 @@ check_matching_with_multibyte_ops (struct dfa *d, 
state_num s, size_t idx)
   size_t i;
   int *rarray;
 
-  MALLOC (rarray, d->states[s].mbps.nelem);
+  rarray = d->mb_match_lens;
   for (i = 0; i < d->states[s].mbps.nelem; ++i)
     {
       position pos = d->states[s].mbps.elems[i];
@@ -3228,7 +3234,7 @@ check_matching_with_multibyte_ops (struct dfa *d, 
state_num s, size_t idx)
 static status_transit_state
 transit_state_consume_1char (struct dfa *d, state_num s,
                              unsigned char const **pp,
-                             int *match_lens, int *mbclen, position_set * pps)
+                             int *match_lens, int *mbclen)
 {
   size_t i, j;
   int k;
@@ -3238,7 +3244,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
 
   /* Calculate the length of the (single/multi byte) character
      to which p points.  */
-  *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
+  *mbclen = (d->mblen_buf[*pp - buf_begin] == 0) ? 1 : d->mblen_buf[*pp - 
buf_begin];
 
   /* Calculate the state which can be reached from the state 's' by
      consuming '*mbclen' single bytes from the buffer.  */
@@ -3248,8 +3254,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
       s2 = s1;
       rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
     }
-  /* Copy the positions contained by 's1' to the set 'pps'.  */
-  copy (&(d->states[s1].elems), pps);
+  /* Copy the positions contained by 's1' to the set 'd->mb_follows'.  */
+  copy (&(d->states[s1].elems), d->mb_follows);
 
   /* Check (input) match_lens, and initialize if it is NULL.  */
   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
@@ -3264,12 +3270,10 @@ transit_state_consume_1char (struct dfa *d, state_num s,
       if (work_mbls[i] == *mbclen)
         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
              j++)
-          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
+          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+            d->mb_follows);
     }
 
-  if (match_lens == NULL && work_mbls != NULL)
-    free (work_mbls);
-
   /* FIXME: this return value is always ignored.  */
   return rs;
 }
@@ -3286,7 +3290,6 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp)
   size_t i, j;
   int *match_lens = NULL;
   size_t nelem = d->states[s].mbps.nelem;       /* Just a alias.  */
-  position_set follows;
   unsigned char const *p1 = *pp;
   wchar_t wc;
 
@@ -3317,26 +3320,25 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp)
       if (rs == TRANSIT_STATE_DONE)
         ++*pp;
 
-      free (match_lens);
       return s1;
     }
 
   /* This state has some operators which can match a multibyte character.  */
-  alloc_position_set (&follows, d->nleaves);
+  d->mb_follows->nelem = 0;
 
   /* 'maxlen' may be longer than the length of a character, because it may
      not be a character but a (multi character) collating element.
      We enumerate all of the positions which 's' can reach by consuming
      'maxlen' bytes.  */
-  transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
+  transit_state_consume_1char (d, s, pp, match_lens, &mbclen);
 
-  wc = inputwcs[*pp - mbclen - buf_begin];
-  s1 = state_index (d, &follows, wchar_context (wc));
+  wc = d->inputwcs[*pp - mbclen - buf_begin];
+  s1 = state_index (d, d->mb_follows, wchar_context (wc));
   realloc_trans_if_necessary (d, s1);
 
   while (*pp - p1 < maxlen)
     {
-      transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
+      transit_state_consume_1char (d, s1, pp, NULL, &mbclen);
 
       for (i = 0; i < nelem; i++)
         {
@@ -3344,15 +3346,13 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp)
             for (j = 0;
                  j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
               insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                      &follows);
+                      d->mb_follows);
         }
 
-      wc = inputwcs[*pp - mbclen - buf_begin];
-      s1 = state_index (d, &follows, wchar_context (wc));
+      wc = d->inputwcs[*pp - mbclen - buf_begin];
+      s1 = state_index (d, d->mb_follows, wchar_context (wc));
       realloc_trans_if_necessary (d, s1);
     }
-  free (match_lens);
-  free (follows.elems);
   return s1;
 }
 
@@ -3371,21 +3371,21 @@ prepare_wc_buf (struct dfa *d, const char *begin, const 
char *end)
 
   for (i = 0; i < ilim; i++)
     {
-      size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, 
&mbs);
-      mblen_buf[i] = nbytes - (nbytes == 1);
+      size_t nbytes = mbs_to_wchar (d, d->inputwcs + i, begin + i, ilim - i, 
&mbs);
+      d->mblen_buf[i] = nbytes - (nbytes == 1);
       if (begin[i] == eol)
         break;
       while (--nbytes != 0)
         {
           i++;
-          mblen_buf[i] = nbytes;
-          inputwcs[i] = 0;
+          d->mblen_buf[i] = nbytes;
+          d->inputwcs[i] = 0;
         }
     }
 
   buf_end = (unsigned char *) (begin + i);
-  mblen_buf[i] = 0;
-  inputwcs[i] = 0;              /* sentinel */
+  d->mblen_buf[i] = 0;
+  d->inputwcs[i] = 0;              /* sentinel */
 #endif /* MBS_SUPPORT */
 }
 
@@ -3423,10 +3423,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 
   if (d->mb_cur_max > 1)
     {
-      MALLOC (mblen_buf, end - begin + 2);
-      MALLOC (inputwcs, end - begin + 2);
+      static bool mb_alloc = false;
+      REALLOC_IF_NECESSARY (d->mblen_buf, d->nmblen_buf, end - begin + 2);
+      REALLOC_IF_NECESSARY (d->inputwcs, d->ninputwcs, end - begin + 2);
       memset (&mbs, 0, sizeof (mbstate_t));
       prepare_wc_buf (d, (const char *) p, end);
+      if (!mb_alloc)
+        {
+          MALLOC (d->mb_match_lens, d->nleaves);
+          MALLOC (d->mb_follows, 1);
+          alloc_position_set (d->mb_follows, d->nleaves);
+          mb_alloc = true;
+        }
     }
 
   for (;;)
@@ -3453,8 +3461,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
               if (backref)
                 {
                   *backref = 1;
-                  free (mblen_buf);
-                  free (inputwcs);
                   *end = saved_end;
                   return (char *) p;
                 }
@@ -3487,11 +3493,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
             {
               if (backref)
                 *backref = (d->states[s].backref != 0);
-              if (d->mb_cur_max > 1)
-                {
-                  free (mblen_buf);
-                  free (inputwcs);
-                }
               *end = saved_end;
               return (char *) p;
             }
@@ -3522,11 +3523,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       /* Check if we've run off the end of the buffer.  */
       if ((char *) p > end)
         {
-          if (d->mb_cur_max > 1)
-            {
-              free (mblen_buf);
-              free (inputwcs);
-            }
           *end = saved_end;
           return NULL;
         }
@@ -3578,6 +3574,13 @@ free_mbdata (struct dfa *d)
   free (d->mbcsets);
   d->mbcsets = NULL;
   d->nmbcsets = 0;
+
+  free (d->mblen_buf);
+  free (d->inputwcs);
+  if (d->mb_follows != NULL)
+    free (d->mb_follows->elems);
+  free (d->mb_follows);
+  free (d->mb_match_lens);
 }
 
 /* Initialize the components of a dfa that the other routines don't
-- 
1.9.0







reply via email to

[Prev in Thread] Current Thread [Next in Thread]