[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#17156: [PATCH 4/5] grep: optimization of DFA by reuse of multi-byte
From: |
Paolo Bonzini |
Subject: |
bug#17156: [PATCH 4/5] grep: optimization of DFA by reuse of multi-byte buffers in non-UTF8 locales |
Date: |
Tue, 1 Apr 2014 11:18:45 +0200 |
From: Norihiro Tanaka <address@hidden>
* src/dfa.c (struct dfa): New members `mblen_buf', `nmblen_buf',
`inputwcs', `ninputwcs', `mb_follows' and `mb_match_lens'.
(SKIP_REMAINS_MB_IF_INITIAL_STATE): Use them instead of global variables.
(match_anychar): Use them instead of global variables.
(match_mb_charset): Use them instead of global variables.
(check_matching_with_multibyte_ops): Use it instead of new allocation.
(transit_state_consume_1char): Use them instead of global variables.
(transit_state): Use them instead of global variables.
(prepare_wc_buf): Use them instead of global variables.
(dfaexec): Initializations of new members.
(free_mbdata): Memory Deallocations for new members.
---
src/dfa.c | 133 ++++++++++++++++++++++++++++++++------------------------------
1 file changed, 68 insertions(+), 65 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index c06c922..e9cf7cd 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -432,6 +432,30 @@ struct dfa
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
+ unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec.
+ Each element stores the number of remaining
+ bytes of the corresponding multibyte
+ character in the input string. A element's
+ value is 0 if the corresponding character is
+ single-byte.
+ e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
+ mblen_buf : 0, 3, 2, 1
+ */
+ size_t nmblen_buf; /* Length of the mblen buffer currently
+ allocated. */
+ wchar_t *inputwcs; /* Wide character representation of the input
+ string in dfaexec.
+ The length of this array is the same as
+ the length of input string (char array).
+ inputstring[i] is a single-byte char,
+ or the first byte of a multibyte char;
+ inputwcs[i] is the codepoint. */
+ size_t ninputwcs; /* Length of the input wide characters
+ currently allocated. */
+ position_set *mb_follows; /* Follow set added by ANYCHAR and/or MBCSET
+ on demand. */
+ int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or
+ MBCSET. */
};
/* Some macros for user access to dfa internals. */
@@ -873,22 +897,6 @@ static int cur_mb_len = 1; /* Length of the multibyte
representation of
static mbstate_t mbs; /* mbstate for mbrtowc. */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
-static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
- Each element stores the number of remaining
- bytes of the corresponding multibyte
- character in the input string. A element's
- value is 0 if the corresponding character is
- single-byte.
- e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
- mblen_buf : 0, 3, 2, 1
- */
-static wchar_t *inputwcs; /* Wide character representation of the input
- string in dfaexec.
- The length of this array is the same as
- the length of input string (char array).
- inputstring[i] is a single-byte char,
- or the first byte of a multibyte char;
- inputwcs[i] is the codepoint. */
static unsigned char const *buf_begin; /* reference to begin in dfaexec. */
static unsigned char const *buf_end; /* reference to end in dfaexec. */
@@ -2956,14 +2964,12 @@ build_state_zero (struct dfa *d)
#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
if (s == 0) \
{ \
- while (inputwcs[p - buf_begin] == 0 \
- && mblen_buf[p - buf_begin] > 0 \
+ while (d->inputwcs[p - buf_begin] == 0 \
+ && d->mblen_buf[p - buf_begin] > 0 \
&& (unsigned char const *) p < buf_end) \
++p; \
if ((char *) p >= end) \
{ \
- free (mblen_buf); \
- free (inputwcs); \
*end = saved_end; \
return NULL; \
} \
@@ -3057,8 +3063,8 @@ match_anychar (struct dfa *d, state_num s, position pos,
size_t idx)
wchar_t wc;
int mbclen;
- wc = inputwcs[idx];
- mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
+ wc = d->inputwcs[idx];
+ mbclen = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
/* Check syntax bits. */
if (wc == (wchar_t) eolbyte)
@@ -3099,7 +3105,7 @@ match_mb_charset (struct dfa *d, state_num s, position
pos, size_t idx)
int context;
wchar_t wc; /* Current referring character. */
- wc = inputwcs[idx];
+ wc = d->inputwcs[idx];
/* Check syntax bits. */
if (wc == (wchar_t) eolbyte)
@@ -3120,7 +3126,7 @@ match_mb_charset (struct dfa *d, state_num s, position
pos, size_t idx)
/* Assign the current referring operator to work_mbc. */
work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
match = !work_mbc->invert;
- match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
+ match_len = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
/* Match in range 0-255? */
if (wc < NOTCHAR && work_mbc->cset != -1
@@ -3198,7 +3204,7 @@ check_matching_with_multibyte_ops (struct dfa *d,
state_num s, size_t idx)
size_t i;
int *rarray;
- MALLOC (rarray, d->states[s].mbps.nelem);
+ rarray = d->mb_match_lens;
for (i = 0; i < d->states[s].mbps.nelem; ++i)
{
position pos = d->states[s].mbps.elems[i];
@@ -3228,7 +3234,7 @@ check_matching_with_multibyte_ops (struct dfa *d,
state_num s, size_t idx)
static status_transit_state
transit_state_consume_1char (struct dfa *d, state_num s,
unsigned char const **pp,
- int *match_lens, int *mbclen, position_set * pps)
+ int *match_lens, int *mbclen)
{
size_t i, j;
int k;
@@ -3238,7 +3244,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
/* Calculate the length of the (single/multi byte) character
to which p points. */
- *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
+ *mbclen = (d->mblen_buf[*pp - buf_begin] == 0) ? 1 : d->mblen_buf[*pp -
buf_begin];
/* Calculate the state which can be reached from the state 's' by
consuming '*mbclen' single bytes from the buffer. */
@@ -3248,8 +3254,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
s2 = s1;
rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
}
- /* Copy the positions contained by 's1' to the set 'pps'. */
- copy (&(d->states[s1].elems), pps);
+ /* Copy the positions contained by 's1' to the set 'd->mb_follows'. */
+ copy (&(d->states[s1].elems), d->mb_follows);
/* Check (input) match_lens, and initialize if it is NULL. */
if (match_lens == NULL && d->states[s].mbps.nelem != 0)
@@ -3264,12 +3270,10 @@ transit_state_consume_1char (struct dfa *d, state_num s,
if (work_mbls[i] == *mbclen)
for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
j++)
- insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
+ insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+ d->mb_follows);
}
- if (match_lens == NULL && work_mbls != NULL)
- free (work_mbls);
-
/* FIXME: this return value is always ignored. */
return rs;
}
@@ -3286,7 +3290,6 @@ transit_state (struct dfa *d, state_num s, unsigned char
const **pp)
size_t i, j;
int *match_lens = NULL;
size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
- position_set follows;
unsigned char const *p1 = *pp;
wchar_t wc;
@@ -3317,26 +3320,25 @@ transit_state (struct dfa *d, state_num s, unsigned
char const **pp)
if (rs == TRANSIT_STATE_DONE)
++*pp;
- free (match_lens);
return s1;
}
/* This state has some operators which can match a multibyte character. */
- alloc_position_set (&follows, d->nleaves);
+ d->mb_follows->nelem = 0;
/* 'maxlen' may be longer than the length of a character, because it may
not be a character but a (multi character) collating element.
We enumerate all of the positions which 's' can reach by consuming
'maxlen' bytes. */
- transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
+ transit_state_consume_1char (d, s, pp, match_lens, &mbclen);
- wc = inputwcs[*pp - mbclen - buf_begin];
- s1 = state_index (d, &follows, wchar_context (wc));
+ wc = d->inputwcs[*pp - mbclen - buf_begin];
+ s1 = state_index (d, d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
while (*pp - p1 < maxlen)
{
- transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
+ transit_state_consume_1char (d, s1, pp, NULL, &mbclen);
for (i = 0; i < nelem; i++)
{
@@ -3344,15 +3346,13 @@ transit_state (struct dfa *d, state_num s, unsigned
char const **pp)
for (j = 0;
j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
- &follows);
+ d->mb_follows);
}
- wc = inputwcs[*pp - mbclen - buf_begin];
- s1 = state_index (d, &follows, wchar_context (wc));
+ wc = d->inputwcs[*pp - mbclen - buf_begin];
+ s1 = state_index (d, d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
}
- free (match_lens);
- free (follows.elems);
return s1;
}
@@ -3371,21 +3371,21 @@ prepare_wc_buf (struct dfa *d, const char *begin, const
char *end)
for (i = 0; i < ilim; i++)
{
- size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i,
&mbs);
- mblen_buf[i] = nbytes - (nbytes == 1);
+ size_t nbytes = mbs_to_wchar (d, d->inputwcs + i, begin + i, ilim - i,
&mbs);
+ d->mblen_buf[i] = nbytes - (nbytes == 1);
if (begin[i] == eol)
break;
while (--nbytes != 0)
{
i++;
- mblen_buf[i] = nbytes;
- inputwcs[i] = 0;
+ d->mblen_buf[i] = nbytes;
+ d->inputwcs[i] = 0;
}
}
buf_end = (unsigned char *) (begin + i);
- mblen_buf[i] = 0;
- inputwcs[i] = 0; /* sentinel */
+ d->mblen_buf[i] = 0;
+ d->inputwcs[i] = 0; /* sentinel */
#endif /* MBS_SUPPORT */
}
@@ -3423,10 +3423,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
{
- MALLOC (mblen_buf, end - begin + 2);
- MALLOC (inputwcs, end - begin + 2);
+ static bool mb_alloc = false;
+ REALLOC_IF_NECESSARY (d->mblen_buf, d->nmblen_buf, end - begin + 2);
+ REALLOC_IF_NECESSARY (d->inputwcs, d->ninputwcs, end - begin + 2);
memset (&mbs, 0, sizeof (mbstate_t));
prepare_wc_buf (d, (const char *) p, end);
+ if (!mb_alloc)
+ {
+ MALLOC (d->mb_match_lens, d->nleaves);
+ MALLOC (d->mb_follows, 1);
+ alloc_position_set (d->mb_follows, d->nleaves);
+ mb_alloc = true;
+ }
}
for (;;)
@@ -3453,8 +3461,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (backref)
{
*backref = 1;
- free (mblen_buf);
- free (inputwcs);
*end = saved_end;
return (char *) p;
}
@@ -3487,11 +3493,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
{
if (backref)
*backref = (d->states[s].backref != 0);
- if (d->mb_cur_max > 1)
- {
- free (mblen_buf);
- free (inputwcs);
- }
*end = saved_end;
return (char *) p;
}
@@ -3522,11 +3523,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)
{
- if (d->mb_cur_max > 1)
- {
- free (mblen_buf);
- free (inputwcs);
- }
*end = saved_end;
return NULL;
}
@@ -3578,6 +3574,13 @@ free_mbdata (struct dfa *d)
free (d->mbcsets);
d->mbcsets = NULL;
d->nmbcsets = 0;
+
+ free (d->mblen_buf);
+ free (d->inputwcs);
+ if (d->mb_follows != NULL)
+ free (d->mb_follows->elems);
+ free (d->mb_follows);
+ free (d->mb_match_lens);
}
/* Initialize the components of a dfa that the other routines don't
--
1.9.0
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", (continued)
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", arnold, 2014/04/02
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", Paul Eggert, 2014/04/02
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", Paul Eggert, 2014/04/03
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", Jim Meyering, 2014/04/04
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", Paul Eggert, 2014/04/05
- bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP", Jim Meyering, 2014/04/06
bug#17156: [PATCH 3/5] grep: avoid to re-build a state built previously., Paolo Bonzini, 2014/04/01
bug#17156: [PATCH 5/5] grep: pass a single line to regex, Paolo Bonzini, 2014/04/01
bug#17156: [PATCH 4/5] grep: optimization of DFA by reuse of multi-byte buffers in non-UTF8 locales,
Paolo Bonzini <=
bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paolo Bonzini, 2014/04/01
- bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paul Eggert, 2014/04/01
- bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paolo Bonzini, 2014/04/01
- bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paul Eggert, 2014/04/01
- bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paolo Bonzini, 2014/04/01
- bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paul Eggert, 2014/04/05
- bug#17156: [PATCH 2/5] Revert conversion to shell scripts, Paolo Bonzini, 2014/04/07