2004-01-14 Arnold D. Robbins Reinstated ability of dfa matcher to match embedded newlines. This is needed for GNU awk (gawk). * dfa.h (struct dfa): New member `newlines'. (dfaexec): Revised prototype and comment. * dfa.c (build_state, build_state_zero): Handle `newlines' member. (dfafree, realloc_trans_if_necessary): Handle `newlines' member. (SKIP_REMAINS_MB_IF_INITIAL_STATE): Add various casts, return NULL. (dfaexec): Protoype and matching loop revamped per earlier version of dfa.c (dfainit): Zero out `realtrans', `fails', `newlines' and `success' members of *d. * search.c (EGexecute): Revise call of `dfaexec' and pointer management. Index: src/dfa.c =================================================================== RCS file: /cvsroot/grep/grep/src/dfa.c,v retrieving revision 1.31 diff -u -r1.31 dfa.c --- src/dfa.c 16 Dec 2004 08:19:29 -0000 1.31 +++ src/dfa.c 16 Dec 2004 09:03:24 -0000 @@ -2334,6 +2334,7 @@ d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -2341,7 +2342,9 @@ } } - /* Newline is a sentinel. */ + /* Keep the newline transition in a special place so we can use it as + a sentinel. */ + d->newlines[s] = trans[eolbyte]; trans[eolbyte] = -1; if (ACCEPTING(s, *d)) @@ -2359,6 +2362,7 @@ d->trans = d->realtrans + 1; CALLOC(d->fails, int *, d->tralloc); MALLOC(d->success, int, d->tralloc); + MALLOC(d->newlines, int, d->tralloc); build_state(0, d); } @@ -2377,13 +2381,13 @@ { \ while (inputwcs[p - buf_begin] == 0 \ && mblen_buf[p - buf_begin] > 0 \ - && p < buf_end) \ + && (unsigned char const *)p < buf_end) \ ++p; \ - if (p >= end) \ + if ((char *)p >= end) \ { \ free(mblen_buf); \ free(inputwcs); \ - return (size_t) -1; \ + return NULL; \ } \ } @@ -2402,6 +2406,7 @@ d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -2790,19 +2795,23 @@ /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, - and the shortest possible version thereof. Return the offset of the first - character after the match, or (size_t) -1 if none is found. BEGIN points to - the beginning of the buffer, and SIZE is the size of the buffer. If SIZE - is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place + and the shortest possible version thereof. Return a pointer to the first + character after the match, or NULL if none is found. Begin points to + the beginning of the buffer, and end points to the first character after + its end. We store a newline in *end to act as a sentinel, so end had + better point somewhere valid. Newline is a flag indicating whether to + allow newlines to be in the matching string. If count is non- + NULL it points to a place we're supposed to increment every time we + see a newline. Finally, if backref is non-NULL it points to a place where we're supposed to store a 1 if backreferencing happened and the match needs to be verified by a backtracking matcher. Otherwise we store a 0 in *backref. */ -size_t -dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) +char * +dfaexec (struct dfa *d, char const *begin, char *end, + int newline, int *count, int *backref) { - register int s; /* Current state. */ + register int s, s1, tmp; /* Current state. */ register unsigned char const *p; /* Current input character. */ - register unsigned char const *end; /* One past the last input character. */ register int **trans, *t; /* Copy of d->trans so it can be optimized into a register. */ register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ @@ -2822,10 +2831,10 @@ if (! d->tralloc) build_state_zero(d); - s = 0; + s = s1 = 0; p = (unsigned char const *) begin; - end = p + size; trans = d->trans; + *end = eol; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) @@ -2835,17 +2844,16 @@ buf_end = end; /* initialize mblen_buf, and inputwcs. */ - MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); - MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); + MALLOC(mblen_buf, unsigned char, end - begin + 2); + MALLOC(inputwcs, wchar_t, end - begin + 2); memset(&mbs, 0, sizeof(mbstate_t)); remain_bytes = 0; - for (i = 0; i < end - (unsigned char const *)begin + 1; i++) + for (i = 0; i < end - begin + 1; i++) { if (remain_bytes == 0) { remain_bytes - = mbrtowc(inputwcs + i, begin + i, - end - (unsigned char const *)begin - i + 1, &mbs); + = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); if (remain_bytes <= 1) { remain_bytes = 0; @@ -2876,6 +2884,9 @@ if (MB_CUR_MAX > 1) while ((t = trans[s])) { + if ((char *) p > end) + break; + s1 = s; if (d->states[s].mbps.nelem != 0) { /* Can match with a multibyte character (and multi character @@ -2886,7 +2897,7 @@ nextp = p; s = transit_state(d, s, &nextp); - p = nextp; + p = (unsigned char *)nextp; /* Trans table might be updated. */ trans = d->trans; @@ -2899,25 +2910,16 @@ } else #endif /* MBS_SUPPORT */ - while ((t = trans[s])) - s = t[*p++]; - - if (s < 0) - { - if (p == end) - { -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - free(mblen_buf); - free(inputwcs); - } -#endif /* MBS_SUPPORT */ - return (size_t) -1; - } - s = 0; + while ((t = trans[s]) != 0) { /* hand-optimized loop */ + s1 = t[*p++]; + if ((t = trans[s1]) == 0) { + tmp = s ; s = s1 ; s1 = tmp ; /* swap */ + break; } - else if ((t = d->fails[s])) + s = t[*p++]; + } + + if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) { if (d->success[s] & sbit[*p]) { @@ -2930,37 +2932,58 @@ free(inputwcs); } #endif /* MBS_SUPPORT */ - return (char const *) p - begin; + return (char *) p; } + s1 = s; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { - SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); - if (d->states[s].mbps.nelem != 0) - { - /* Can match with a multibyte character (and multi - character collating element). */ unsigned char const *nextp; nextp = p; s = transit_state(d, s, &nextp); - p = nextp; + p = (unsigned char *)nextp; /* Trans table might be updated. */ trans = d->trans; } else - s = t[*p++]; +#endif /* MBS_SUPPORT */ + s = d->fails[s][*p++]; + continue; + } + + /* If the previous character was a newline, count it. */ + if (count && (char *) p <= end && p[-1] == eol) + ++*count; + + /* Check if we've run off the end of the buffer. */ + if ((char *) p > end) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); } - else #endif /* MBS_SUPPORT */ - s = t[*p++]; + return NULL; } - else + + if (s >= 0) { build_state(s, d); trans = d->trans; + continue; } + + if (p[-1] == eol && newline) + { + s = d->newlines[s1]; + continue; + } + + s = 0; } } @@ -2991,6 +3014,10 @@ d->tralloc = 0; d->musts = 0; + d->realtrans = 0; + d->fails = 0; + d->newlines = 0; + d->success = 0; } /* Parse and analyze a single string of the given length. */ @@ -3087,6 +3114,7 @@ free((ptr_t) d->fails[i]); if (d->realtrans) free((ptr_t) d->realtrans); if (d->fails) free((ptr_t) d->fails); + if (d->newlines) free((ptr_t) d->newlines); if (d->success) free((ptr_t) d->success); for (dm = d->musts; dm; dm = ndm) { Index: src/dfa.h =================================================================== RCS file: /cvsroot/grep/grep/src/dfa.h,v retrieving revision 1.8 diff -u -r1.8 dfa.h --- src/dfa.h 16 Dec 2004 08:19:29 -0000 1.8 +++ src/dfa.h 16 Dec 2004 09:03:24 -0000 @@ -1,5 +1,5 @@ /* dfa.h - declarations for GNU deterministic regexp compiler - Copyright (C) 1988, 1998 Free Software Foundation, Inc. + Copyright (C) 1988, 1998, 2002 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -363,6 +363,13 @@ on a state that potentially could do so. */ int *success; /* Table of acceptance conditions used in dfaexec and computed in build_state. */ + int *newlines; /* Transitions on newlines. The entry for a + newline in any transition table is always + -1 so we can count lines without wasting + too many cycles. The transition for a + newline is stored separately and handled + as a special case. Newline is also used + as a sentinel at the end of the buffer. */ struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ @@ -397,13 +404,18 @@ extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int)); /* Execute the given struct dfa on the buffer of characters. The - last byte of the buffer must equal the end-of-line byte. - The final argument points to a flag that will + first char * points to the beginning, and the second points to the + first character after the end of the buffer, which must be a writable + place so a sentinel end-of-buffer marker can be stored there. The + second-to-last argument is a flag telling whether to allow newlines to + be part of a string matching the regexp. The next-to-last argument, + if non-NULL, points to a place to increment every time we see a + newline. The final argument, if non-NULL, points to a flag that will be set if further examination by a backtracking matcher is needed in order to verify backreferencing; otherwise the flag will be cleared. - Returns (size_t) -1 if no match is found, or the offset of the first + Returns NULL if no match is found, or a pointer to the first character after the first & shortest matching string in the buffer. */ -extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *)); +extern char *dfaexec PARAMS ((struct dfa *, char const *, char *, int, int *, int *)); /* Free the storage held by the components of a struct dfa. */ extern void dfafree PARAMS ((struct dfa *)); Index: src/search.c =================================================================== RCS file: /cvsroot/grep/grep/src/search.c,v retrieving revision 1.31 diff -u -r1.31 search.c --- src/search.c 16 Dec 2004 08:19:29 -0000 1.31 +++ src/search.c 16 Dec 2004 09:03:24 -0000 @@ -348,7 +348,8 @@ static size_t EGexecute (char const *buf, size_t size, size_t *match_size, int exact) { - register char const *buflim, *beg, *end; + register char const *beg; + char *buflim, *end, save; char eol = eolbyte; int backref, start, len; struct kwsmatch kwsm; @@ -368,9 +369,9 @@ } #endif /* MBS_SUPPORT */ - buflim = buf + size; + buflim = (char *)buf + size; - for (beg = end = buf; end < buflim; beg = end) + for (beg = end = (char *)buf; end < buflim; beg = end + 1) { if (!exact) { @@ -394,6 +395,8 @@ /* Narrow down to the line containing the candidate, and run it through DFA. */ end = memchr(beg, eol, buflim - beg); + if (!end) + end = buflim; end++; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) @@ -401,20 +404,28 @@ #endif while (beg > buf && beg[-1] != eol) --beg; + save = *end; if (kwsm.index < kwset_exact_matches) goto success; - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) - continue; + if (dfaexec(&dfa, beg, end, 0, (int *) 0, &backref) == NULL) + { + *end = save; + continue; + } + *end = save; } else { /* No good fixed strings; start with DFA. */ - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); - if (offset == (size_t) -1) + save = *buflim; + beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref); + *buflim = save; + if (!beg) break; /* Narrow down to the line we've found. */ - beg += offset; end = memchr (beg, eol, buflim - beg); + if (!end) + end = buflim; end++; while (beg > buf && beg[-1] != eol) --beg; @@ -424,7 +435,7 @@ goto success; } else - end = beg + size; + end = (char *)beg + size; /* If we've made it to this point, this means DFA has seen a probable match, and we need to run it through Regex. */