From 7e2b51d00133ab8a0dbcd21b5e0f39a6984f858f Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Thu, 3 Apr 2014 18:04:52 -0700 Subject: [PATCH] awk: simplify dfa.c by having it not include mbsupport.h directly This syncs dfa.c better with 'grep'. * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h. * custom.h: Include mbsupport.h here instead. (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the number of differences between grep's dfa.c and ours. * dfa.c: Include wchar.h and wctype.h unconditionally, as this simplifies the use of dfa.c in grep, and it does no harm in gawk. (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]: Move to mbsupport.h (needed for consistency in all uses), and fix mbrtowc to return size_t. (setlocale, static_assert): Likewise. (struct dfa, dfambcache, mbs_to_wchar) (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): * dfasearch.c (EGexecute): * grep.c (main): * searchutils.c (mbtoupper): Assume MBS_SUPPORT. * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h before overriding their definitions. (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper) (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll): #undef before #defining. (btowc): Parenthesize properly. (mbrtowc, wcrtomb): New macros. (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid collisions with standard library. * missing_d/wcmisc.c: Remove now-unnecessary ifdefs. 
--- ChangeLog | 32 +++++++++++++++ awk.h | 2 - custom.h | 9 +++++ dfa.c | 111 +++++++--------------------------------------------- mbsupport.h | 54 +++++++++++++++++++++++-- missing_d/ChangeLog | 4 ++ missing_d/wcmisc.c | 10 ----- regex_internal.h | 2 - 8 files changed, 110 insertions(+), 114 deletions(-) diff --git a/ChangeLog b/ChangeLog index a0efd89..36fb0f4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +2014-04-03 Paul Eggert + + awk: simplify dfa.c by having it not include mbsupport.h directly + This syncs dfa.c better with 'grep'. + * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h. + * custom.h: Include mbsupport.h here instead. + (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the + number of differences between grep's dfa.c and ours. + * dfa.c: Include wchar.h and wctype.h unconditionally, as + this simplifies the use of dfa.c in grep, and it does no harm + in gawk. + (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]: + Move to mbsupport.h (needed for consistency in all uses), + and fix mbrtowc to return size_t. + (struct dfa, dfambcache, mbs_to_wchar) + (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) + (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) + (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): + * dfasearch.c (EGexecute): + * grep.c (main): + * searchutils.c (mbtoupper): + Assume MBS_SUPPORT. + * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h + before overriding their definitions. + (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper) + (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll): + #undef before #defining. + (btowc): Parenthesize properly. + (mbrtowc, wcrtomb): New macros. + (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid + collisions with standard library. + 2014-04-03 Arnold D. 
Robbins * regcomp.c (parse_bracket_exp): Move a call to `re_free' inside diff --git a/awk.h b/awk.h index aefdd07..cdba7a8 100644 --- a/awk.h +++ b/awk.h @@ -95,8 +95,6 @@ extern int errno; #include "missing_d/gawkbool.h" #endif -#include "mbsupport.h" /* defines MBS_SUPPORT */ - #if MBS_SUPPORT /* We can handle multibyte strings. */ #include diff --git a/custom.h b/custom.h index 36b4aa0..bade4cf 100644 --- a/custom.h +++ b/custom.h @@ -76,3 +76,12 @@ extern int setenv(const char *name, const char *value, int rewrite); extern int unsetenv(const char *name); #endif + +/* The __pure__ attribute was added in gcc 2.96. */ +#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) +# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__)) +#else +# define _GL_ATTRIBUTE_PURE /* empty */ +#endif + +#include "mbsupport.h" diff --git a/dfa.c b/dfa.c index 378305d..ee6edd8 100644 --- a/dfa.c +++ b/dfa.c @@ -43,16 +43,6 @@ #include "missing_d/gawkbool.h" #endif /* HAVE_STDBOOL_H */ -/* Gawk doesn't use Gnulib, so don't assume that setlocale and - static_assert are present. */ -#ifndef LC_ALL -# define setlocale(category, locale) NULL -#endif -#ifndef static_assert -# define static_assert(cond, diagnostic) \ - extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] -#endif - #define STREQ(a, b) (strcmp (a, b) == 0) /* ISASCIIDIGIT differs from isdigit, as follows: @@ -69,21 +59,8 @@ #include "gettext.h" #define _(str) gettext (str) -#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */ -#if MBS_SUPPORT -/* We can handle multibyte strings. */ -# include -# include -#endif - -#ifdef GAWK -/* The __pure__ attribute was added in gcc 2.96. 
*/ -#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) -# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__)) -#else -# define _GL_ATTRIBUTE_PURE /* empty */ -#endif -#endif /* GAWK */ +#include +#include #if HAVE_LANGINFO_CODESET # include @@ -101,14 +78,6 @@ is_blank (int c) } #endif /* GAWK */ -#ifdef LIBC_IS_BORKED -extern int gawk_mb_cur_max; -#undef MB_CUR_MAX -#define MB_CUR_MAX gawk_mb_cur_max -#undef mbrtowc -#define mbrtowc(a, b, c, d) (-1) -#endif - /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit # undef setbit @@ -412,13 +381,11 @@ struct dfa size_t nmultibyte_prop; int *multibyte_prop; -#if MBS_SUPPORT /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is the leading byte of a multibyte character. Invalid and null bytes are mapped to themselves. */ wint_t mbrtowc_cache[NOTCHAR]; -#endif /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; @@ -525,7 +492,6 @@ static void regexp (void); static void dfambcache (struct dfa *d) { -#if MBS_SUPPORT int i; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { @@ -542,10 +508,8 @@ dfambcache (struct dfa *d) } d->mbrtowc_cache[uc] = wi; } -#endif } -#if MBS_SUPPORT /* Given the dfa D, store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, updating the conversion state in *MBS. On conversion error, convert just a @@ -579,7 +543,6 @@ mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n, *pwc = wc; return 1; } -#endif #ifdef DEBUG @@ -749,7 +712,7 @@ static charclass newline; #ifdef __GLIBC__ # define is_valid_unibyte_character(c) 1 #else -# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF)) +# define is_valid_unibyte_character(c) (btowc (c) != WEOF) #endif /* Return non-zero if C is a "word-constituent" byte; zero otherwise. 
*/ @@ -810,17 +773,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) static bool setbit_wc (wint_t wc, charclass c) { -#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -#else - abort (); - /*NOTREACHED*/ return false; -#endif } /* Set a bit for B and its case variants in the charclass C. @@ -845,7 +803,7 @@ using_utf8 (void) static int utf8 = -1; if (utf8 == -1) { -#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT +#if defined HAVE_LANGINFO_CODESET utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8")); #else utf8 = 0; @@ -938,7 +896,6 @@ static unsigned char const *buf_begin; /* reference to begin in dfaexec. */ static unsigned char const *buf_end; /* reference to end in dfaexec. */ -#if MBS_SUPPORT /* Note that characters become unsigned here. */ # define FETCH_WC(c, wc, eoferr) \ do { \ @@ -961,23 +918,6 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ } \ } while (0) -#else -/* Note that characters become unsigned here. */ -# define FETCH_WC(c, unused, eoferr) \ - do { \ - if (! lexleft) \ - { \ - if ((eoferr) != 0) \ - dfaerror (eoferr); \ - else \ - return lasttok = END; \ - } \ - (c) = to_uchar (*lexptr++); \ - --lexleft; \ - } while (0) - -#endif /* MBS_SUPPORT */ - #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif @@ -1761,7 +1701,6 @@ addtok (token t) } } -#if MBS_SUPPORT /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. @@ -1793,17 +1732,10 @@ addtok_wc (wint_t wc) addtok (CAT); } } -#else -static void -addtok_wc (wint_t wc) -{ -} -#endif static void add_utf8_anychar (void) { -#if MBS_SUPPORT static const charclass utf8_classes[5] = { {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-leading bytes */ {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */ @@ -1848,7 +1780,6 @@ add_utf8_anychar (void) addtok (CAT); addtok (OR); } -#endif } /* The grammar understood by the parser is as follows. 
@@ -1889,7 +1820,7 @@ add_utf8_anychar (void) static void atom (void) { - if (MBS_SUPPORT && tok == WCHAR) + if (tok == WCHAR) { addtok_wc (wctok); @@ -1906,7 +1837,7 @@ atom (void) tok = lex (); } - else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ()) + else if (tok == ANYCHAR && using_utf8 ()) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1920,9 +1851,7 @@ atom (void) } else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD -#if MBS_SUPPORT || tok == ANYCHAR || tok == MBCSET -#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok (tok); @@ -2205,11 +2134,9 @@ state_index (struct dfa *d, position_set const *s, int context) d->states[i].backref = 0; d->states[i].constraint = 0; d->states[i].first_end = 0; - if (MBS_SUPPORT) - { - d->states[i].mbps.nelem = 0; - d->states[i].mbps.elems = NULL; - } + d->states[i].mbps.nelem = 0; + d->states[i].mbps.elems = NULL; + for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { @@ -2247,10 +2174,8 @@ epsclosure (position_set * s, struct dfa const *d) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF -#if MBS_SUPPORT && d->tokens[s->elems[i].index] != ANYCHAR && d->tokens[s->elems[i].index] != MBCSET -#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; @@ -2567,9 +2492,7 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. 
*/ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF -#if MBS_SUPPORT || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET -#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -2679,9 +2602,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (MBS_SUPPORT - && (d->tokens[pos.index] == ANYCHAR - || d->tokens[pos.index] == MBCSET)) + else if (d->tokens[pos.index] == ANYCHAR + || d->tokens[pos.index] == MBCSET) /* MB_CUR_MAX > 1 */ { /* ANYCHAR and MBCSET must match with a single character, so we @@ -2855,7 +2777,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ if (d->searchflag - && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte))) + && (d->mb_cur_max == 1 || !next_isnt_1st_byte)) for (j = 0; j < d->states[0].elems.nelem; ++j) insert (d->states[0].elems.elems[j], &follows); @@ -3407,7 +3329,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) static void prepare_wc_buf (struct dfa *d, const char *begin, const char *end) { -#if MBS_SUPPORT unsigned char eol = eolbyte; size_t i; size_t ilim = end - begin + 1; @@ -3431,7 +3352,6 @@ prepare_wc_buf (struct dfa *d, const char *begin, const char *end) buf_end = (unsigned char *) (begin + i); mblen_buf[i] = 0; inputwcs[i] = 0; /* sentinel */ -#endif /* MBS_SUPPORT */ } /* Search through a buffer looking for a match to the given struct dfa. 
@@ -3653,7 +3573,7 @@ dfaoptimize (struct dfa *d) { size_t i; - if (!MBS_SUPPORT || !using_utf8 ()) + if (!using_utf8 ()) return; for (i = 0; i < d->tindex; ++i) @@ -3703,8 +3623,7 @@ dfafree (struct dfa *d) for (i = 0; i < d->sindex; ++i) { free (d->states[i].elems.elems); - if (MBS_SUPPORT) - free (d->states[i].mbps.elems); + free (d->states[i].mbps.elems); } free (d->states); for (i = 0; i < d->tindex; ++i) @@ -4179,7 +4098,7 @@ dfamust (struct dfa *d) /* not on *my* shift */ goto done; } - else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET) + else if (t >= CSET || t == ANYCHAR || t == MBCSET) { /* easy enough */ resetmust (mp); diff --git a/mbsupport.h b/mbsupport.h index 9a62486..ab33e91 100644 --- a/mbsupport.h +++ b/mbsupport.h @@ -66,6 +66,15 @@ #endif #if ! MBS_SUPPORT + +/* Include wchar.h and wctype.h so their definitions can be overridden. */ + +# include +# include + +/* Override the definitions of wchar.h and wctype.h to provide a + unibyte substitute that is good enough for Gawk. 
*/ + # undef MB_CUR_MAX # define MB_CUR_MAX 1 @@ -78,15 +87,24 @@ #define wctype_t int #define wint_t int #define mbstate_t int +#undef WEOF #define WEOF EOF +#undef towupper #define towupper toupper +#undef towlower #define towlower tolower #ifndef __DJGPP__ -#define btowc(x) ((int)x) +#undef btowc +#define btowc(x) ((int) (x)) #endif +#undef iswalnum #define iswalnum isalnum +#undef iswalpha #define iswalpha isalpha +#undef iswupper #define iswupper isupper +#undef iswlower +#define iswlower islower #if defined(ZOS_USS) #undef towupper #undef towlower @@ -94,12 +112,40 @@ #undef iswalnum #undef iswalpha #undef iswupper -#undef wctype -#undef iswctype -#undef wcscoll #endif +#undef mbrtowc +#define mbrtowc(pwc, s, n, ps) ((size_t) -1) +#undef wcrtomb +#define wcrtomb(s, wc, ps) ((size_t) -1) + +#undef wctype +#define wctype gawk_wctype extern wctype_t wctype(const char *name); +#undef iswctype +#define iswctype gawk_iswctype extern int iswctype(wint_t wc, wctype_t desc); +#undef wcscoll +#define wcscoll gawk_wcscoll extern int wcscoll(const wchar_t *ws1, const wchar_t *ws2); #endif + +#ifdef LIBC_IS_BORKED +# include +extern int gawk_mb_cur_max; +# undef MB_CUR_MAX +# undef mbrtowc +# define MB_CUR_MAX gawk_mb_cur_max +# define mbrtowc(a, b, c, d) ((size_t) -1) +#endif + +#include +#ifndef LC_ALL +# define setlocale(category, locale) NULL +#endif + +#include +#ifndef static_assert +# define static_assert(cond, diagnostic) \ + extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] +#endif diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog index f94c070..7fa6541 100644 --- a/missing_d/ChangeLog +++ b/missing_d/ChangeLog @@ -1,3 +1,7 @@ +2014-04-03 Paul Eggert + + * wcmisc.c: Remove now-unnecessary ifdefs. + 2013-05-09 Arnold D. Robbins * 4.1.0: Release tar ball made. 
diff --git a/missing_d/wcmisc.c b/missing_d/wcmisc.c index d2b7aa0..89e24c9 100644 --- a/missing_d/wcmisc.c +++ b/missing_d/wcmisc.c @@ -16,7 +16,6 @@ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ -#if !defined(HAVE_WCTYPE) || !defined(HAVE_ISWCTYPE) static const char *classes[] = { "", "alnum", @@ -33,16 +32,12 @@ static const char *classes[] = { "xdigit", NULL }; -#endif -#ifndef HAVE_ISWCTYPE static int is_blank (int c) { return (c == ' ' || c == '\t'); } -#endif -#ifndef HAVE_WCTYPE wctype_t wctype(const char *name) { int i; @@ -53,9 +48,7 @@ wctype_t wctype(const char *name) return 0; } -#endif -#ifndef HAVE_ISWCTYPE int iswctype(wint_t wc, wctype_t desc) { int j = sizeof(classes) / sizeof(classes[0]); @@ -79,9 +72,7 @@ int iswctype(wint_t wc, wctype_t desc) default: return 0; } } -#endif -#ifndef HAVE_WCSCOLL int wcscoll(const wchar_t *ws1, const wchar_t *ws2) { size_t i; @@ -95,6 +86,5 @@ int wcscoll(const wchar_t *ws1, const wchar_t *ws2) return (ws1[i] - ws2[i]); } -#endif /*wcmisc.c*/ diff --git a/regex_internal.h b/regex_internal.h index c8981a0..758cf47 100644 --- a/regex_internal.h +++ b/regex_internal.h @@ -26,8 +26,6 @@ #include #include -#include "mbsupport.h" /* gawk */ - #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC # include #endif -- 1.9.0