From f6112aca41ea8bd2028ea5b00a3a75db14a32eef Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Thu, 1 May 2014 23:09:00 -0700 Subject: [PATCH] awk: simplify dfa.c by having it not include mbsupport.h directly This syncs dfa.c better with 'grep'. * Makefile.am (STDBOOL_H, WCHAR_H, WCTYPE_H): New macros. ($(gawk_OBJECTS)): Depend on them. (stdbool.h, wchar.h, wctype.h): New rules. (CLEANFILES): Add the new files to this list. * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h. * configure.ac: Arrange for config.h to include it instead. (STDBOOL_H, WCHAR_H, WCTYPE_H): New configuration items. * custom.h (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the number of differences between grep's dfa.c and ours. * dfa.c: Include wchar.h and wctype.h unconditionally, as this simplifies the use of dfa.c in grep, and it does no harm in gawk. (setlocale) [!LC_ALL]: (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]: Move to mbsupport.h (needed for consistency in all uses), and fix mbrtowc to return size_t. (struct dfa, dfambcache, mbs_to_wchar) (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): * dfasearch.c (EGexecute): * grep.c (main): * searchutils.c (mbtoupper): Assume MBS_SUPPORT. * dfa.h: Include stdbool.h unconditionally, so that this file is closer to what's in grep. * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h before overriding their definitions. (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper) (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll): #undef before #defining. (btowc): Parenthesize properly. (mbrtowc, wcrtomb): New macros. (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid collisions with standard library. 
--- ChangeLog | 40 +++++++++++++++++++++++ Makefile.am | 11 +++++++ awk.h | 2 -- configure.ac | 9 ++++++ custom.h | 7 ++++ dfa.c | 93 +++++++---------------------------------------------- dfa.h | 4 --- mbsupport.h | 57 +++++++++++++++++++++++++++++--- missing_d/ChangeLog | 4 +++ missing_d/wcmisc.c | 10 ------ regex.h | 18 ++++++++--- regex_internal.h | 2 -- 12 files changed, 149 insertions(+), 108 deletions(-) diff --git a/ChangeLog b/ChangeLog index c1b294b..8ebfaed 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,43 @@ +2014-05-01 Paul Eggert + + awk: simplify dfa.c by having it not include mbsupport.h directly + This syncs dfa.c better with 'grep'. + * Makefile.am (STDBOOL_H, WCHAR_H, WCTYPE_H): New macros. + ($(gawk_OBJECTS)): Depend on them. + (stdbool.h, wchar.h, wctype.h): New rules. + (CLEANFILES): Add the new files to this list. + * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h. + * configure.ac: Arrange for config.h to include it instead. + (STDBOOL_H, WCHAR_H, WCTYPE_H): New configuration items. + * custom.h (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the + number of differences between grep's dfa.c and ours. + * dfa.c: Include wchar.h and wctype.h unconditionally, as + this simplifies the use of dfa.c in grep, and it does no harm + in gawk. + (setlocale) [!LC_ALL]: + (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]: + Move to mbsupport.h (needed for consistency in all uses), + and fix mbrtowc to return size_t. + (struct dfa, dfambcache, mbs_to_wchar) + (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) + (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) + (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): + * dfasearch.c (EGexecute): + * grep.c (main): + * searchutils.c (mbtoupper): + Assume MBS_SUPPORT. + * dfa.h: Include stdbool.h unconditionally, so that this file is + closer to what's in grep. 
+ * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h + before overriding their definitions. + (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper) + (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll): + #undef before #defining. + (btowc): Parenthesize properly. + (mbrtowc, wcrtomb): New macros. + (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid + collisions with standard library. + 2014-04-25 Andrew J. Schorr * io.c (two_way_open): In forked child, reset SIGPIPE to SIG_DFL. diff --git a/Makefile.am b/Makefile.am index 6e5715d..f1a725a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -196,6 +196,17 @@ command.c: command.y $(YACC) -p zz $< sed 's/parse error/syntax error/g' < y.tab.c | awk -f $(srcdir)/bisonfix.awk command > $*.c && rm y.tab.c +# Arrange for some standard headers on platforms that lack them. +STDBOOL_H = @STDBOOL_H@ +WCHAR_H = @WCHAR_H@ +WCTYPE_H = @WCTYPE_H@ +$(gawk_OBJECTS): $(STDBOOL_H) $(WCHAR_H) $(WCTYPE_H) +stdbool.h: + echo '#include "missing_d/gawkbool.h"' >$@ +wchar.h wctype.h: + echo '' >$@ +CLEANFILES += stdbool.h wchar.h wctype.h + # This is for my development & testing. efence: gawk $(CC) $(LDFLAGS) -o gawk $$(ls *.o | grep -v '_p.o$$') $(LIBS) -lefence diff --git a/awk.h b/awk.h index aefdd07..cdba7a8 100644 --- a/awk.h +++ b/awk.h @@ -95,8 +95,6 @@ extern int errno; #include "missing_d/gawkbool.h" #endif -#include "mbsupport.h" /* defines MBS_SUPPORT */ - #if MBS_SUPPORT /* We can handle multibyte strings. 
*/ #include diff --git a/configure.ac b/configure.ac index e7e2d5f..2447c32 100644 --- a/configure.ac +++ b/configure.ac @@ -153,6 +153,14 @@ else AC_CHECK_HEADERS(strings.h) fi +STDBOOL_H= WCHAR_H= WCTYPE_H= +test "$ac_cv_header_stdbool_h" != yes && STDBOOL_H=stdbool.h +test "$ac_cv_header_wchar_h" != yes && WCHAR_H=wchar.h +test "$ac_cv_header_wctype_h" != yes && WCTYPE_H=wctype.h +AC_SUBST([STDBOOL_H]) +AC_SUBST([WCHAR_H]) +AC_SUBST([WCTYPE_H]) + dnl Check cross compiling AM_CONDITIONAL([TEST_CROSS_COMPILE], [test "x$build_alias" != "x$host_alias"]) @@ -390,6 +398,7 @@ AC_C_STRINGIZE AC_CONFIG_HEADERS([config.h:configh.in]) AH_BOTTOM([#include "custom.h"]) +AH_BOTTOM([#include "mbsupport.h"]) dnl Crude but small hack to make plug-ins work on Mac OS X dnl We should really use the libtool value for shrext_cmds, but that diff --git a/custom.h b/custom.h index 36b4aa0..5b19dd4 100644 --- a/custom.h +++ b/custom.h @@ -76,3 +76,10 @@ extern int setenv(const char *name, const char *value, int rewrite); extern int unsetenv(const char *name); #endif + +/* The __pure__ attribute was added in gcc 2.96. */ +#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) +# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__)) +#else +# define _GL_ATTRIBUTE_PURE /* empty */ +#endif diff --git a/dfa.c b/dfa.c index d306d5c..9c41fd1 100644 --- a/dfa.c +++ b/dfa.c @@ -22,6 +22,8 @@ #include +#include "dfa.h" + #include #include #include @@ -38,11 +40,6 @@ #include #endif -/* Gawk doesn't use Gnulib, so don't assume that setlocale is present. */ -#ifndef LC_ALL -# define setlocale(category, locale) NULL -#endif - #define STREQ(a, b) (strcmp (a, b) == 0) /* ISASCIIDIGIT differs from isdigit, as follows: @@ -59,26 +56,11 @@ #include "gettext.h" #define _(str) gettext (str) -#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */ -#if MBS_SUPPORT -/* We can handle multibyte strings. 
*/ -# include -# include -#endif - -#ifdef GAWK -/* The __pure__ attribute was added in gcc 2.96. */ -#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) -# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__)) -#else -# define _GL_ATTRIBUTE_PURE /* empty */ -#endif -#endif /* GAWK */ +#include +#include #include "xalloc.h" -#include "dfa.h" - #ifdef GAWK static int is_blank (int c) @@ -87,14 +69,6 @@ is_blank (int c) } #endif /* GAWK */ -#ifdef LIBC_IS_BORKED -extern int gawk_mb_cur_max; -#undef MB_CUR_MAX -#define MB_CUR_MAX gawk_mb_cur_max -#undef mbrtowc -#define mbrtowc(a, b, c, d) (-1) -#endif - /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit # undef setbit @@ -402,13 +376,11 @@ struct dfa */ int *multibyte_prop; -#if MBS_SUPPORT /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is the leading byte of a multibyte character. Invalid and null bytes are mapped to themselves. */ wint_t mbrtowc_cache[NOTCHAR]; -#endif /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; @@ -488,7 +460,6 @@ static void regexp (void); static void dfambcache (struct dfa *d) { -#if MBS_SUPPORT int i; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { @@ -505,10 +476,8 @@ dfambcache (struct dfa *d) } d->mbrtowc_cache[uc] = wi; } -#endif } -#if MBS_SUPPORT /* Store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, using the mbrtowc_cache in *D and updating the conversion state in *D. On conversion error, @@ -543,7 +512,6 @@ mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d) *pwc = wc; return 1; } -#endif #ifdef DEBUG @@ -737,7 +705,7 @@ static charclass newline; #ifdef __GLIBC__ # define is_valid_unibyte_character(c) 1 #else -# define is_valid_unibyte_character(c) (! 
(MBS_SUPPORT && btowc (c) == WEOF)) +# define is_valid_unibyte_character(c) (btowc (c) != WEOF) #endif /* Return non-zero if C is a "word-constituent" byte; zero otherwise. */ @@ -798,17 +766,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) static bool setbit_wc (wint_t wc, charclass c) { -#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -#else - abort (); - /*NOTREACHED*/ return false; -#endif } /* Set a bit for B and its case variants in the charclass C. @@ -904,7 +867,6 @@ static wchar_t wctok; /* Wide character representation of the current multibyte character. */ -#if MBS_SUPPORT /* Note that characters become unsigned here. */ # define FETCH_WC(c, wc, eoferr) \ do { \ @@ -927,23 +889,6 @@ static wchar_t wctok; /* Wide character representation of the current } \ } while (0) -#else -/* Note that characters become unsigned here. */ -# define FETCH_WC(c, unused, eoferr) \ - do { \ - if (! lexleft) \ - { \ - if ((eoferr) != 0) \ - dfaerror (eoferr); \ - else \ - return lasttok = END; \ - } \ - (c) = to_uchar (*lexptr++); \ - --lexleft; \ - } while (0) - -#endif /* MBS_SUPPORT */ - #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif @@ -1728,7 +1673,6 @@ addtok (token t) } } -#if MBS_SUPPORT /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. @@ -1760,17 +1704,10 @@ addtok_wc (wint_t wc) addtok (CAT); } } -#else -static void -addtok_wc (wint_t wc) -{ -} -#endif static void add_utf8_anychar (void) { -#if MBS_SUPPORT static const charclass utf8_classes[5] = { {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-leading bytes */ {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */ @@ -1815,7 +1752,6 @@ add_utf8_anychar (void) addtok (CAT); addtok (OR); } -#endif } /* The grammar understood by the parser is as follows. 
@@ -1856,7 +1792,7 @@ add_utf8_anychar (void) static void atom (void) { - if (MBS_SUPPORT && tok == WCHAR) + if (tok == WCHAR) { addtok_wc (wctok); @@ -1873,7 +1809,7 @@ atom (void) tok = lex (); } - else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ()) + else if (tok == ANYCHAR && using_utf8 ()) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1887,9 +1823,7 @@ atom (void) } else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD -#if MBS_SUPPORT || tok == ANYCHAR || tok == MBCSET -#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok (tok); @@ -2224,10 +2158,8 @@ epsclosure (position_set * s, struct dfa const *d) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF -#if MBS_SUPPORT && d->tokens[s->elems[i].index] != ANYCHAR && d->tokens[s->elems[i].index] != MBCSET -#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; @@ -2541,9 +2473,7 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. 
*/ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF -#if MBS_SUPPORT || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET -#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -2643,9 +2573,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (MBS_SUPPORT - && (d->tokens[pos.index] == ANYCHAR - || d->tokens[pos.index] == MBCSET)) + else if (d->tokens[pos.index] == ANYCHAR + || d->tokens[pos.index] == MBCSET) /* MB_CUR_MAX > 1 */ { /* ANYCHAR and MBCSET must match with a single character, so we @@ -2820,7 +2749,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ if (d->searchflag - && (!MBS_SUPPORT || (!d->multibyte || !next_isnt_1st_byte))) + && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) for (j = 0; j < d->states[0].elems.nelem; ++j) insert (d->states[0].elems.elems[j], &follows); @@ -3541,7 +3470,7 @@ dfaoptimize (struct dfa *d) { size_t i; - if (!MBS_SUPPORT || !using_utf8 ()) + if (!using_utf8 ()) return; for (i = 0; i < d->tindex; ++i) diff --git a/dfa.h b/dfa.h index 1514236..60aff11 100644 --- a/dfa.h +++ b/dfa.h @@ -19,11 +19,7 @@ /* Written June, 1988 by Mike Haertel */ #include -#ifdef HAVE_STDBOOL_H #include -#else -#include "missing_d/gawkbool.h" -#endif /* HAVE_STDBOOL_H */ #include /* Element of a list of strings, at least one of which is known to diff --git a/mbsupport.h b/mbsupport.h index 9a62486..198a0f3 100644 --- a/mbsupport.h +++ b/mbsupport.h @@ -66,6 +66,15 @@ #endif #if ! MBS_SUPPORT + +/* Include wchar.h and wctype.h so their definitions can be overridden. */ + +# include +# include + +/* Override the definitions of wchar.h and wctype.h to provide a + unibyte substitute that is good enough for Gawk. 
*/ + # undef MB_CUR_MAX # define MB_CUR_MAX 1 @@ -78,15 +87,24 @@ #define wctype_t int #define wint_t int #define mbstate_t int +#undef WEOF #define WEOF EOF +#undef towupper #define towupper toupper +#undef towlower #define towlower tolower #ifndef __DJGPP__ -#define btowc(x) ((int)x) +#undef btowc +#define btowc(x) ((int) (x)) #endif +#undef iswalnum #define iswalnum isalnum +#undef iswalpha #define iswalpha isalpha +#undef iswupper #define iswupper isupper +#undef iswlower +#define iswlower islower #if defined(ZOS_USS) #undef towupper #undef towlower @@ -94,12 +112,43 @@ #undef iswalnum #undef iswalpha #undef iswupper -#undef wctype -#undef iswctype -#undef wcscoll #endif +#undef mbrtowc +#define mbrtowc(pwc, s, n, ps) ((size_t) -1) +#undef wcrtomb +#define wcrtomb(s, wc, ps) ((size_t) -1) + +#undef wctype +#define wctype gawk_wctype extern wctype_t wctype(const char *name); +#undef iswctype +#define iswctype gawk_iswctype extern int iswctype(wint_t wc, wctype_t desc); +#undef wcscoll +#define wcscoll gawk_wcscoll extern int wcscoll(const wchar_t *ws1, const wchar_t *ws2); #endif + +#ifdef LIBC_IS_BORKED +# include +extern int gawk_mb_cur_max; +# undef MB_CUR_MAX +# undef mbrtowc +# define MB_CUR_MAX gawk_mb_cur_max +# define mbrtowc(a, b, c, d) ((size_t) -1) +#endif + +#include +#ifndef LC_ALL +# define setlocale(category, locale) NULL +#endif + +#include +#ifndef static_assert +# define static_assert(cond, diagnostic) \ + extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] +#endif + +/* Make sure RE_DUP_MAX gets the correct value. */ +#define _REGEX_INCLUDE_LIMITS_H diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog index 70fbde6..4686c74 100644 --- a/missing_d/ChangeLog +++ b/missing_d/ChangeLog @@ -1,3 +1,7 @@ +2014-05-01 Paul Eggert + + * wcmisc.c: Remove now-unnecessary ifdefs. + 2014-04-08 Arnold D. Robbins * 4.1.1: Release tar ball made. 
diff --git a/missing_d/wcmisc.c b/missing_d/wcmisc.c index d2b7aa0..89e24c9 100644 --- a/missing_d/wcmisc.c +++ b/missing_d/wcmisc.c @@ -16,7 +16,6 @@ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ -#if !defined(HAVE_WCTYPE) || !defined(HAVE_ISWCTYPE) static const char *classes[] = { "", "alnum", @@ -33,16 +32,12 @@ static const char *classes[] = { "xdigit", NULL }; -#endif -#ifndef HAVE_ISWCTYPE static int is_blank (int c) { return (c == ' ' || c == '\t'); } -#endif -#ifndef HAVE_WCTYPE wctype_t wctype(const char *name) { int i; @@ -53,9 +48,7 @@ wctype_t wctype(const char *name) return 0; } -#endif -#ifndef HAVE_ISWCTYPE int iswctype(wint_t wc, wctype_t desc) { int j = sizeof(classes) / sizeof(classes[0]); @@ -79,9 +72,7 @@ int iswctype(wint_t wc, wctype_t desc) default: return 0; } } -#endif -#ifndef HAVE_WCSCOLL int wcscoll(const wchar_t *ws1, const wchar_t *ws2) { size_t i; @@ -95,6 +86,5 @@ int wcscoll(const wchar_t *ws1, const wchar_t *ws2) return (ws1[i] - ws2[i]); } -#endif /*wcmisc.c*/ diff --git a/regex.h b/regex.h index 5660296..400b407 100644 --- a/regex.h +++ b/regex.h @@ -264,14 +264,24 @@ extern reg_syntax_t re_syntax_options; | RE_NO_BK_PARENS | RE_NO_BK_REFS \ | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) /* [[[end syntaxes]]] */ - -/* Maximum number of duplicates an interval can allow. Some systems - (erroneously) define this in other header files, but we want our + +/* Maximum number of duplicates an interval can allow. POSIX-conforming + systems might define this in <limits.h>, but we want our value, so remove any previous define. */ +# ifdef _REGEX_INCLUDE_LIMITS_H +# include <limits.h> +# endif # ifdef RE_DUP_MAX # undef RE_DUP_MAX # endif -/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */ + +/* RE_DUP_MAX is 2**15 - 1 because an earlier implementation stored + the counter as a 2-byte signed integer. 
This is no longer true, so + RE_DUP_MAX could be increased to (INT_MAX / 10 - 1), or to + ((SIZE_MAX - 9) / 10) if _REGEX_LARGE_OFFSETS is defined. + However, there would be a huge performance problem if someone + actually used a pattern like a\{214748363\}, so RE_DUP_MAX retains + its historical value. */ # define RE_DUP_MAX (0x7fff) #endif diff --git a/regex_internal.h b/regex_internal.h index c8981a0..758cf47 100644 --- a/regex_internal.h +++ b/regex_internal.h @@ -26,8 +26,6 @@ #include #include -#include "mbsupport.h" /* gawk */ - #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC # include #endif -- 1.9.0