From 7e2b51d00133ab8a0dbcd21b5e0f39a6984f858f Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Thu, 3 Apr 2014 18:04:52 -0700
Subject: [PATCH] awk: simplify dfa.c by having it not include mbsupport.h
directly
This syncs dfa.c better with 'grep'.
* awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h.
* custom.h: Include mbsupport.h here instead.
(_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the
number of differences between grep's dfa.c and ours.
* dfa.c: Include wchar.h and wctype.h unconditionally, as
this simplifies the use of dfa.c in grep, and it does no harm
in gawk.
(gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]:
Move to mbsupport.h (needed for consistency in all uses),
and fix mbrtowc to return size_t.
(setlocale, static_assert): Likewise.
(struct dfa, dfambcache, mbs_to_wchar)
(is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC)
(addtok_wc, add_utf8_anychar, atom, state_index, epsclosure)
(dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust):
* dfasearch.c (EGexecute):
* grep.c (main):
* searchutils.c (mbtoupper):
Assume MBS_SUPPORT.
* mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h
before overriding their definitions.
(WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper)
(iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll):
#undef before #defining.
(btowc): Parenthesize properly.
(mbrtowc, wcrtomb): New macros.
(wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid
collisions with standard library.
* missing_d/wcmisc.c: Remove now-unnecessary ifdefs.
---
ChangeLog | 32 +++++++++++++++
awk.h | 2 -
custom.h | 9 +++++
dfa.c | 111 +++++++---------------------------------------------
mbsupport.h | 54 +++++++++++++++++++++++--
missing_d/ChangeLog | 4 ++
missing_d/wcmisc.c | 10 -----
regex_internal.h | 2 -
8 files changed, 110 insertions(+), 114 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index a0efd89..36fb0f4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2014-04-03 Paul Eggert
+
+ awk: simplify dfa.c by having it not include mbsupport.h directly
+ This syncs dfa.c better with 'grep'.
+ * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h.
+ * custom.h: Include mbsupport.h here instead.
+ (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the
+ number of differences between grep's dfa.c and ours.
+ * dfa.c: Include wchar.h and wctype.h unconditionally, as
+ this simplifies the use of dfa.c in grep, and it does no harm
+ in gawk.
+ (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]:
+ Move to mbsupport.h (needed for consistency in all uses),
+ and fix mbrtowc to return size_t.
+ (struct dfa, dfambcache, mbs_to_wchar)
+ (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC)
+ (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure)
+ (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust):
+ * dfasearch.c (EGexecute):
+ * grep.c (main):
+ * searchutils.c (mbtoupper):
+ Assume MBS_SUPPORT.
+ * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h
+ before overriding their definitions.
+ (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper)
+ (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll):
+ #undef before #defining.
+ (btowc): Parenthesize properly.
+ (mbrtowc, wcrtomb): New macros.
+ (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid
+ collisions with standard library.
+
2014-04-03 Arnold D. Robbins
* regcomp.c (parse_bracket_exp): Move a call to `re_free' inside
diff --git a/awk.h b/awk.h
index aefdd07..cdba7a8 100644
--- a/awk.h
+++ b/awk.h
@@ -95,8 +95,6 @@ extern int errno;
#include "missing_d/gawkbool.h"
#endif
-#include "mbsupport.h" /* defines MBS_SUPPORT */
-
#if MBS_SUPPORT
/* We can handle multibyte strings. */
#include <wchar.h>
diff --git a/custom.h b/custom.h
index 36b4aa0..bade4cf 100644
--- a/custom.h
+++ b/custom.h
@@ -76,3 +76,12 @@
extern int setenv(const char *name, const char *value, int rewrite);
extern int unsetenv(const char *name);
#endif
+
+/* The __pure__ attribute was added in gcc 2.96. */
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__))
+#else
+# define _GL_ATTRIBUTE_PURE /* empty */
+#endif
+
+#include "mbsupport.h"
diff --git a/dfa.c b/dfa.c
index 378305d..ee6edd8 100644
--- a/dfa.c
+++ b/dfa.c
@@ -43,16 +43,6 @@
#include "missing_d/gawkbool.h"
#endif /* HAVE_STDBOOL_H */
-/* Gawk doesn't use Gnulib, so don't assume that setlocale and
- static_assert are present. */
-#ifndef LC_ALL
-# define setlocale(category, locale) NULL
-#endif
-#ifndef static_assert
-# define static_assert(cond, diagnostic) \
- extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
-#endif
-
#define STREQ(a, b) (strcmp (a, b) == 0)
/* ISASCIIDIGIT differs from isdigit, as follows:
@@ -69,21 +59,8 @@
#include "gettext.h"
#define _(str) gettext (str)
-#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */
-#if MBS_SUPPORT
-/* We can handle multibyte strings. */
-# include <wchar.h>
-# include <wctype.h>
-#endif
-
-#ifdef GAWK
-/* The __pure__ attribute was added in gcc 2.96. */
-#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
-# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__))
-#else
-# define _GL_ATTRIBUTE_PURE /* empty */
-#endif
-#endif /* GAWK */
+#include <wchar.h>
+#include <wctype.h>
#if HAVE_LANGINFO_CODESET
# include <langinfo.h>
@@ -101,14 +78,6 @@ is_blank (int c)
}
#endif /* GAWK */
-#ifdef LIBC_IS_BORKED
-extern int gawk_mb_cur_max;
-#undef MB_CUR_MAX
-#define MB_CUR_MAX gawk_mb_cur_max
-#undef mbrtowc
-#define mbrtowc(a, b, c, d) (-1)
-#endif
-
/* HPUX defines these as macros in sys/param.h. */
#ifdef setbit
# undef setbit
@@ -412,13 +381,11 @@ struct dfa
size_t nmultibyte_prop;
int *multibyte_prop;
-#if MBS_SUPPORT
/* A table indexed by byte values that contains the corresponding wide
character (if any) for that byte. WEOF means the byte is the
leading byte of a multibyte character. Invalid and null bytes are
mapped to themselves. */
wint_t mbrtowc_cache[NOTCHAR];
-#endif
/* Array of the bracket expression in the DFA. */
struct mb_char_classes *mbcsets;
@@ -525,7 +492,6 @@ static void regexp (void);
static void
dfambcache (struct dfa *d)
{
-#if MBS_SUPPORT
int i;
for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
{
@@ -542,10 +508,8 @@ dfambcache (struct dfa *d)
}
d->mbrtowc_cache[uc] = wi;
}
-#endif
}
-#if MBS_SUPPORT
/* Given the dfa D, store into *PWC the result of converting the
leading bytes of the multibyte buffer S of length N bytes, updating
the conversion state in *MBS. On conversion error, convert just a
@@ -579,7 +543,6 @@ mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
*pwc = wc;
return 1;
}
-#endif
#ifdef DEBUG
@@ -749,7 +712,7 @@ static charclass newline;
#ifdef __GLIBC__
# define is_valid_unibyte_character(c) 1
#else
-# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
+# define is_valid_unibyte_character(c) (btowc (c) != WEOF)
#endif
/* Return non-zero if C is a "word-constituent" byte; zero otherwise. */
@@ -810,17 +773,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
static bool
setbit_wc (wint_t wc, charclass c)
{
-#if MBS_SUPPORT
int b = wctob (wc);
if (b == EOF)
return false;
setbit (b, c);
return true;
-#else
- abort ();
- /*NOTREACHED*/ return false;
-#endif
}
/* Set a bit for B and its case variants in the charclass C.
@@ -845,7 +803,7 @@ using_utf8 (void)
static int utf8 = -1;
if (utf8 == -1)
{
-#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
+#if defined HAVE_LANGINFO_CODESET
utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
#else
utf8 = 0;
@@ -938,7 +896,6 @@ static unsigned char const *buf_begin; /* reference to begin in dfaexec. */
static unsigned char const *buf_end; /* reference to end in dfaexec. */
-#if MBS_SUPPORT
/* Note that characters become unsigned here. */
# define FETCH_WC(c, wc, eoferr) \
do { \
@@ -961,23 +918,6 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
} \
} while (0)
-#else
-/* Note that characters become unsigned here. */
-# define FETCH_WC(c, unused, eoferr) \
- do { \
- if (! lexleft) \
- { \
- if ((eoferr) != 0) \
- dfaerror (eoferr); \
- else \
- return lasttok = END; \
- } \
- (c) = to_uchar (*lexptr++); \
- --lexleft; \
- } while (0)
-
-#endif /* MBS_SUPPORT */
-
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
@@ -1761,7 +1701,6 @@ addtok (token t)
}
}
-#if MBS_SUPPORT
/* We treat a multibyte character as a single atom, so that DFA
can treat a multibyte character as a single expression.
@@ -1793,17 +1732,10 @@ addtok_wc (wint_t wc)
addtok (CAT);
}
}
-#else
-static void
-addtok_wc (wint_t wc)
-{
-}
-#endif
static void
add_utf8_anychar (void)
{
-#if MBS_SUPPORT
static const charclass utf8_classes[5] = {
{0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-leading bytes */
{~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */
@@ -1848,7 +1780,6 @@ add_utf8_anychar (void)
addtok (CAT);
addtok (OR);
}
-#endif
}
/* The grammar understood by the parser is as follows.
@@ -1889,7 +1820,7 @@ add_utf8_anychar (void)
static void
atom (void)
{
- if (MBS_SUPPORT && tok == WCHAR)
+ if (tok == WCHAR)
{
addtok_wc (wctok);
@@ -1906,7 +1837,7 @@ atom (void)
tok = lex ();
}
- else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
+ else if (tok == ANYCHAR && using_utf8 ())
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1920,9 +1851,7 @@ atom (void)
}
else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
-#if MBS_SUPPORT
|| tok == ANYCHAR || tok == MBCSET
-#endif /* MBS_SUPPORT */
|| tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
{
addtok (tok);
@@ -2205,11 +2134,9 @@ state_index (struct dfa *d, position_set const *s, int context)
d->states[i].backref = 0;
d->states[i].constraint = 0;
d->states[i].first_end = 0;
- if (MBS_SUPPORT)
- {
- d->states[i].mbps.nelem = 0;
- d->states[i].mbps.elems = NULL;
- }
+ d->states[i].mbps.nelem = 0;
+ d->states[i].mbps.elems = NULL;
+
for (j = 0; j < s->nelem; ++j)
if (d->tokens[s->elems[j].index] < 0)
{
@@ -2247,10 +2174,8 @@ epsclosure (position_set * s, struct dfa const *d)
for (i = 0; i < s->nelem; ++i)
if (d->tokens[s->elems[i].index] >= NOTCHAR
&& d->tokens[s->elems[i].index] != BACKREF
-#if MBS_SUPPORT
&& d->tokens[s->elems[i].index] != ANYCHAR
&& d->tokens[s->elems[i].index] != MBCSET
-#endif
&& d->tokens[s->elems[i].index] < CSET)
{
old = s->elems[i];
@@ -2567,9 +2492,7 @@ dfaanalyze (struct dfa *d, int searchflag)
it with its epsilon closure. */
for (i = 0; i < d->tindex; ++i)
if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
-#if MBS_SUPPORT
|| d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
-#endif
|| d->tokens[i] >= CSET)
{
#ifdef DEBUG
@@ -2679,9 +2602,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (MBS_SUPPORT
- && (d->tokens[pos.index] == ANYCHAR
- || d->tokens[pos.index] == MBCSET))
+ else if (d->tokens[pos.index] == ANYCHAR
+ || d->tokens[pos.index] == MBCSET)
/* MB_CUR_MAX > 1 */
{
/* ANYCHAR and MBCSET must match with a single character, so we
@@ -2855,7 +2777,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
if (d->searchflag
- && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
+ && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
for (j = 0; j < d->states[0].elems.nelem; ++j)
insert (d->states[0].elems.elems[j], &follows);
@@ -3407,7 +3329,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
static void
prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
{
-#if MBS_SUPPORT
unsigned char eol = eolbyte;
size_t i;
size_t ilim = end - begin + 1;
@@ -3431,7 +3352,6 @@ prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
buf_end = (unsigned char *) (begin + i);
mblen_buf[i] = 0;
inputwcs[i] = 0; /* sentinel */
-#endif /* MBS_SUPPORT */
}
/* Search through a buffer looking for a match to the given struct dfa.
@@ -3653,7 +3573,7 @@ dfaoptimize (struct dfa *d)
{
size_t i;
- if (!MBS_SUPPORT || !using_utf8 ())
+ if (!using_utf8 ())
return;
for (i = 0; i < d->tindex; ++i)
@@ -3703,8 +3623,7 @@ dfafree (struct dfa *d)
for (i = 0; i < d->sindex; ++i)
{
free (d->states[i].elems.elems);
- if (MBS_SUPPORT)
- free (d->states[i].mbps.elems);
+ free (d->states[i].mbps.elems);
}
free (d->states);
for (i = 0; i < d->tindex; ++i)
@@ -4179,7 +4098,7 @@ dfamust (struct dfa *d)
/* not on *my* shift */
goto done;
}
- else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
+ else if (t >= CSET || t == ANYCHAR || t == MBCSET)
{
/* easy enough */
resetmust (mp);
diff --git a/mbsupport.h b/mbsupport.h
index 9a62486..ab33e91 100644
--- a/mbsupport.h
+++ b/mbsupport.h
@@ -66,6 +66,15 @@
#endif
#if ! MBS_SUPPORT
+
+/* Include wchar.h and wctype.h so their definitions can be overridden. */
+
+# include <wchar.h>
+# include <wctype.h>
+
+/* Override the definitions of wchar.h and wctype.h to provide a
+ unibyte substitute that is good enough for Gawk. */
+
# undef MB_CUR_MAX
# define MB_CUR_MAX 1
@@ -78,15 +87,24 @@
#define wctype_t int
#define wint_t int
#define mbstate_t int
+#undef WEOF
#define WEOF EOF
+#undef towupper
#define towupper toupper
+#undef towlower
#define towlower tolower
#ifndef __DJGPP__
-#define btowc(x) ((int)x)
+#undef btowc
+#define btowc(x) ((int) (x))
#endif
+#undef iswalnum
#define iswalnum isalnum
+#undef iswalpha
#define iswalpha isalpha
+#undef iswupper
#define iswupper isupper
+#undef iswlower
+#define iswlower islower
#if defined(ZOS_USS)
#undef towupper
#undef towlower
@@ -94,12 +112,40 @@
#undef iswalnum
#undef iswalpha
#undef iswupper
-#undef wctype
-#undef iswctype
-#undef wcscoll
#endif
+#undef mbrtowc
+#define mbrtowc(pwc, s, n, ps) ((size_t) -1)
+#undef wcrtomb
+#define wcrtomb(s, wc, ps) ((size_t) -1)
+
+#undef wctype
+#define wctype gawk_wctype
extern wctype_t wctype(const char *name);
+#undef iswctype
+#define iswctype gawk_iswctype
extern int iswctype(wint_t wc, wctype_t desc);
+#undef wcscoll
+#define wcscoll gawk_wcscoll
extern int wcscoll(const wchar_t *ws1, const wchar_t *ws2);
#endif
+
+#ifdef LIBC_IS_BORKED
+# include <stdlib.h>
+extern int gawk_mb_cur_max;
+# undef MB_CUR_MAX
+# undef mbrtowc
+# define MB_CUR_MAX gawk_mb_cur_max
+# define mbrtowc(a, b, c, d) ((size_t) -1)
+#endif
+
+#include <locale.h>
+#ifndef LC_ALL
+# define setlocale(category, locale) NULL
+#endif
+
+#include <assert.h>
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+ extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog
index f94c070..7fa6541 100644
--- a/missing_d/ChangeLog
+++ b/missing_d/ChangeLog
@@ -1,3 +1,7 @@
+2014-04-03 Paul Eggert
+
+ * wcmisc.c: Remove now-unnecessary ifdefs.
+
2013-05-09 Arnold D. Robbins
* 4.1.0: Release tar ball made.
diff --git a/missing_d/wcmisc.c b/missing_d/wcmisc.c
index d2b7aa0..89e24c9 100644
--- a/missing_d/wcmisc.c
+++ b/missing_d/wcmisc.c
@@ -16,7 +16,6 @@
Foundation, Inc.,
51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
-#if !defined(HAVE_WCTYPE) || !defined(HAVE_ISWCTYPE)
static const char *classes[] = {
"",
"alnum",
@@ -33,16 +32,12 @@ static const char *classes[] = {
"xdigit",
NULL
};
-#endif
-#ifndef HAVE_ISWCTYPE
static int is_blank (int c)
{
return (c == ' ' || c == '\t');
}
-#endif
-#ifndef HAVE_WCTYPE
wctype_t wctype(const char *name)
{
int i;
@@ -53,9 +48,7 @@ wctype_t wctype(const char *name)
return 0;
}
-#endif
-#ifndef HAVE_ISWCTYPE
int iswctype(wint_t wc, wctype_t desc)
{
int j = sizeof(classes) / sizeof(classes[0]);
@@ -79,9 +72,7 @@ int iswctype(wint_t wc, wctype_t desc)
default: return 0;
}
}
-#endif
-#ifndef HAVE_WCSCOLL
int wcscoll(const wchar_t *ws1, const wchar_t *ws2)
{
size_t i;
@@ -95,6 +86,5 @@ int wcscoll(const wchar_t *ws1, const wchar_t *ws2)
return (ws1[i] - ws2[i]);
}
-#endif
/*wcmisc.c*/
diff --git a/regex_internal.h b/regex_internal.h
index c8981a0..758cf47 100644
--- a/regex_internal.h
+++ b/regex_internal.h
@@ -26,8 +26,6 @@
#include
#include
-#include "mbsupport.h" /* gawk */
-
#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
# include <langinfo.h>
#endif
--
1.9.0