>From 20ff668e934fa7e3bb3ce27027cd56ebe8716188 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sat, 4 Jan 2020 02:32:52 +0100 Subject: [PATCH 4/5] mbrtoc32: New module. * lib/uchar.in.h (mbrtoc32): New declaration. * lib/mbrtoc32.c: New file, based on lib/mbrtowc.c. * m4/mbrtoc32.m4: New file, based on m4/mbrtowc.m4. * m4/uchar.m4 (gl_UCHAR_H): Test whether mbrtoc32 is declared. (gl_UCHAR_H_DEFAULTS): Initialize GNULIB_MBRTOC32, HAVE_MBRTOC32, REPLACE_MBRTOC32. * modules/uchar (Makefile.am): Substitute GNULIB_MBRTOC32, HAVE_MBRTOC32, REPLACE_MBRTOC32. * modules/mbrtoc32: New file, based on modules/mbrtowc. * tests/test-uchar-c++.cc (mbrtoc32): Verify the signature. * modules/uchar-c++-tests (Makefile.am): Link test-uchar-c++ with $(LIB_MBRTOWC). * doc/posix-functions/mbrtoc32.texi: Document the new module. * doc/posix-functions/mbrtowc.texi: Mention the new module. --- ChangeLog | 18 +++ doc/posix-functions/mbrtoc32.texi | 16 ++- doc/posix-functions/mbrtowc.texi | 7 +- lib/mbrtoc32.c | 227 ++++++++++++++++++++++++++++++++++++++ lib/uchar.in.h | 29 +++++ m4/mbrtoc32.m4 | 117 ++++++++++++++++++++ m4/uchar.m4 | 12 +- modules/mbrtoc32 | 51 +++++++++ modules/uchar | 3 + modules/uchar-c++-tests | 1 + tests/test-uchar-c++.cc | 5 + 11 files changed, 479 insertions(+), 7 deletions(-) create mode 100644 lib/mbrtoc32.c create mode 100644 m4/mbrtoc32.m4 create mode 100644 modules/mbrtoc32 diff --git a/ChangeLog b/ChangeLog index f8c6793..c4d0968 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,23 @@ 2020-01-03 Bruno Haible + mbrtoc32: New module. + * lib/uchar.in.h (mbrtoc32): New declaration. + * lib/mbrtoc32.c: New file, based on lib/mbrtowc.c. + * m4/mbrtoc32.m4: New file, based on m4/mbrtowc.m4. + * m4/uchar.m4 (gl_UCHAR_H): Test whether mbrtoc32 is declared. + (gl_UCHAR_H_DEFAULTS): Initialize GNULIB_MBRTOC32, HAVE_MBRTOC32, + REPLACE_MBRTOC32. + * modules/uchar (Makefile.am): Substitute GNULIB_MBRTOC32, + HAVE_MBRTOC32, REPLACE_MBRTOC32. 
+ * modules/mbrtoc32: New file, based on modules/mbrtowc. + * tests/test-uchar-c++.cc (mbrtoc32): Verify the signature. + * modules/uchar-c++-tests (Makefile.am): Link test-uchar-c++ with + $(LIB_MBRTOWC). + * doc/posix-functions/mbrtoc32.texi: Document the new module. + * doc/posix-functions/mbrtowc.texi: Mention the new module. + +2020-01-03 Bruno Haible + mbrtowc: Refactor to share code with mbrtoc32. * lib/mbrtowc-impl.h: New file, extracted from lib/mbrtowc.c. * lib/mbrtowc-impl-utf8.h: Likewise. diff --git a/doc/posix-functions/mbrtoc32.texi b/doc/posix-functions/mbrtoc32.texi index 92241c9..1aa15a3 100644 --- a/doc/posix-functions/mbrtoc32.texi +++ b/doc/posix-functions/mbrtoc32.texi @@ -2,15 +2,23 @@ @section @code{mbrtoc32} @findex mbrtoc32 -Gnulib module: --- +Gnulib module: mbrtoc32 Portability problems fixed by Gnulib: @itemize +@item +This function is missing on most non-glibc platforms: +glibc 2.15, Mac OS X 10.5, FreeBSD 6.4, NetBSD 5.0, OpenBSD 3.8, Minix 3.1.8, AIX 7.1, HP-UX 11.31, IRIX 6.5, Solaris 11.3, Cygwin, mingw, MSVC 9, Android 4.4. +@item +In the C or POSIX locales, this function can return @code{(size_t) -1} +and set @code{errno} to @code{EILSEQ}: +glibc 2.23. +@item +This function returns 0 instead of @code{(size_t) -2} when the input +is empty: +glibc 2.19. @end itemize Portability problems not fixed by Gnulib: @itemize -@item -This function is missing on most non-glibc platforms: -glibc 2.15, Mac OS X 10.5, FreeBSD 6.4, NetBSD 5.0, OpenBSD 3.8, Minix 3.1.8, AIX 7.1, HP-UX 11.31, IRIX 6.5, Solaris 11.3, Cygwin, mingw, MSVC 9, Android 4.4. @end itemize diff --git a/doc/posix-functions/mbrtowc.texi b/doc/posix-functions/mbrtowc.texi index 3b7aed0..897e4da 100644 --- a/doc/posix-functions/mbrtowc.texi +++ b/doc/posix-functions/mbrtowc.texi @@ -44,6 +44,9 @@ Solaris 9. 
Portability problems not fixed by Gnulib: @itemize @item -On Windows and 32-bit AIX platforms, @code{wchar_t} is a 16-bit type and therefore cannot -accommodate all Unicode characters. +On Windows and 32-bit AIX platforms, @code{wchar_t} is a 16-bit type and +therefore cannot accommodate all Unicode characters. +However, the ISO C11 function @code{mbrtoc32}, provided by Gnulib module +@code{mbrtoc32}, operates on 32-bit wide characters and therefore does not have +this limitation. @end itemize diff --git a/lib/mbrtoc32.c b/lib/mbrtoc32.c new file mode 100644 index 0000000..f2cf71e --- /dev/null +++ b/lib/mbrtoc32.c @@ -0,0 +1,227 @@ +/* Convert multibyte character to 32-bit wide character. + Copyright (C) 2020 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible , 2020. */ + +#include <config.h> + +/* Specification. */ +#include <uchar.h> + +#include <errno.h> +#include <stdlib.h> + +# ifndef FALLTHROUGH +# if __GNUC__ < 7 +# define FALLTHROUGH ((void) 0) +# else +# define FALLTHROUGH __attribute__ ((__fallthrough__)) +# endif +# endif + +#if GNULIB_defined_mbstate_t /* AIX, IRIX */ +/* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales + and directly for the UTF-8 locales. 
*/ + +# if defined _WIN32 && !defined __CYGWIN__ + +# define WIN32_LEAN_AND_MEAN /* avoid including junk */ +# include <windows.h> + +# elif HAVE_PTHREAD_API + +# include <pthread.h> +# if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS +# include <threads.h> +# pragma weak thrd_exit +# define c11_threads_in_use() (thrd_exit != NULL) +# else +# define c11_threads_in_use() 0 +# endif + +# elif HAVE_THREADS_H + +# include <threads.h> + +# endif + +# include "verify.h" +# include "lc-charset-dispatch.h" +# include "mbtowc-lock.h" + +verify (sizeof (mbstate_t) >= 4); +static char internal_state[4]; + +size_t +mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) +{ +# define FITS_IN_CHAR_TYPE(wc) 1 +# include "mbrtowc-impl.h" +} + +#else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */ + +/* Implement mbrtoc32() based on mbrtowc(). */ + +# include <wchar.h> + +# include "localcharset.h" +# include "streq.h" + +static mbstate_t internal_state; + +size_t +mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) +{ + /* It's simpler to handle the case s == NULL upfront, than to worry about + this case later, before every test of pwc and n. */ + if (s == NULL) + { + pwc = NULL; + s = ""; + n = 1; + } + +# if MBRTOC32_EMPTY_INPUT_BUG || _GL_LARGE_CHAR32_T + if (n == 0) + return (size_t) -2; +# endif + + if (ps == NULL) + ps = &internal_state; + +# if _GL_LARGE_CHAR32_T + + /* Special-case all encodings that may produce wide character values + > WCHAR_MAX. */ + const char *encoding = locale_charset (); + if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) + { + /* Special-case the UTF-8 encoding. Assume that the wide-character + encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16. */ + /* Here n > 0. 
*/ + char *pstate = (char *)ps; + size_t nstate = pstate[0]; + char buf[4]; + const char *p; + size_t m; + int res; + + switch (nstate) + { + case 0: + p = s; + m = n; + break; + case 3: + buf[2] = pstate[3]; + FALLTHROUGH; + case 2: + buf[1] = pstate[2]; + FALLTHROUGH; + case 1: + buf[0] = pstate[1]; + p = buf; + m = nstate; + buf[m++] = s[0]; + if (n >= 2 && m < 4) + { + buf[m++] = s[1]; + if (n >= 3 && m < 4) + buf[m++] = s[2]; + } + break; + default: + errno = EINVAL; + return (size_t)(-1); + } + + /* Here m > 0. */ + + { +# define FITS_IN_CHAR_TYPE(wc) 1 +# include "mbrtowc-impl-utf8.h" + } + + success: + if (nstate >= (res > 0 ? res : 1)) + abort (); + res -= nstate; + /* Set *ps to the initial state. */ +# if defined _WIN32 && !defined __CYGWIN__ + /* Native Windows. */ + /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter. + On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined + as an 8-byte struct, of which the first 4 bytes matter. */ + *(unsigned int *)pstate = 0; +# elif defined __CYGWIN__ + /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes + matter. */ + ps->__count = 0; +# else + pstate[0] = 0; +# endif + return res; + + incomplete: + { + size_t k = nstate; + /* Here 0 <= k < m < 4. */ + pstate[++k] = s[0]; + if (k < m) + { + pstate[++k] = s[1]; + if (k < m) + pstate[++k] = s[2]; + } + if (k != m) + abort (); + } + pstate[0] = m; + return (size_t)(-2); + + invalid: + errno = EILSEQ; + /* The conversion state is undefined, says POSIX. */ + return (size_t)(-1); + } + else + { + wchar_t wc; + size_t ret = mbrtowc (&wc, s, n, ps); + if (ret < (size_t) -2 && pwc != NULL) + *pwc = wc; + return ret; + } + +# else + + /* char32_t and wchar_t are equivalent. + Two implementations are possible: + - We can call the original mbrtoc32 (if it exists) and handle + MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ. + - We can call mbrtowc. + The latter is simpler. 
*/ + wchar_t wc; + size_t ret = mbrtowc (&wc, s, n, ps); + if (ret < (size_t) -2 && pwc != NULL) + *pwc = wc; + return ret; + +# endif +} + +#endif diff --git a/lib/uchar.in.h b/lib/uchar.in.h index 9cba39b..6f533a1 100644 --- a/lib/uchar.in.h +++ b/lib/uchar.in.h @@ -70,4 +70,33 @@ _GL_CXXALIASWARN (c32tob); #endif +/* Converts a multibyte character to a 32-bit wide character. */ +#if @GNULIB_MBRTOC32@ +# if @REPLACE_MBRTOC32@ +# if !(defined __cplusplus && defined GNULIB_NAMESPACE) +# undef mbrtoc32 +# define mbrtoc32 rpl_mbrtoc32 +# endif +_GL_FUNCDECL_RPL (mbrtoc32, size_t, + (char32_t *pc, const char *s, size_t n, mbstate_t *ps)); +_GL_CXXALIAS_RPL (mbrtoc32, size_t, + (char32_t *pc, const char *s, size_t n, mbstate_t *ps)); +# else +# if !@HAVE_MBRTOC32@ +_GL_FUNCDECL_SYS (mbrtoc32, size_t, + (char32_t *pc, const char *s, size_t n, mbstate_t *ps)); +# endif +_GL_CXXALIAS_SYS (mbrtoc32, size_t, + (char32_t *pc, const char *s, size_t n, mbstate_t *ps)); +# endif +_GL_CXXALIASWARN (mbrtoc32); +#elif defined GNULIB_POSIXCHECK +# undef mbrtoc32 +# if HAVE_RAW_DECL_MBRTOC32 +_GL_WARN_ON_USE (mbrtoc32, "mbrtoc32 is not portable - " + "use gnulib module mbrtoc32 for portability"); +# endif +#endif + + #endif /* _@GUARD_PREFIX@_UCHAR_H */ diff --git a/m4/mbrtoc32.m4 b/m4/mbrtoc32.m4 new file mode 100644 index 0000000..5039fc7 --- /dev/null +++ b/m4/mbrtoc32.m4 @@ -0,0 +1,117 @@ +# mbrtoc32.m4 serial 1 +dnl Copyright (C) 2014-2020 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. 
+ +AC_DEFUN([gl_FUNC_MBRTOC32], +[ + AC_REQUIRE([gl_UCHAR_H_DEFAULTS]) + + AC_REQUIRE([AC_TYPE_MBSTATE_T]) + gl_MBSTATE_T_BROKEN + + AC_CHECK_FUNCS_ONCE([mbrtoc32]) + if test $ac_cv_func_mbrtoc32 = no; then + HAVE_MBRTOC32=0 + else + if test $REPLACE_MBSTATE_T = 1; then + REPLACE_MBRTOC32=1 + else + gl_MBRTOC32_EMPTY_INPUT + gl_MBRTOC32_C_LOCALE + case "$gl_cv_func_mbrtoc32_empty_input" in + *yes) ;; + *) AC_DEFINE([MBRTOC32_EMPTY_INPUT_BUG], [1], + [Define if the mbrtoc32 function does not return (size_t) -2 for empty input.]) + REPLACE_MBRTOC32=1 + ;; + esac + case "$gl_cv_func_mbrtoc32_C_locale_sans_EILSEQ" in + *yes) ;; + *) AC_DEFINE([MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ], [1], + [Define if the mbrtoc32 function may signal encoding errors in the C locale.]) + REPLACE_MBRTOC32=1 + ;; + esac + fi + fi +]) + +AC_DEFUN([gl_MBRTOC32_EMPTY_INPUT], +[ + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether mbrtoc32 works on empty input], + [gl_cv_func_mbrtoc32_empty_input], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. +changequote(,)dnl + case "$host_os" in + # Guess no on glibc systems. + *-gnu* | gnu*) gl_cv_func_mbrtoc32_empty_input="guessing no" ;; + *) gl_cv_func_mbrtoc32_empty_input="guessing yes" ;; + esac +changequote([,])dnl + AC_RUN_IFELSE( + [AC_LANG_SOURCE([[ + #include <uchar.h> + static char32_t wc; + static mbstate_t mbs; + int + main (void) + { + return mbrtoc32 (&wc, "", 0, &mbs) != (size_t) -2; + }]])], + [gl_cv_func_mbrtoc32_empty_input=yes], + [gl_cv_func_mbrtoc32_empty_input=no], + [:]) + ]) +]) + +AC_DEFUN([gl_MBRTOC32_C_LOCALE], +[ + AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles + AC_CACHE_CHECK([whether the C locale is free of encoding errors], + [gl_cv_func_mbrtoc32_C_locale_sans_EILSEQ], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. 
+ gl_cv_func_mbrtoc32_C_locale_sans_EILSEQ="$gl_cross_guess_normal" + + AC_RUN_IFELSE( + [AC_LANG_PROGRAM( + [[#include <limits.h> + #include <locale.h> + #include <uchar.h> + ]], [[ + int i; + char *locale = setlocale (LC_ALL, "C"); + if (! locale) + return 2; + for (i = CHAR_MIN; i <= CHAR_MAX; i++) + { + char c = i; + char32_t wc; + mbstate_t mbs = { 0, }; + size_t ss = mbrtoc32 (&wc, &c, 1, &mbs); + if (1 < ss) + return 3; + } + return 0; + ]])], + [gl_cv_func_mbrtoc32_C_locale_sans_EILSEQ=yes], + [gl_cv_func_mbrtoc32_C_locale_sans_EILSEQ=no], + [case "$host_os" in + # Guess yes on native Windows. + mingw*) gl_cv_func_mbrtoc32_C_locale_sans_EILSEQ="guessing yes" ;; + esac + ]) + ]) +]) + +# Prerequisites of lib/mbrtoc32.c and lib/lc-charset-dispatch.c. +AC_DEFUN([gl_PREREQ_MBRTOC32], [ + : +]) diff --git a/m4/uchar.m4 b/m4/uchar.m4 index c5a3594..4d5f046 100644 --- a/m4/uchar.m4 +++ b/m4/uchar.m4 @@ -1,4 +1,4 @@ -# uchar.m4 serial 2 +# uchar.m4 serial 3 dnl Copyright (C) 2019-2020 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -18,6 +18,12 @@ AC_DEFUN_ONCE([gl_UCHAR_H], HAVE_UCHAR_H=0 fi AC_SUBST([HAVE_UCHAR_H]) + + dnl Check for declarations of anything we want to poison if the + dnl corresponding gnulib module is not in use, and which is not + dnl guaranteed by C11. + gl_WARN_ON_USE_PREPARE([[#include <uchar.h> + ]], [mbrtoc32]) ]) AC_DEFUN([gl_UCHAR_MODULE_INDICATOR], @@ -32,4 +38,8 @@ AC_DEFUN([gl_UCHAR_MODULE_INDICATOR], AC_DEFUN([gl_UCHAR_H_DEFAULTS], [ GNULIB_C32TOB=0; AC_SUBST([GNULIB_C32TOB]) + GNULIB_MBRTOC32=0; AC_SUBST([GNULIB_MBRTOC32]) + dnl Assume proper GNU behavior unless another module says otherwise. 
+ HAVE_MBRTOC32=1; AC_SUBST([HAVE_MBRTOC32]) + REPLACE_MBRTOC32=0; AC_SUBST([REPLACE_MBRTOC32]) ]) diff --git a/modules/mbrtoc32 b/modules/mbrtoc32 new file mode 100644 index 0000000..011b7a9 --- /dev/null +++ b/modules/mbrtoc32 @@ -0,0 +1,51 @@ +Description: +mbrtoc32() function: convert multibyte character to 32-bit wide character. + +Files: +lib/mbrtoc32.c +lib/mbrtowc-impl.h +lib/mbrtowc-impl-utf8.h +lib/lc-charset-dispatch.h +lib/lc-charset-dispatch.c +lib/mbtowc-lock.h +lib/mbtowc-lock.c +lib/windows-initguard.h +m4/mbrtoc32.m4 +m4/mbrtowc.m4 +m4/mbstate_t.m4 +m4/threadlib.m4 +m4/visibility.m4 + +Depends-on: +uchar +hard-locale [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] +mbrtowc [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] +mbsinit [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] +localcharset [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] +streq [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] +verify [test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1] + +configure.ac: +gl_FUNC_MBRTOC32 +if test $HAVE_MBRTOC32 = 0 || test $REPLACE_MBRTOC32 = 1; then + AC_LIBOBJ([mbrtoc32]) + AC_LIBOBJ([lc-charset-dispatch]) + AC_LIBOBJ([mbtowc-lock]) + gl_PREREQ_MBRTOC32 + gl_PREREQ_MBTOWC_LOCK +fi +gl_UCHAR_MODULE_INDICATOR([mbrtoc32]) + +Makefile.am: + +Include: +<uchar.h> + +Link: +$(LIB_MBRTOWC) + +License: +LGPLv2+ + +Maintainer: +Bruno Haible diff --git a/modules/uchar b/modules/uchar index 67a8866..165fae6 100644 --- a/modules/uchar +++ b/modules/uchar @@ -27,6 +27,9 @@ uchar.h: uchar.in.h $(top_builddir)/config.status $(CXXDEFS_H) -e 's|@''PRAGMA_COLUMNS''@|@PRAGMA_COLUMNS@|g' \ -e 's|@''NEXT_UCHAR_H''@|$(NEXT_UCHAR_H)|g' \ -e 's/@''GNULIB_C32TOB''@/$(GNULIB_C32TOB)/g' \ + -e 's/@''GNULIB_MBRTOC32''@/$(GNULIB_MBRTOC32)/g' \ + -e 's|@''HAVE_MBRTOC32''@|$(HAVE_MBRTOC32)|g' \ + -e 's|@''REPLACE_MBRTOC32''@|$(REPLACE_MBRTOC32)|g' \ -e '/definitions of _GL_FUNCDECL_RPL/r $(CXXDEFS_H)' \ < $(srcdir)/uchar.in.h; \ } > $@-t && \ 
diff 
--git a/modules/uchar-c++-tests b/modules/uchar-c++-tests index 69058e3..4f179f0 100644 --- a/modules/uchar-c++-tests +++ b/modules/uchar-c++-tests @@ -16,4 +16,5 @@ if ANSICXX TESTS += test-uchar-c++ check_PROGRAMS += test-uchar-c++ test_uchar_c___SOURCES = test-uchar-c++.cc test-uchar-c++2.cc +test_uchar_c___LDADD = $(LDADD) $(LIB_MBRTOWC) endif diff --git a/tests/test-uchar-c++.cc b/tests/test-uchar-c++.cc index 9a11a13..392b104 100644 --- a/tests/test-uchar-c++.cc +++ b/tests/test-uchar-c++.cc @@ -28,6 +28,11 @@ SIGNATURE_CHECK (GNULIB_NAMESPACE::c32tob, int, (wint_t)); #endif +#if GNULIB_TEST_MBRTOC32 +SIGNATURE_CHECK (GNULIB_NAMESPACE::mbrtoc32, size_t, + (char32_t *, const char *, size_t, mbstate_t *)); +#endif + int main () -- 2.7.4