>From c04ff0df5dfe788a38162cb2609b38495e765383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sat, 23 Feb 2019 21:23:47 -0800 Subject: [PATCH] wc: treat non break space as a word separator * src/wc.c (iswnbspace): A new function to match characters in this class. (main): Initialize posixly_correct from the environment, to allow disabling honoring NBSP in non C locales. (wc): Call is[w]nbspace() as well as is[w]space. * bootstrap.conf: Ensure btowc is available. * tests/misc/wc-nbsp.sh: A new test. * tests/local.mk: Reference the new test. * NEWS: Mention the change in behavior. --- NEWS | 3 +++ bootstrap.conf | 1 + src/wc.c | 25 +++++++++++++++++++++++-- tests/local.mk | 1 + tests/misc/wc-nbsp.sh | 42 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 70 insertions(+), 2 deletions(-) create mode 100755 tests/misc/wc-nbsp.sh diff --git a/NEWS b/NEWS index e400554..9bfa3c3 100644 --- a/NEWS +++ b/NEWS @@ -53,6 +53,9 @@ GNU coreutils NEWS -*- outline -*- operator, so POSIX changed this to 'test -e FILE'. Scripts using it were already broken and non-portable; the -a unary operator was never documented. + wc now treats non breaking space characters as word delimiters + unless the POSIXLY_CORRECT environment variable is set. + ** New features id now supports specifying multiple users. diff --git a/bootstrap.conf b/bootstrap.conf index a525ef4..4926152 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -38,6 +38,7 @@ gnulib_modules=" backup-rename base32 base64 + btowc buffer-lcm c-strcase cl-strtod diff --git a/src/wc.c b/src/wc.c index 179abbe..2381804 100644 --- a/src/wc.c +++ b/src/wc.c @@ -74,6 +74,9 @@ static bool have_read_stdin; /* Used to determine if file size can be determined without reading. */ static size_t page_size; +/* Enable to _not_ treat non breaking space as a word separator. */ +static bool posixly_correct; + /* The result of calling fstat or stat on a file descriptor or file. 
*/ struct fstatus { @@ -147,6 +150,21 @@ the following order: newline, word, character, byte, maximum line length.\n\ exit (status); } +/* Return non zero if a non breaking space. */ +static int _GL_ATTRIBUTE_PURE +iswnbspace (wint_t wc) +{ + return ! posixly_correct + && (wc == 0x00A0 || wc == 0x2007 + || wc == 0x202F || wc == 0x2060); +} + +static int +isnbspace (int c) +{ + return iswnbspace (btowc (c)); +} + /* FILE is the name of the file (or NULL for standard input) associated with the specified counters. */ static void @@ -455,7 +473,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) if (width > 0) linepos += width; } - if (iswspace (wide_char)) + if (iswspace (wide_char) || iswnbspace (wide_char)) goto mb_word_separator; in_word = true; } @@ -538,7 +556,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) if (isprint (to_uchar (p[-1]))) { linepos++; - if (isspace (to_uchar (p[-1]))) + if (isspace (to_uchar (p[-1])) + || isnbspace (to_uchar (p[-1]))) goto word_separator; in_word = true; } @@ -681,6 +700,8 @@ main (int argc, char **argv) so that processes running in parallel do not intersperse their output. 
*/ setvbuf (stdout, NULL, _IOLBF, 0); + posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); + print_lines = print_words = print_chars = print_bytes = false; print_linelength = false; total_lines = total_words = total_chars = total_bytes = max_line_length = 0; diff --git a/tests/local.mk b/tests/local.mk index 4751886..bacc5d2 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -272,6 +272,7 @@ all_tests = \ tests/misc/wc.pl \ tests/misc/wc-files0-from.pl \ tests/misc/wc-files0.sh \ + tests/misc/wc-nbsp.sh \ tests/misc/wc-parallel.sh \ tests/misc/wc-proc.sh \ tests/misc/cat-proc.sh \ diff --git a/tests/misc/wc-nbsp.sh b/tests/misc/wc-nbsp.sh new file mode 100755 index 0000000..11ee0d6 --- /dev/null +++ b/tests/misc/wc-nbsp.sh @@ -0,0 +1,42 @@ +#!/bin/sh +# Test non breaking space handling + +# Copyright (C) 2019 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +. 
"${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ wc printf + +# Before coreutils 8.31 nbsp was treated as part of a word, +# rather than a word delimiter + +export LC_ALL=en_US.ISO-8859-1 +if test "$(locale charmap 2>/dev/null)" = ISO-8859-1; then + test $(env printf '=\xA0=' | wc -w) = 2 || fail=1 + test $(env printf '=\xA0=' | POSIXLY_CORRECT=1 wc -w) = 1 || fail=1 +fi +export LC_ALL=en_US.UTF-8 +if test "$(locale charmap 2>/dev/null)" = UTF-8; then + test $(env printf '=\u00A0=' | wc -w) = 2 || fail=1 + test $(env printf '=\u2007=' | wc -w) = 2 || fail=1 + test $(env printf '=\u202F=' | wc -w) = 2 || fail=1 + test $(env printf '=\u2060=' | wc -w) = 2 || fail=1 +fi +export LC_ALL=ru_RU.KOI8-R +if test "$(locale charmap 2>/dev/null)" = KOI8-R; then + test $(env printf '=\x9A=' | wc -w) = 2 || fail=1 +fi + +Exit $fail -- 2.9.3