bug-bash
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: ctype.h functions on bytes 0x80..0xFF


From: Grisha Levit
Subject: Re: ctype.h functions on bytes 0x80..0xFF
Date: Sat, 27 May 2023 23:57:49 -0400

The patch below seems like a cheap fix for UTF-8 locales. Since Bash
falls back to using the single-byte glob matching functions when
presented with invalid multibyte strings, this patch makes the glob code
avoid calling the ctype functions or strcoll when handling individual
bytes above 0x7F (in a UTF-8 locale).

This makes the following no longer evaluate to true on macOS:

[[ $'\xC0' == [[:upper:]] ]]
[[ $'\xC0' == [[=A=]] ]]
[[ $'\xC0' == $'\xE0' ]]  # with nocasematch

And on Linux with glibc (tested on Ubuntu 22.04) in en_US.UTF-8,
strcoll returns 0 for any two invalid bytes, so the following is also
no longer true:

x=$'\x80'; [[ $'\xC0' == [[=$x=]] ]]

The locale_setblanks change is for the macOS issue with 0xA0 being
treated as a blank (as U+00A0).  There's no other code that changes
CSHBRK in sh_syntaxtab, so I think the simplifications are OK.

---
diff --git a/lib/glob/smatch.c b/lib/glob/smatch.c
index 12eb9d27..1c6b0229 100644
--- a/lib/glob/smatch.c
+++ b/lib/glob/smatch.c
@@ -141,6 +141,9 @@ rangecmp (int c1, int c2, int forcecoll)
 static int
 collseqcmp (int c, int equiv)
 {
+  if (locale_utf8locale && (UTF8_SINGLEBYTE (c) == 0 || UTF8_SINGLEBYTE (equiv) == 0))
+    return (c == equiv);
+
   if (charcmp (c, equiv, 1) == 0)
     return 1;

@@ -281,6 +284,9 @@ is_cclass (int c, const char *name)
   enum char_class char_class;
   int result;

+  if (locale_utf8locale && UTF8_SINGLEBYTE(c) == 0)
+    return -1;
+
   char_class = is_valid_cclass (name);
   if (char_class == CC_NO_CLASS)
     return -1;
@@ -291,7 +297,8 @@ is_cclass (int c, const char *name)

 /* Now include `sm_loop.c' for single-byte characters. */
 /* The result of FOLD is an `unsigned char' */
-# define FOLD(c) ((flags & FNM_CASEFOLD) \
+# define FOLD(c) (((flags & FNM_CASEFOLD) && \
+        (locale_utf8locale == 0 || UTF8_SINGLEBYTE (c))) \
  ? TOLOWER ((unsigned char)c) \
  : ((unsigned char)c))

diff --git a/locale.c b/locale.c
index eb24a517..b918db37 100644
--- a/locale.c
+++ b/locale.c
@@ -584,15 +584,10 @@ locale_setblanks (void)

   for (x = 0; x < sh_syntabsiz; x++)
     {
-      if (isblank ((unsigned char)x))
- sh_syntaxtab[x] |= CSHBRK|CBLANK;
-      else if (member (x, shell_break_chars))
- {
-   sh_syntaxtab[x] |= CSHBRK;
-   sh_syntaxtab[x] &= ~CBLANK;
- }
+      if ((locale_utf8locale == 0 || (x & 0x80) == 0) && isblank ((unsigned char)x))
+ sh_syntaxtab[x] |= CBLANK;
       else
- sh_syntaxtab[x] &= ~(CSHBRK|CBLANK);
+ sh_syntaxtab[x] &= ~CBLANK;
     }
 }



reply via email to

[Prev in Thread] Current Thread [Next in Thread]