grep branch, master, updated. v3.10-2-gc63a095

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v3.10-2-gc63a095

From:	Paul Eggert
Subject:	grep branch, master, updated. v3.10-2-gc63a095
Date:	Sun, 2 Apr 2023 12:47:26 -0400 (EDT)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  c63a0950ff852c94e27d14b6d0eea001eddb7de1 (commit)
      from  1d59f1b342e1ec681b87cb21788ec04ebd7a1c75 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=c63a0950ff852c94e27d14b6d0eea001eddb7de1


commit c63a0950ff852c94e27d14b6d0eea001eddb7de1
Author: Paul Eggert <eggert@cs.ucla.edu>
Date:   Sat Apr 1 13:55:26 2023 -0700

    grep: fix -P [\d] by fixing \w only if PCRE2 10.43
    
    Our prepass-based fixes for the -P \d bug have caused repeated
    further bugs.  Avoid the need for a prepass, by using PCRE2_UCP
    only if PCRE2_EXTRA_ASCII_BSD is also supported.  Since the -P \w
    bug was present from grep 2.5 through 3.8 it's OK if we wait a
    little longer to fix it.
    * NEWS: Mention this.
    * src/pcresearch.c (pcre_pattern_expand_backslash_d): Remove.
    Remove its use.
    (Pcompile): Use PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD.
    * tests/pcre-ascii-digits, tests/pcre-utf8-w:
    Skip tests on older PCRE2 implementations.

diff --git a/NEWS b/NEWS
index 400c256..6ebade3 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,14 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Bug fixes
+
+  With -P, patterns like [\d] now work again.  The fix relies on PCRE2
+  support for the PCRE2_EXTRA_ASCII_BSD flag planned for PCRE2 10.43.
+  With PCRE2 version 10.42 or earlier, behavior reverts to that of
+  grep 3.8, in that patterns like \w and \b use ASCII rather than
+  Unicode interpretations.
+
 
 * Noteworthy changes in release 3.10 (2023-03-22) [stable]
 
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 34b2aeb..e77509c 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -133,97 +133,12 @@ bad_utf8_from_pcre2 (int e)
 #endif
 }
 
-#if ! PCRE2_EXTRA_ASCII_BSD
-/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
-   digits.  Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
-   match non-ASCII digits in some locales.  Use \p{Nd} if you require to match
-   those.  Similarly, replace each \D with [^0-9].
-   FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42
-   and prior.  */
-static void
-pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
-{
-  idx_t len = *len_p;
-  char *keys = *keys_p;
-  mbstate_t mb_state = { 0 };
-  char *new_keys = xnmalloc (len / 2 + 1, 5);
-  char *p = new_keys;
-  bool prev_backslash = false;
-
-  for (ptrdiff_t n; len; keys += n, len -= n)
-    {
-      n = mb_clen (keys, len, &mb_state);
-      switch (n)
-        {
-        case -2:
-          n = len;
-          FALLTHROUGH;
-        default:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          p = mempcpy (p, keys, n);
-          break;
-
-        case -1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          memset (&mb_state, 0, sizeof mb_state);
-          n = 1;
-          FALLTHROUGH;
-        case 1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              switch (*keys)
-                {
-                case 'd':
-                  p = mempcpy (p, "[0-9]", 5);
-                  break;
-                case 'D':
-                  p = mempcpy (p, "[^0-9]", 6);
-                  break;
-                default:
-                  *p++ = '\\';
-                  *p++ = *keys;
-                  break;
-                }
-            }
-          else
-            {
-              if (*keys == '\\')
-                prev_backslash = true;
-              else
-                *p++ = *keys;
-            }
-          break;
-        }
-    }
-
-  if (prev_backslash)
-    *p++ = '\\';
-  *p = '\n';
-  free (*keys_p);
-  *keys_p = new_keys;
-  *len_p = p - new_keys;
-}
-#endif
-
 /* Compile the -P style PATTERN, containing SIZE bytes that are
    followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
-#if ! PCRE2_EXTRA_ASCII_BSD
-  pcre_pattern_expand_backslash_d (&pattern, &size);
-#endif
-
   PCRE2_SIZE e;
   int ec;
   int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
@@ -241,7 +156,17 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
              _("-P supports only unibyte locales on this platform"));
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= (PCRE2_UTF | PCRE2_UCP);
+
+      flags |= PCRE2_UTF;
+
+      /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
+         so that \d does not have the undesirable effect of matching
+         non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42 and earlier),
+         escapes like \w have only their ASCII interpretations,
+         but that's better than the confusion that would ensue if \d
+         matched non-ASCII digits.  */
+      flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
+
 #if 0
       /* Do not match individual code units but only UTF-8.  */
       flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits
index de9fe38..9dfc0fa 100755
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -17,6 +17,8 @@ require_pcre_
 
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
+echo 0 | grep -qP '(?aD)\d' \
+  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
 
 fail=0
 
@@ -44,19 +46,10 @@ printf '\331\2404\n' > in2 || framework_failure_
 returns_ 1 grep -P '\d\d' in2 > out || fail=1
 compare /dev/null out || fail=1
 
-# The following tests work only when built with 10.43 or newer,
-# with which, grep accepts the mode-setting '(?aD)':
-if echo 0 | grep -qP '(?aD)\d'; then
+grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
+compare in2 out || fail=1
 
-  grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
-  compare in2 out || fail=1
-
-  returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
-  compare /dev/null out || fail=1
-
-else
-  warn_ 'skipped some tests: use PCRE2 10.43 or newer to enable' \
-    'support for e.g., (?aD) and (?-aD)'
-fi
+returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
+compare /dev/null out || fail=1
 
 Exit $fail
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
index a88ace4..aa34784 100755
--- a/tests/pcre-utf8-w
+++ b/tests/pcre-utf8-w
@@ -16,6 +16,8 @@ require_pcre_
 
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
+echo 0 | grep -qP '(?aD)\d' \
+  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
 
 fail=0
 

-----------------------------------------------------------------------

Summary of changes:
 NEWS                    |  8 ++++
 src/pcresearch.c        | 97 ++++++-------------------------------------------
 tests/pcre-ascii-digits | 19 +++-------
 tests/pcre-utf8-w       |  2 +
 4 files changed, 27 insertions(+), 99 deletions(-)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v3.10-2-gc63a095, Paul Eggert <=
Next by Date: grep branch, master, updated. v3.10-3-g488a115
Next by thread: grep branch, master, updated. v3.10-3-g488a115
Index(es):
- Date
- Thread