From 0ddc6bae6b09c55e39aa4723b94b13bb5722bf47 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 19 Mar 2023 01:50:00 -0700 Subject: [PATCH] grep: forward port to PCRE2 10.43 * doc/grep.texi: Document this. * src/grep.c: Move recent changes into pcresearch.c. (P_MATCHER_INDEX): Remove. (pcre_pattern_expand_backslash_d): Move from here ... * src/pcresearch.c: ... to here. (PCRE2_EXTRA_ASCII_BSD): Default to 0. (Pcompile): Use PCRE2_EXTRA_ASCII_BSD if available, and expand \d to [0-9] otherwise. --- doc/grep.texi | 18 ++++++---- src/grep.c | 82 +------------------------------------------ src/pcresearch.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 91 deletions(-) diff --git a/doc/grep.texi b/doc/grep.texi index b17c4da..8a0aef5 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -1144,13 +1144,17 @@ combined with the @option{-z} (@option{--null-data}) option, and note that For documentation, refer to @url{https://www.pcre.org/}, with these caveats: @itemize @item -@samp{\d} always matches only the ten ASCII digits, regardless of locale or -in-regexp directives like @samp{(?aD)}. -Use @samp{\p@{Nd@}} if you require to match non-ASCII digits. -Once pcre2 support for @samp{(?aD)} is widespread enough, -we expect to make that the default, so it will be overridable. -@c Using pcre2 git commit pcre2-10.40-112-g6277357, this demonstrates how -@c we'll prefix with (?aD) to make \d's ASCII-only behavior the default: +@samp{\d} matches only the ten ASCII digits, regardless of locale. +Use @samp{\p@{Nd@}} to also match non-ASCII digits. + +When @command{grep} is built with PCRE2 10.42 and earlier, @samp{\d} +ignores in-regexp directives like @samp{(?aD)} and matches only ASCII +digits regardless of these directives. However, later versions of +PCRE2 likely will fix this, and the plan is for @command{grep} to +respect those directives if possible. 
+@c Using PCRE2 git commit pcre2-10.40-112-g6277357, this demonstrates +@c the equivalent of how grep could use PCRE2_EXTRA_ASCII_BSD to make \d's +@c ASCII-only behavior the default: @c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩' @c [Exit 1] @c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩' diff --git a/src/grep.c b/src/grep.c index 6ba881e..7547b64 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2089,8 +2089,7 @@ static struct #endif }; /* Keep these in sync with the 'matchers' table. */ -enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0, - P_MATCHER_INDEX = 6 }; +enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 }; /* Return the index of the matcher corresponding to M if available. MATCHER is the index of the previous matcher, or -1 if none. @@ -2379,80 +2378,6 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p) *len_p = p - new_keys; } -/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII - digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise - match non-ASCII digits in some locales. Use \p{Nd} if you require to match - those. 
*/ -static void -pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) -{ - idx_t len = *len_p; - char *keys = *keys_p; - mbstate_t mb_state = { 0 }; - char *new_keys = xnmalloc (len / 2 + 1, 5); - char *p = new_keys; - bool prev_backslash = false; - - for (ptrdiff_t n; len; keys += n, len -= n) - { - n = mb_clen (keys, len, &mb_state); - switch (n) - { - case -2: - n = len; - FALLTHROUGH; - default: - if (prev_backslash) - { - prev_backslash = false; - *p++ = '\\'; - } - p = mempcpy (p, keys, n); - break; - - case -1: - if (prev_backslash) - { - prev_backslash = false; - *p++ = '\\'; - } - memset (&mb_state, 0, sizeof mb_state); - n = 1; - FALLTHROUGH; - case 1: - if (prev_backslash) - { - prev_backslash = false; - switch (*keys) - { - case 'd': - p = mempcpy (p, "[0-9]", 5); - break; - default: - *p++ = '\\'; - *p++ = *keys; - break; - } - } - else - { - if (*keys == '\\') - prev_backslash = true; - else - *p++ = *keys; - } - break; - } - } - - if (prev_backslash) - *p++ = '\\'; - *p = '\n'; - free (*keys_p); - *keys_p = new_keys; - *len_p = p - new_keys; -} - /* If it is easy, convert the MATCHER-style patterns KEYS (of size *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and @@ -3045,11 +2970,6 @@ main (int argc, char **argv) matcher = try_fgrep_pattern (matcher, keys, &keycc); } - /* If -P, replace each \d with [0-9]. - Those who want to match non-ASCII digits must use \p{Nd}. 
*/ - if (matcher == P_MATCHER_INDEX) - pcre_pattern_expand_backslash_d (&keys, &keycc); - execute = matchers[matcher].execute; compiled_pattern = matchers[matcher].compile (keys, keycc, matchers[matcher].syntax, diff --git a/src/pcresearch.c b/src/pcresearch.c index 5b111be..d370181 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -35,6 +35,9 @@ # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT # define pcre2_set_depth_limit pcre2_set_recursion_limit #endif +#ifndef PCRE2_EXTRA_ASCII_BSD +# define PCRE2_EXTRA_ASCII_BSD 0 +#endif struct pcre_comp { @@ -130,12 +133,89 @@ bad_utf8_from_pcre2 (int e) #endif } +/* Replace each \d in *KEYS_P (*LEN_P bytes) with [0-9], so that \d matches + only ASCII digits; with PCRE2_UCP enabled it would otherwise match non-ASCII + digits in some locales (use \p{Nd} for those). \d is expanded even inside + [...]. Free *KEYS_P; set it and *LEN_P to a new copy followed by '\n'. */ +static void +pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) +{ + idx_t len = *len_p; + char *keys = *keys_p; + mbstate_t mb_state = { 0 }; + char *new_keys = xnmalloc (len / 2 + 1, 5); /* Each 2-byte \d grows to 5. */ + char *p = new_keys; + bool prev_backslash = false; + + for (ptrdiff_t n; len; keys += n, len -= n) + { + n = mb_clen (keys, len, &mb_state); + switch (n) + { + case -2: + n = len; + FALLTHROUGH; + default: + if (prev_backslash) + { + prev_backslash = false; + *p++ = '\\'; + } + p = mempcpy (p, keys, n); + break; + + case -1: + if (prev_backslash) + { + prev_backslash = false; + *p++ = '\\'; + } + memset (&mb_state, 0, sizeof mb_state); + n = 1; + FALLTHROUGH; + case 1: + if (prev_backslash) + { + prev_backslash = false; + switch (*keys) + { + case 'd': + p = mempcpy (p, "[0-9]", 5); + break; + default: + *p++ = '\\'; + *p++ = *keys; + break; + } + } + else + { + if (*keys == '\\') + prev_backslash = true; + else + *p++ = *keys; + } + break; + } + } + + if (prev_backslash) + *p++ = '\\'; + *p = '\n'; + free (*keys_p); + *keys_p = new_keys; + *len_p = p - new_keys; +} + /* Compile the -P style PATTERN, 
containing SIZE bytes that are followed by '\n'. Return a description of the compiled pattern. */ void * Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) { + if (! PCRE2_EXTRA_ASCII_BSD) + pcre_pattern_expand_backslash_d (&pattern, &size); + PCRE2_SIZE e; int ec; int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0); @@ -168,12 +248,16 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) if (rawmemchr (pattern, '\n') != patlim) die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); +#ifdef PCRE2_EXTRA_MATCH_LINE + uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD + | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0)); + pcre2_set_compile_extra_options (ccontext, extra_options); +#endif + void *re_storage = NULL; if (match_lines) { -#ifdef PCRE2_EXTRA_MATCH_LINE - pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE); -#else +#ifndef PCRE2_EXTRA_MATCH_LINE static char const /* These sizes omit trailing NUL. */ xprefix[4] = "^(?:", xsuffix[2] = ")$"; idx_t re_size = size + sizeof xprefix + sizeof xsuffix; -- 2.39.2