From a1b444027231caac247ab0fbd5be8bda8eb3d626 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Sat, 13 Nov 2021 13:52:23 -0800
Subject: [PATCH 05/12] grep: speed up, fix bad-UTF8 check with -P

* src/pcresearch.c (bad_utf8_from_pcre2): New function.  Fix bug
where PCRE2_ERROR_UTF8_ERR1 was not treated as an encoding error.
Improve performance when PCRE2_MATCH_INVALID_UTF is defined.
(Pexecute): Use it.
---
Reviewer note (ignored by git am): the removed check in Pexecute used
"e < PCRE2_ERROR_UTF8_ERR1", which left PCRE2_ERROR_UTF8_ERR1 itself
outside the tested range; the new helper uses "<=" so that code is
included.  When PCRE2_MATCH_INVALID_UTF is defined the helper returns
false unconditionally, so Pexecute always takes the "break" path.

 src/pcresearch.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 286e1dc..953aca2 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -104,6 +104,18 @@ jit_exec (struct pcre_comp *pc, char const *subject, PCRE2_SIZE search_bytes,
     }
 }
 
+/* Return true if E is an error code for bad UTF-8, and if pcre2_match
+   could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF.  */
+static bool
+bad_utf8_from_pcre2 (int e)
+{
+#ifdef PCRE2_MATCH_INVALID_UTF
+  return false;
+#else
+  return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
+#endif
+}
+
 /* Compile the -P style PATTERN, containing SIZE bytes that are followed
    by '\n'.  Return a description of the compiled pattern.  */
 
@@ -248,9 +260,9 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
 
           e = jit_exec (pc, subject, line_end - subject,
                         search_offset, options);
-          /* PCRE2 provides 22 different error codes for bad UTF-8  */
-          if (! (PCRE2_ERROR_UTF8_ERR21 <= e && e < PCRE2_ERROR_UTF8_ERR1))
+          if (!bad_utf8_from_pcre2 (e))
             break;
+
           PCRE2_SIZE valid_bytes = pcre2_get_startchar (pc->data);
 
           if (search_offset <= valid_bytes)
-- 
2.32.0