From b9b1bd31378af853378f43047beca0c4174a67d5 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 7 Nov 2021 23:31:08 -0800 Subject: [PATCH] regex: break regcomp.c link with glibc Problem reported by Bruno Haible in: https://lists.gnu.org/r/bug-gnulib/2021-11/msg00005.html * config/srclist.txt: Comment out regcomp.c for now. * lib/regcomp.c: Revert previous change. --- ChangeLog | 8 + config/srclist.txt | 2 +- lib/regcomp.c | 464 ++++++++++++++++++++++----------------------- 3 files changed, 232 insertions(+), 242 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6aaa53991..bde24b951 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2021-11-07 Paul Eggert + + regex: break regcomp.c link with glibc + Problem reported by Bruno Haible in: + https://lists.gnu.org/r/bug-gnulib/2021-11/msg00005.html + * config/srclist.txt: Comment out regcomp.c for now. + * lib/regcomp.c: Revert previous change. + 2021-10-24 Bruno Haible string: Avoid syntax error re strdup in string.in.h (regr. 2021-09-07). diff --git a/config/srclist.txt b/config/srclist.txt index 83e251def..e36501239 100644 --- a/config/srclist.txt +++ b/config/srclist.txt @@ -65,7 +65,7 @@ $LIBCSRC malloc/scratch_buffer_grow_preserve.c lib/malloc $LIBCSRC malloc/scratch_buffer_set_array_size.c lib/malloc $LIBCSRC include/intprops.h lib $LIBCSRC misc/sys/cdefs.h lib -$LIBCSRC posix/regcomp.c lib +#$LIBCSRC posix/regcomp.c lib $LIBCSRC posix/regex.c lib $LIBCSRC posix/regex.h lib $LIBCSRC posix/regex_internal.c lib diff --git a/lib/regcomp.c b/lib/regcomp.c index e2b5c2807..887e5b506 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -2831,277 +2831,262 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) } #endif /* not _LIBC */ -#ifdef _LIBC -/* Local function for parse_bracket_exp used in _LIBC environment. - Seek the collating symbol entry corresponding to NAME. - Return the index of the symbol in the SYMB_TABLE, - or -1 if not found. */ +/* This function parse bracket expression like "[abc]", "[a-c]", + "[[.a-a.]]" etc. */ -static inline int32_t -__attribute__ ((always_inline)) -seek_collating_symbol_entry (const unsigned char *name, size_t name_len, - const int32_t *symb_table, int32_t table_size, - const unsigned char *extra) +static bin_tree_t * +parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err) { - int32_t elem; - - for (elem = 0; elem < table_size; elem++) - if (symb_table[2 * elem] != 0) - { - int32_t idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - if (/* Compare the length of the name. */ - name_len == extra[idx] - /* Compare the name. */ - && memcmp (name, &extra[idx + 1], name_len) == 0) - /* Yep, this is the entry. */ - return elem; - } - return -1; -} +#ifdef _LIBC + const unsigned char *collseqmb; + const char *collseqwc; + uint32_t nrules; + int32_t table_size; + const int32_t *symb_table; + const unsigned char *extra; -/* Local function for parse_bracket_exp used in _LIBC environment. - Look up the collation sequence value of BR_ELEM. - Return the value if succeeded, UINT_MAX otherwise. */ + /* Local function for parse_bracket_exp used in _LIBC environment. + Seek the collating symbol entry corresponding to NAME. + Return the index of the symbol in the SYMB_TABLE, + or -1 if not found. */ -static inline unsigned int -__attribute__ ((always_inline)) -lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, - const unsigned char *collseqmb, - const char *collseqwc, int32_t table_size, - const int32_t *symb_table, - const unsigned char *extra) -{ - if (br_elem->type == SB_CHAR) - { - /* if (MB_CUR_MAX == 1) */ - if (nrules == 0) - return collseqmb[br_elem->opr.ch]; - else - { - wint_t wc = __btowc (br_elem->opr.ch); - return __collseq_table_lookup (collseqwc, wc); - } - } - else if (br_elem->type == MB_CHAR) + auto inline int32_t + __attribute__ ((always_inline)) + seek_collating_symbol_entry (const unsigned char *name, size_t name_len) { - if (nrules != 0) - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + int32_t elem; + + for (elem = 0; elem < table_size; elem++) + if (symb_table[2 * elem] != 0) + { + int32_t idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + if (/* Compare the length of the name. */ + name_len == extra[idx] + /* Compare the name. */ + && memcmp (name, &extra[idx + 1], name_len) == 0) + /* Yep, this is the entry. */ + return elem; + } + return -1; } - else if (br_elem->type == COLL_SYM) + + /* Local function for parse_bracket_exp used in _LIBC environment. + Look up the collation sequence value of BR_ELEM. + Return the value if succeeded, UINT_MAX otherwise. */ + + auto inline unsigned int + __attribute__ ((always_inline)) + lookup_collation_sequence_value (bracket_elem_t *br_elem) { - size_t sym_name_len = strlen ((char *) br_elem->opr.name); - if (nrules != 0) + if (br_elem->type == SB_CHAR) { - int32_t elem, idx; - elem = seek_collating_symbol_entry (br_elem->opr.name, - sym_name_len, - symb_table, table_size, - extra); - if (elem != -1) + /* + if (MB_CUR_MAX == 1) + */ + if (nrules == 0) + return collseqmb[br_elem->opr.ch]; + else { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - /* Skip the byte sequence of the collating element. */ - idx += 1 + extra[idx]; - /* Adjust for the alignment. */ - idx = (idx + 3) & ~3; - /* Skip the multibyte collation sequence value. */ - idx += sizeof (unsigned int); - /* Skip the wide char sequence of the collating element. */ - idx += sizeof (unsigned int) * - (1 + *(unsigned int *) (extra + idx)); - /* Return the collation sequence value. */ - return *(unsigned int *) (extra + idx); + wint_t wc = __btowc (br_elem->opr.ch); + return __collseq_table_lookup (collseqwc, wc); } - else if (sym_name_len == 1) + } + else if (br_elem->type == MB_CHAR) + { + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + } + else if (br_elem->type == COLL_SYM) + { + size_t sym_name_len = strlen ((char *) br_elem->opr.name); + if (nrules != 0) { - /* No valid character. Match it as a single byte - character. */ - return collseqmb[br_elem->opr.name[0]]; + int32_t elem, idx; + elem = seek_collating_symbol_entry (br_elem->opr.name, + sym_name_len); + if (elem != -1) + { + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + /* Skip the byte sequence of the collating element. */ + idx += 1 + extra[idx]; + /* Adjust for the alignment. */ + idx = (idx + 3) & ~3; + /* Skip the multibyte collation sequence value. */ + idx += sizeof (unsigned int); + /* Skip the wide char sequence of the collating element. */ + idx += sizeof (unsigned int) * + (1 + *(unsigned int *) (extra + idx)); + /* Return the collation sequence value. */ + return *(unsigned int *) (extra + idx); + } + else if (sym_name_len == 1) + { + /* No valid character. Match it as a single byte + character. */ + return collseqmb[br_elem->opr.name[0]]; + } } + else if (sym_name_len == 1) + return collseqmb[br_elem->opr.name[0]]; } - else if (sym_name_len == 1) - return collseqmb[br_elem->opr.name[0]]; + return UINT_MAX; } - return UINT_MAX; -} -/* Local function for parse_bracket_exp used in _LIBC environment. - Build the range expression which starts from START_ELEM, and ends - at END_ELEM. The result are written to MBCSET and SBCSET. - RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument since we may - update it. */ - -static inline reg_errcode_t -__attribute__ ((always_inline)) -build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem, - re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, - const unsigned char *collseqmb, const char *collseqwc, - int32_t table_size, const int32_t *symb_table, - const unsigned char *extra) -{ - unsigned int ch; - uint32_t start_collseq; - uint32_t end_collseq; + /* Local function for parse_bracket_exp used in _LIBC environment. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. + RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument since we may + update it. */ - /* Equivalence Classes and Character Classes can't be a range - start/end. */ - if (__glibc_unlikely (start_elem->type == EQUIV_CLASS - || start_elem->type == CHAR_CLASS - || end_elem->type == EQUIV_CLASS - || end_elem->type == CHAR_CLASS)) - return REG_ERANGE; + auto inline reg_errcode_t + __attribute__ ((always_inline)) + build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem) + { + unsigned int ch; + uint32_t start_collseq; + uint32_t end_collseq; + + /* Equivalence Classes and Character Classes can't be a range + start/end. */ + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS + || start_elem->type == CHAR_CLASS + || end_elem->type == EQUIV_CLASS + || end_elem->type == CHAR_CLASS)) + return REG_ERANGE; - /* FIXME: Implement rational ranges here, too. */ - start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, - table_size, symb_table, extra); - end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, - table_size, symb_table, extra); - /* Check start/end collation sequence values. */ - if (__glibc_unlikely (start_collseq == UINT_MAX - || end_collseq == UINT_MAX)) - return REG_ECOLLATE; - if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) - && start_collseq > end_collseq)) - return REG_ERANGE; + /* FIXME: Implement rational ranges here, too. */ + start_collseq = lookup_collation_sequence_value (start_elem); + end_collseq = lookup_collation_sequence_value (end_elem); + /* Check start/end collation sequence values. */ + if (__glibc_unlikely (start_collseq == UINT_MAX + || end_collseq == UINT_MAX)) + return REG_ECOLLATE; + if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) + && start_collseq > end_collseq)) + return REG_ERANGE; - /* Got valid collation sequence values, add them as a new entry. - However, if we have no collation elements, and the character set - is single byte, the single byte character set that we - build below suffices. */ - if (nrules > 0 || dfa->mb_cur_max > 1) - { - /* Check the space of the arrays. */ - if (__glibc_unlikely (*range_alloc == mbcset->nranges)) + /* Got valid collation sequence values, add them as a new entry. + However, if we have no collation elements, and the character set + is single byte, the single byte character set that we + build below suffices. */ + if (nrules > 0 || dfa->mb_cur_max > 1) { - /* There is not enough space, need realloc. */ - uint32_t *new_array_start; - uint32_t *new_array_end; - int new_nranges; + /* Check the space of the arrays. */ + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) + { + /* There is not enough space, need realloc. */ + uint32_t *new_array_start; + uint32_t *new_array_end; + Idx new_nranges; + + /* +1 in case of mbcset->nranges is 0. */ + new_nranges = 2 * mbcset->nranges + 1; + new_array_start = re_realloc (mbcset->range_starts, uint32_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, uint32_t, + new_nranges); - /* +1 in case of mbcset->nranges is 0. */ - new_nranges = 2 * mbcset->nranges + 1; - new_array_start = re_realloc (mbcset->range_starts, uint32_t, - new_nranges); - new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); + if (__glibc_unlikely (new_array_start == NULL + || new_array_end == NULL)) + return REG_ESPACE; - if (__glibc_unlikely (new_array_start == NULL - || new_array_end == NULL)) - return REG_ESPACE; + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; + } - mbcset->range_starts = new_array_start; - mbcset->range_ends = new_array_end; - *range_alloc = new_nranges; + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; - } - - /* Build the table for single byte characters. */ - for (ch = 0; ch < SBC_MAX; ch++) - { - uint32_t ch_collseq; - /* if (MB_CUR_MAX == 1) */ - if (nrules == 0) - ch_collseq = collseqmb[ch]; - else - ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); - if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) - bitset_set (sbcset, ch); + /* Build the table for single byte characters. */ + for (ch = 0; ch < SBC_MAX; ch++) + { + uint32_t ch_collseq; + /* + if (MB_CUR_MAX == 1) + */ + if (nrules == 0) + ch_collseq = collseqmb[ch]; + else + ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) + bitset_set (sbcset, ch); + } + return REG_NOERROR; } - return REG_NOERROR; -} -/* Local function for parse_bracket_exp used in _LIBC environment. - Build the collating element which is represented by NAME. - The result are written to MBCSET and SBCSET. - COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument since we may update it. */ + /* Local function for parse_bracket_exp used in _LIBC environment. + Build the collating element which is represented by NAME. + The result are written to MBCSET and SBCSET. + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a + pointer argument since we may update it. */ -static inline reg_errcode_t -__attribute__ ((always_inline)) -build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, - int *coll_sym_alloc, const unsigned char *name, - uint32_t nrules, int32_t table_size, - const int32_t *symb_table, const unsigned char *extra) -{ - int32_t elem, idx; - size_t name_len = strlen ((const char *) name); - if (nrules != 0) + auto inline reg_errcode_t + __attribute__ ((always_inline)) + build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name) { - elem = seek_collating_symbol_entry (name, name_len, symb_table, - table_size, extra); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - } - else if (name_len == 1) + int32_t elem, idx; + size_t name_len = strlen ((const char *) name); + if (nrules != 0) { - /* No valid character, treat it as a normal - character. */ - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } - else - return REG_ECOLLATE; + elem = seek_collating_symbol_entry (name, name_len); + if (elem != -1) + { + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + } + else if (name_len == 1) + { + /* No valid character, treat it as a normal + character. */ + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + else + return REG_ECOLLATE; - /* Got valid collation sequence, add it as a new entry. */ - /* Check the space of the arrays. */ - if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) - { - /* Not enough, realloc it. */ - /* +1 in case of mbcset->ncoll_syms is 0. */ - int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; - /* Use realloc since mbcset->coll_syms is NULL - if *alloc == 0. */ - int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, - new_coll_sym_alloc); - if (__glibc_unlikely (new_coll_syms == NULL)) - return REG_ESPACE; - mbcset->coll_syms = new_coll_syms; - *coll_sym_alloc = new_coll_sym_alloc; + /* Got valid collation sequence, add it as a new entry. */ + /* Check the space of the arrays. */ + if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) + { + /* Not enough, realloc it. */ + /* +1 in case of mbcset->ncoll_syms is 0. */ + Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; + /* Use realloc since mbcset->coll_syms is NULL + if *alloc == 0. */ + int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, + new_coll_sym_alloc); + if (__glibc_unlikely (new_coll_syms == NULL)) + return REG_ESPACE; + mbcset->coll_syms = new_coll_syms; + *coll_sym_alloc = new_coll_sym_alloc; + } + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + return REG_NOERROR; } - mbcset->coll_syms[mbcset->ncoll_syms++] = idx; - return REG_NOERROR; - } - else - { - if (__glibc_unlikely (name_len != 1)) - return REG_ECOLLATE; else { - bitset_set (sbcset, name[0]); - return REG_NOERROR; + if (__glibc_unlikely (name_len != 1)) + return REG_ECOLLATE; + else + { + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } } } -} -#endif /* _LIBC */ - -/* This function parse bracket expression like "[abc]", "[a-c]", - "[[.a-a.]]" etc. */ - -static bin_tree_t * -parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, - reg_syntax_t syntax, reg_errcode_t *err) -{ -#ifdef _LIBC - const unsigned char *collseqmb; - const char *collseqwc = NULL; - uint32_t nrules; - int32_t table_size = 0; - const int32_t *symb_table = NULL; - const unsigned char *extra = NULL; #endif re_token_t br_token; @@ -3245,9 +3230,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #ifdef _LIBC *err = build_range_exp (sbcset, mbcset, &range_alloc, - &start_elem, &end_elem, - dfa, syntax, nrules, collseqmb, collseqwc, - table_size, symb_table, extra); + &start_elem, &end_elem); #else # ifdef RE_ENABLE_I18N *err = build_range_exp (syntax, sbcset, @@ -3300,8 +3283,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #ifdef RE_ENABLE_I18N mbcset, &coll_sym_alloc, #endif /* RE_ENABLE_I18N */ - start_elem.opr.name, - nrules, table_size, symb_table, extra); + start_elem.opr.name); if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break; -- 2.32.0