/* fsalex - Repackage pattern text as compact, expressive tokens Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ /* Written June, 1988 by Mike Haertel Modified July, 1988 by Arthur David Olson to assist BMG speedups */ /* 2014: Repackaged by "untangle" script, written by behoffski. */ /* Always import environment-specific configuration items first. */ #include /* define _GNU_SOURCE for regex extensions. */ #include #include "charclass.h" #include #include "fsalex.h" #include "fsatoken.h" #include #include #include "mbcsets.h" #include "proto-lexparse.h" #include #include #include #include #include #include #include "xalloc.h" #include "gettext.h" #define _(str) gettext (str) /* ISASCIIDIGIT differs from isdigit, as follows: - Its arg may be any int or unsigned int; it need not be an unsigned char. - It's guaranteed to evaluate its argument exactly once. - It's typically faster. Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless it's important to use the locale's definition of "digit" even when the host does not conform to Posix. */ #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) #define STREQ(a, b) (strcmp (a, b) == 0) #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif /* The following list maps the names of the Posix named character classes to predicate functions that determine whether a given character is in the class. The leading [ has already been eaten by the lexical analyzer. Additional objects are provided to assist the client: wchar_desc for multibyte matching, and class for octet matching. Lazy evaluation and caching are used to minimise processing costs, so these additional items are only valid after a class has been located using find_pred (). */ typedef int predicate_t (wint_t, wctype_t); typedef struct predicate_entry_struct { const char *name; wctype_t wchar_desc; charclass_t *class; bool may_be_multibyte; } predicate_entry_t; /* This list is a template, copied into each lexer's state, and interrogated and updated from there. The membership of a class can vary due to locale and other settings, so each lexer must maintain its own list. Duplicate class sharing across different lexer instances is facilitated by checks in charclass_completed. */ /* Locale portability note: We may need to record the entire explicit locale when syntax () is called, and then use isalpha_l () etc. here, as otherwise the wchar_desc may be interpreted at runtime in the context of the current locale. If a complete copy of the locale needs to be stored locally, see uselocale(3) and duplocale(3) (and free the copy with freelocale(3)). */ static predicate_entry_t template_predicate_list[] = { {"alpha", 0, NULL, true}, {"alnum", 0, NULL, true}, {"blank", 0, NULL, true}, {"cntrl", 0, NULL, true}, {"digit", 0, NULL, false}, {"graph", 0, NULL, true}, {"lower", 0, NULL, true}, {"print", 0, NULL, true}, {"punct", 0, NULL, true}, {"space", 0, NULL, true}, {"upper", 0, NULL, true}, {"xdigit", 0, NULL, true}, {NULL, 0, NULL} }; #define PREDICATE_TEMPLATE_ITEMS \ (sizeof template_predicate_list / sizeof *template_predicate_list) /* Flesh out the opaque instance context type given in the header. */ struct fsalex_ctxt_struct { /* Singly-linked list of all lexer instances, so destroy_module can release all resources by traversing the list. */ fsalex_ctxt_t *next_instance; /* Using the lexer without setting the syntax is a fatal error, so use a flag so we can report such errors in a direct fashion. */ bool syntax_initialised; /* Syntax flags/characters directing how to interpret the pattern. */ reg_syntax_t syntax_bits; bool case_fold; unsigned char eolbyte; /* Exception handling is done by explicit callbacks. */ fsalex_warn_callback_fn *warn_client; fsalex_error_callback_fn *abandon_with_error; /* Pattern pointer/length, updated as pattern is consumed. */ char const *lexptr; size_t lexleft; /* Break out some regex syntax bits into boolean vars. Do this for the ones that are heavily used, and/or where the nature of the bitmask flag test tends to clutter the lexer code. */ bool re_gnu_ops; /* GNU regex operators are allowed. */ /* Carry dotclass here, as it's easier for clients (UTF-8) to perform class operations with this class, rather than to know intimate details of the regex syntax configuration bits and items such as eolbyte. */ charclass_t *dotclass; /* ".": All chars except eolbyte and/or NUL, depending on syntax flags. */ charclass_index_t dotclass_index; /* Work variables to help organise lexer operation. */ fsatoken_token_t lasttok; bool laststart; size_t parens; /* Character class predicate mapping/caching table. */ predicate_entry_t predicates[PREDICATE_TEMPLATE_ITEMS]; /* Minrep and maxrep are actually associated with the REPMN token, and need to be accessible outside this module (by the parser), perhaps by an explicit interface call. In the far, far future, a completely-reworked token list may see these values properly become integrated into the token stream (perhaps by a pair of "Parameter" tokens? Perhaps by a MINREP token with 1 parameter, followed by a MAXREP token with a corresponding parameter?) */ int minrep, maxrep; /* Booleans to simplify unibyte/multibyte code selection paths. In addition, other flags are placed here to summarise properties of the locale in a concise fashion that can useful when implementing optimisations. There may be some overlap and/or redundancy here; flag names are chosen to let the user be direct and concise, but we only provide names for the more popular cases, not all. */ char *locale_name; bool unibyte_locale; bool multibyte_locale; bool ascii_7bit_encoding; bool using_simple_locale; /* REVIEWME: Wide-character support variables. */ int cur_mb_len; /* Length (in bytes) of the last character fetched; this is needed when backing up during lexing. In a non-multibyte situation (locale?), this variable remains at 1; otherwise, it is updated as required by FETCH_WC. */ /* These variables are used only if in a multibyte locale. */ wchar_t wctok; /* Storage for a single multibyte character, used both during lexing, and as the implied parameter of a WCHAR token returned by the lexer. */ mbstate_t mbrtowc_state; /* State management area for mbrtowc to use. */ /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is not a valid single-byte character. */ wint_t mbrtowc_cache[FSATOKEN_NOTCHAR]; mbcsets_set_t **mbcsets; size_t nmbcsets; size_t mbcsets_alloc; }; /* Linked list of all instances created by this module. */ static fsalex_ctxt_t *fsalex_instance_list_head = NULL; /* Ensure that the array addressed by PTR holds at least NITEMS + (PTR || !NITEMS) items. Either return PTR, or reallocate the array and return its new address. Although PTR may be null, the returned value is never null. The array holds *NALLOC items; *NALLOC is updated on reallocation. ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays growing linearly. */ static void * maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) { if (nitems < *nalloc) return ptr; *nalloc = nitems; return x2nrealloc (ptr, nalloc, itemsize); } /* Set a bit in the charclass for the given wchar_t. Do nothing if WC is represented by a multi-byte sequence. Even in unibyte locales, this may happen when folding case in weird Turkish locales where dotless i/dotted I are not included in the chosen character set. Return whether a bit was set in the charclass. */ /* Set a bit for B and its case variants in the charclass C. We must be in an unibyte locale. */ static void setbit_case_fold_c (int b, charclass_t *c) { int ub = toupper (b); int i; for (i = 0; i < FSATOKEN_NOTCHAR; i++) if (toupper (i) == ub) charclass_setbit (i, c); } /* Convert a possibly-signed character to an unsigned character. This is a bit safer than casting to unsigned char, since it catches some type errors that the cast doesn't. */ static unsigned char to_uchar (char ch) { return ch; } static void initialise_uchar_to_wc_cache (fsalex_ctxt_t *lexer) { int i; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { char c = i; unsigned char uc = i; mbstate_t s = { 0 }; wchar_t wc; lexer->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; } } /* This function is intimately connected with multibyte (wide-char) handling in the macro FETCH_WC below, in the case where FETCH_SINGLE_CHAR has run but the result has been found to be inconclusive. It works by unwinding the FETCH_SINGLE_CHAR side-effects (lexptr/lexleft), then calling mbrtowc on the pattern space, and communicates mbrtowc's understanding of the octet stream back to the caller: - If a valid multibyte octet sequence is next, then the wide character associated with this sequence is written back to *p_wchar, and the number of octets consumed is returned; or - If the sequence is invalid for any reason, the mbrtowc working state is reset (zeroed), *p_wchar is not modified, and 1 is returned. Lexer state variables, including cur_mb_len, mbs, lexleft and lexptr, are updated as appropriate by this function (mainly if mbrtowc succeeds). The wide char NUL is unusual as it is a 1-octet sequence, the length returned is 0, we report it as length 1, but write the converted wide character in temp_wchar to the caller. */ /* Additional notes: This code, in partnership with the macro FETCH_WC, is closely related to mbs_to_wchar in dfa.c. There is documentation there (e.g. pattern must end in a sentinel, shift encodings not supported, plus other comments/guarantees) that is important, but I'm deferring writing up anything at present until I see how this code is received. */ static size_t fetch_offset_wide_char (fsalex_ctxt_t *lexer, wchar_t *p_wchar) { size_t nbytes; wchar_t temp_wchar; nbytes = mbrtowc (&temp_wchar, lexer->lexptr - 1, lexer->lexleft + 1, &lexer->mbrtowc_state); switch (nbytes) { case (size_t) -2: case (size_t) -1: /* Conversion failed: Incomplete (-2) or invalid (-1) sequence. */ memset (&lexer->mbrtowc_state, 0, sizeof (lexer->mbrtowc_state)); return 1; case (size_t) 0: /* This is the wide NUL character, actually 1 byte long. */ nbytes = 1; break; default: /* Converted character is in temp_wchar, and nbytes is a byte count. */ break; } /* We converted 1 or more bytes, tell result to caller. */ *p_wchar = temp_wchar; /* Update the number of bytes consumed (offset by 1 since FETCH_SINGLE_CHAR grabbed one earlier). */ lexer->lexptr += nbytes - 1; lexer->lexleft -= nbytes - 1; return nbytes; } /* Single-character input fetch, with EOF/error handling. Note that characters become unsigned here. If no characters are available, the macro either returns END or reports an error, depending on eoferr. Otherwise, one character is consumed (lexptr/lexleft), the char is converted into an unsigned char, and is written into the parameter c. */ #define FETCH_SINGLE_CHAR(lexer, c, eoferr) \ do { \ if (! (lexer)->lexleft) \ { \ if ((eoferr) != 0) \ (lexer)->abandon_with_error (eoferr); \ else \ return FSATOKEN_TK_END; \ } \ (c) = to_uchar (*(lexer)->lexptr++); \ (lexer)->lexleft--; \ } while (0) /* Do the fetch in stages: Single char, octet+multibyte cache check, and possible wide char fetch if the cache result indicates that the input sequence is longer than a single octet. The first fetch handles end-of-input cases (if this happens, control never reaches the rest of the macro); otherwise, it returns temp_uchar which is used in the cache lookup, and may be the single-octet result. A cache result of WEOF means that the octet is not a complete sequence by itself, so a second fetch tweaks lexptr/lexleft to undo the single-char-fetch side-effects, and, depending on mbrtowc valid/invalid result, propagates either the multichar fetch or the single-char fetch back to the caller. */ # define FETCH_WC(lexer, c, wc, eoferr) \ do { \ wchar_t temp_wc; \ unsigned char temp_uchar; \ (lexer)->cur_mb_len = 1; \ FETCH_SINGLE_CHAR ((lexer), temp_uchar, (eoferr)); \ temp_wc = (lexer)->mbrtowc_cache[temp_uchar]; \ if (temp_wc != WEOF) \ { \ (c) = temp_uchar; \ (wc) = temp_wc; \ } \ else \ { \ size_t nbytes; \ temp_wc = temp_uchar; \ nbytes = fetch_offset_wide_char ((lexer), &temp_wc); \ (wc) = temp_wc; \ (c) = nbytes == 1 ? temp_uchar : EOF; \ (lexer)->cur_mb_len = nbytes; \ } \ } while (0) /* Given a predicate name, find it in a list, and report the list entry to the caller. If the name is not recognised, the function returns NULL. The list entry includes a charclass set and (if relevant) a wide-char descriptor for testing for the predicate. Lazy evaluation and caching are used to keep processing costs down. */ static predicate_entry_t * find_pred (fsalex_ctxt_t *lexer, const char *str) { predicate_entry_t *p_entry; charclass_t *work_class; for (p_entry = lexer->predicates; p_entry->name; p_entry++) { if (STREQ (str, p_entry->name)) break; } /* If there was no matching predicate name found, return NULL. */ if (! p_entry->name) return NULL; /* Is the charclass pointer NULL for this entry? */ if (p_entry->class == NULL) { /* Yes, allocate, set up and cache a charclass for this predicate. Note that the wchar_desc entries were set up in fsalex_syntax (). */ int i; charclass_index_t index; wctype_t wctype_desc; wctype_desc = p_entry->wchar_desc; work_class = charclass_alloc (); for (i = 0; i < FSATOKEN_NOTCHAR; i++) { wchar_t wc; /* Try integer->unsigned char->wide char using lexer's mbrtowc_cache array, and, if successful, test for class membership, and set the bit in the class if the value is a member. */ /* FIXME: iswctype depends on the *current* locale (LC_CTYPE); this breaks the long-term goal of having each lexer instance use only the locale in force when fsalex_syntax was called. We can fix this by using isalnum_l () etc here, but carrying around a full copy of the locale in the lexer instance may be expensive or possibly non-portable, so it's being avoided while this code is still in an experimental/proof-of-concept form. */ wc = lexer->mbrtowc_cache[i]; if (iswctype (wc, wctype_desc)) charclass_setbit (i, work_class); } /* Mark the class as completed, and obtain a persistent class pointer. */ index = charclass_completed (work_class); p_entry->class = charclass_get_pointer (index); } /* Return predicate entry to the caller. */ return p_entry; } /* The set of wchar_t values C such that there's a useful locale somewhere where C != towupper (C) && C != towlower (towupper (C)). For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ static short const lonesome_lower[] = { 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase counterpart in locales predating Unicode 4.0.0 (April 2003). */ 0x03F2, 0x03F5, 0x1E9B, 0x1FBE, }; /* Maximum number of wide characters that can be the case-folded counterparts of a single wide character, not counting the character itself. This is 1 for towupper, 1 for towlower, and 1 for each entry in lonesome_lower[]. */ enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 }; /* Given a wide character c, and working to a search specification that demands that case-folded variants of the character must match wherever that character could match, generate a list of all possible match counterparts to the specified character, and return that list, the characters via a parameter pointer, and the list length (possibly 0) via the function return value. */ /* ?? CHECKME: Should parameter "c" in, and/or "folded" out, be wint_t? */ static int case_folded_counterparts (fsalex_ctxt_t *lexer, wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) { int i; int n = 0; wint_t uc = towupper (c); wint_t lc = towlower (uc); if (uc != c) folded[n++] = uc; if (lc != uc && lc != c && towupper (lc) == uc) folded[n++] = lc; for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) { wint_t li = lonesome_lower[i]; if (li != lc && li != uc && li != c && towupper (li) == uc) folded[n++] = li; } return n; } /* Multibyte character handling sub-routine for lex. Parse a bracket expression and build a struct mb_char_classes. */ static fsatoken_token_t parse_bracket_exp (fsalex_ctxt_t *lexer) { bool invert; int c, c1, c2; charclass_t *ccl; mbcsets_set_t *work_mbc = NULL; /* This is a bracket expression that dfaexec is known to process correctly. */ bool known_bracket_exp = true; /* Used to warn about [:space:]. Bit 0 = first character is a colon. Bit 1 = last character is a colon. Bit 2 = includes any other character but a colon. Bit 3 = includes ranges, char/equiv classes or collation elements. */ int colon_warning_state; wint_t wc; wint_t wc2; wint_t wc1 = 0; if (lexer->multibyte_locale) { /* Ensure we have space to store the new set. */ lexer->mbcsets = maybe_realloc (lexer->mbcsets, lexer->nmbcsets, &lexer->mbcsets_alloc, sizeof work_mbc); /* Initialize work area. */ work_mbc = mbcsets_new (); lexer->mbcsets[lexer->nmbcsets++] = work_mbc; } ccl = charclass_alloc (); FETCH_WC (lexer, c, wc, _("unbalanced [")); if (c == '^') { FETCH_WC (lexer, c, wc, _("unbalanced [")); invert = true; known_bracket_exp = lexer->using_simple_locale; } else invert = false; colon_warning_state = (c == ':'); do { c1 = FSATOKEN_NOTCHAR; /* Mark c1 as not initialized. */ colon_warning_state &= ~2; /* Note that if we're looking at some other [:...:] construct, we just treat it as a bunch of ordinary characters. We can do this because we assume regex has checked for syntax errors before dfa is ever called. */ if (c == '[') { FETCH_WC (lexer, c1, wc1, _("unbalanced [")); if ((c1 == ':' && (lexer->syntax_bits & RE_CHAR_CLASSES)) || c1 == '.' || c1 == '=') { enum { MAX_BRACKET_STRING_LEN = 32 }; char str[MAX_BRACKET_STRING_LEN + 1]; size_t len = 0; for (;;) { FETCH_WC (lexer, c, wc, _("unbalanced [")); if ((c == c1 && *lexer->lexptr == ']') || lexer->lexleft == 0) break; if (len < MAX_BRACKET_STRING_LEN) str[len++] = c; else /* This is in any case an invalid class name. */ str[0] = '\0'; } str[len] = '\0'; /* Fetch bracket. */ FETCH_WC (lexer, c, wc, _("unbalanced [")); if (c1 == ':') /* Find and merge named character class. POSIX allows character classes to match multicharacter collating elements, but the regex code does not support that, so do not worry about that possibility. */ { char const *class; predicate_entry_t *pred; class = str; if (lexer->case_fold && (STREQ (class, "upper") || STREQ (class, "lower"))) class = "alpha"; pred = find_pred (lexer, class); if (! pred) lexer->abandon_with_error (_("invalid character class")); charclass_unionset (pred->class, ccl); /* Does this class have a wide-char type descriptor? */ if (lexer->multibyte_locale && pred->may_be_multibyte) { mbcsets_add_class(work_mbc, pred->wchar_desc); } } else known_bracket_exp = false; colon_warning_state |= 8; /* Fetch new lookahead character. */ FETCH_WC (lexer, c1, wc1, _("unbalanced [")); continue; } /* We treat '[' as a normal character here. c/c1/wc/wc1 are already set up. */ } if (c == '\\' && (lexer->syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) FETCH_WC (lexer, c, wc, _("unbalanced [")); if (c1 == FSATOKEN_NOTCHAR) FETCH_WC (lexer, c1, wc1, _("unbalanced [")); if (c1 == '-') /* build range characters. */ { FETCH_WC (lexer, c2, wc2, _("unbalanced [")); /* A bracket expression like [a-[.aa.]] matches an unknown set. Treat it like [-a[.aa.]] while parsing it, and remember that the set is unknown. */ if (c2 == '[' && *lexer->lexptr == '.') { known_bracket_exp = false; c2 = ']'; } if (c2 != ']') { if (c2 == '\\' && (lexer->syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) FETCH_WC (lexer, c2, wc2, _("unbalanced [")); if (lexer->multibyte_locale) { /* Merely record range as given; don't try to handle the case-folding variants here for now. */ if (wc != WEOF && wc2 != WEOF) { mbcsets_add_range(work_mbc, wc, wc2); /* FIXME, part 2: Since the canonical measure of case equivalence is uppercase(A) == uppercase(B), perhaps we could do this transformation above if lexer->case_fold is true... but need to be careful not to throw away information. */ } } else if (lexer->using_simple_locale) { charclass_setbit_range (c, c2, ccl); if (lexer->case_fold) { int uc = toupper (c); int uc2 = toupper (c2); for (c1 = 0; c1 < FSATOKEN_NOTCHAR; c1++) { int uc1 = toupper (c1); if (uc <= uc1 && uc1 <= uc2) charclass_setbit (c1, ccl); } } } else known_bracket_exp = false; colon_warning_state |= 8; FETCH_WC (lexer, c1, wc1, _("unbalanced [")); continue; } /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ lexer->lexptr -= lexer->cur_mb_len; lexer->lexleft += lexer->cur_mb_len; } colon_warning_state |= (c == ':') ? 2 : 4; if (lexer->unibyte_locale) { if (lexer->case_fold) setbit_case_fold_c (c, ccl); else charclass_setbit (c, ccl); continue; } if (wc == WEOF) known_bracket_exp = false; else { wchar_t folded[CASE_FOLDED_BUFSIZE + 1]; int n = 1; folded[0] = wc; if (lexer->case_fold) n += case_folded_counterparts (lexer, wc, &folded[1]); mbcsets_add_wchar_list (work_mbc, n, folded); } } while ((wc = wc1, (c = c1) != ']')); if (colon_warning_state == 7) lexer->warn_client (_("character class syntax is [[:space:]], not [:space:]")); if (! known_bracket_exp) return FSATOKEN_TK_BACKREF; if (lexer->multibyte_locale) { mbcsets_set_match_sense (work_mbc, invert); mbcsets_receive_incomplete_charclass (work_mbc, ccl); mbcsets_completed (work_mbc); return FSATOKEN_TK_MBCSET; } if (invert) { charclass_notset (ccl); if (lexer->syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) charclass_clrbit (lexer->eolbyte, ccl); } return FSATOKEN_TK_CSET + charclass_completed (ccl); } fsatoken_token_t fsalex_lex (fsalex_ctxt_t *lexer) { int c; bool backslash = false; int i; predicate_entry_t *predicate; charclass_t *work_class; charclass_index_t class_index; mbcsets_set_t *work_mbc; bool invert; /* Ensure that syntax () has been called on this lexer instance; many things will fail if this isn't done. */ assert (lexer->syntax_initialised); /* Basic plan: We fetch a character. If it's a backslash, we set the backslash flag and go through the loop again. On the plus side, this avoids having a duplicate of the main switch inside the backslash case. On the minus side, it means that just about every case begins with "if (backslash) ...". */ for (i = 0; i < 2; ++i) { FETCH_WC (lexer, c, lexer->wctok, NULL); switch (c) { case '\\': if (backslash) goto normal_char; if (lexer->lexleft == 0) lexer->abandon_with_error (_("unfinished \\ escape")); backslash = true; break; case '^': if (backslash) goto normal_char; if (lexer->syntax_bits & RE_CONTEXT_INDEP_ANCHORS || lexer->lasttok == FSATOKEN_TK_END || lexer->lasttok == FSATOKEN_TK_LPAREN || lexer->lasttok == FSATOKEN_TK_OR) return lexer->lasttok = FSATOKEN_TK_BEGLINE; goto normal_char; case '$': if (backslash) goto normal_char; if (lexer->syntax_bits & RE_CONTEXT_INDEP_ANCHORS || lexer->lexleft == 0 || (lexer->syntax_bits & RE_NO_BK_PARENS ? lexer->lexleft > 0 && *lexer->lexptr == ')' : lexer->lexleft > 1 && lexer->lexptr[0] == '\\' && lexer->lexptr[1] == ')') || (lexer->syntax_bits & RE_NO_BK_VBAR ? lexer->lexleft > 0 && *lexer->lexptr == '|' : lexer->lexleft > 1 && lexer->lexptr[0] == '\\' && lexer->lexptr[1] == '|') || ((lexer->syntax_bits & RE_NEWLINE_ALT) && lexer->lexleft > 0 && *lexer->lexptr == '\n')) return lexer->lasttok = FSATOKEN_TK_ENDLINE; goto normal_char; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (backslash && !(lexer->syntax_bits & RE_NO_BK_REFS)) { lexer->laststart = false; return lexer->lasttok = FSATOKEN_TK_BACKREF; } goto normal_char; case '`': if (backslash && lexer->re_gnu_ops) return lexer->lasttok = FSATOKEN_TK_BEGLINE; /* FIXME: should be beginning of string */ goto normal_char; case '\'': if (backslash && lexer->re_gnu_ops) return lexer->lasttok = FSATOKEN_TK_ENDLINE; /* FIXME: should be end of string */ goto normal_char; case '<': if (backslash && lexer->re_gnu_ops) return lexer->lasttok = FSATOKEN_TK_BEGWORD; goto normal_char; case '>': if (backslash && lexer->re_gnu_ops) return lexer->lasttok = FSATOKEN_TK_ENDWORD; goto normal_char; case 'b': if (backslash && lexer->re_gnu_ops) return lexer->lasttok = FSATOKEN_TK_LIMWORD; goto normal_char; case 'B': if (backslash && lexer->re_gnu_ops) return lexer->lasttok = FSATOKEN_TK_NOTLIMWORD; goto normal_char; case '?': if (lexer->syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((lexer->syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart) goto normal_char; return lexer->lasttok = FSATOKEN_TK_QMARK; case '*': if (backslash) goto normal_char; if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart) goto normal_char; return lexer->lasttok = FSATOKEN_TK_STAR; case '+': if (lexer->syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((lexer->syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart) goto normal_char; return lexer->lasttok = FSATOKEN_TK_PLUS; case '{': if (!(lexer->syntax_bits & RE_INTERVALS)) goto normal_char; if (backslash != ((lexer->syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart) goto normal_char; /* Cases: {M} - exact count {M,} - minimum count, maximum is infinity {,N} - 0 through N {,} - 0 to infinity (same as '*') {M,N} - M through N */ { char const *p = lexer->lexptr; char const *lim = p + lexer->lexleft; int minrep = -1; int maxrep = -1; for (; p != lim && ISASCIIDIGIT (*p); p++) { if (minrep < 0) minrep = *p - '0'; else minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0'); } if (p != lim) { if (*p != ',') maxrep = minrep; else { if (minrep < 0) minrep = 0; while (++p != lim && ISASCIIDIGIT (*p)) { if (maxrep < 0) maxrep = *p - '0'; else maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0'); } } } if (! ((! backslash || (p != lim && *p++ == '\\')) && p != lim && *p++ == '}' && 0 <= minrep && (maxrep < 0 || minrep <= maxrep))) { if (lexer->syntax_bits & RE_INVALID_INTERVAL_ORD) goto normal_char; lexer->abandon_with_error (_("invalid content of \\{\\}")); } if (RE_DUP_MAX < maxrep) lexer->abandon_with_error (_("regular expression too big")); lexer->lexptr = p; lexer->lexleft = lim - p; lexer->minrep = minrep; lexer->maxrep = maxrep; } lexer->laststart = false; return lexer->lasttok = FSATOKEN_TK_REPMN; case '|': if (lexer->syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((lexer->syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; lexer->laststart = true; return lexer->lasttok = FSATOKEN_TK_OR; case '\n': if (lexer->syntax_bits & RE_LIMITED_OPS || backslash || !(lexer->syntax_bits & RE_NEWLINE_ALT)) goto normal_char; lexer->laststart = true; return lexer->lasttok = FSATOKEN_TK_OR; case '(': if (backslash != ((lexer->syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; ++lexer->parens; lexer->laststart = true; return lexer->lasttok = FSATOKEN_TK_LPAREN; case ')': if (backslash != ((lexer->syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; if (lexer->parens == 0 && lexer->syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; --lexer->parens; lexer->laststart = false; return lexer->lasttok = FSATOKEN_TK_RPAREN; case '.': if (backslash) goto normal_char; lexer->laststart = false; if (lexer->multibyte_locale) { /* In multibyte environment period must match with a single character not a byte. So we use FSATOKEN_TK_ANYCHAR. */ return lexer->lasttok = FSATOKEN_TK_ANYCHAR; } return lexer->lasttok = FSATOKEN_TK_CSET + lexer->dotclass_index; case 's': case 'S': /* "\s" == "[[:space:]]"; "\S" == "[^[:space:]]". */ if (! (backslash && lexer->re_gnu_ops)) goto normal_char; lexer->laststart = false; invert = c == 'S'; predicate = find_pred (lexer, "space"); if (c == 's') class_index = charclass_get_index (predicate->class); else { /* Invert the predicate class in a work class. */ work_class = charclass_alloc (); charclass_copyset (predicate->class, work_class); charclass_notset (work_class); class_index = charclass_completed (work_class); } if (lexer->unibyte_locale) return lexer->lasttok = FSATOKEN_TK_CSET + class_index; /* FIXME: see if optimizing this, as is done with FSATOKEN_TK_ANYCHAR and add_utf8_anychar, makes sense. */ /* Multibyte locale: Fill out an entire set description. */ lexer->mbcsets = maybe_realloc (lexer->mbcsets, lexer->nmbcsets, &lexer->mbcsets_alloc, sizeof work_mbc); work_mbc = mbcsets_new (); lexer->mbcsets[lexer->nmbcsets++] = work_mbc; mbcsets_set_match_sense (work_mbc, invert); mbcsets_add_class (work_mbc, predicate->wchar_desc); /* ?? REVIEWME: "invert" in parse_bracket_exp leads to FSATOKEN_TK_BACKREF (== "this is too hard for me") token via known_bracket_exp flag. */ if ((c == 'S') && !lexer->using_simple_locale) return lexer->lasttok = FSATOKEN_TK_BACKREF; else return lexer->lasttok = FSATOKEN_TK_MBCSET; case 'w': case 'W': /* Can mean "[_[:alnum:]]" (\w) or its inverse (\W). */ if (! (backslash && lexer->re_gnu_ops)) goto normal_char; lexer->laststart = false; predicate = find_pred (lexer, "alnum"); work_class = charclass_alloc (); charclass_copyset (predicate->class, work_class); charclass_setbit ('_', work_class); if (c == 'W') charclass_notset (work_class); return lexer->lasttok = FSATOKEN_TK_CSET + charclass_completed (work_class); case '[': if (backslash) goto normal_char; lexer->laststart = false; return lexer->lasttok = parse_bracket_exp (lexer); default: normal_char: lexer->laststart = false; /* For multibyte character sets, folding is done in atom, so always return FSATOKEN_TK_WCHAR. */ if (lexer->multibyte_locale) return lexer->lasttok = FSATOKEN_TK_WCHAR; if (lexer->case_fold && isalpha (c)) { charclass_t *ccl = charclass_alloc (); setbit_case_fold_c (c, ccl); return lexer->lasttok = FSATOKEN_TK_CSET + charclass_completed (ccl); } return lexer->lasttok = c; } } /* The above loop should consume at most a backslash and some other character. */ abort (); return FSATOKEN_TK_END; /* keeps pedantic compilers happy. */ } /* Receive the pattern, and reset the lexical analyser state. The interpretation of the chars (octets?) in the pattern (ASCII chars? variable-length UTF-8 sequences? Simplified Chinese? etc.) depends on the locale that was in force when fsalex_syntax () was called. NULs may be present amongst the codes, which is why the length is given explicitly, rather than relying on strlen(3). */ void fsalex_pattern (fsalex_ctxt_t *lexer, char const *pattern, size_t const pattern_len) { /* Copy parameters to internal state variables. */ lexer->lexptr = pattern; lexer->lexleft = pattern_len; /* Reset lexical scanner state. */ lexer->lasttok = FSATOKEN_TK_END; lexer->laststart = 1; lexer->parens = 0; /* Reset multibyte parsing state. */ lexer->cur_mb_len = 1; memset (&lexer->mbrtowc_state, 0, sizeof (lexer->mbrtowc_state)); } /* Report whether codes 0-127 conform to ASCII encoding. This is handy for optimisation, particularly lower-case and upper-case characters, as the codes are contiguous, unlike EBCDIC. ASCII is also a common denominator in many other unibyte code pages (locales), and also in the multibyte UTF-8 locale. */ static bool _GL_ATTRIBUTE_PURE char_encoding_7bits_is_ascii (void) { /* True if the native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good enough in practice, as only ASCII and EBCDIC are in common use and this test correctly accepts ASCII and rejects EBCDIC. */ const bool is_ascii = '\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 && '}' == 125 && '~' == 126; return is_ascii; } /* Receive syntax directives, and other pattern interpretation instructions such as case folding and end-of-line character. In addition, this function configures various internal structures based on the locale in force. */ void fsalex_syntax (fsalex_ctxt_t *lexer, reg_syntax_t bits, int fold, unsigned char eol) { charclass_t *work_class; predicate_entry_t *pred; char const *locale_name; /* Set a flag noting that this lexer has had its syntax params set. */ lexer->syntax_initialised = true; /* Record the function parameters in our local context. */ lexer->syntax_bits = bits; lexer->case_fold = fold; lexer->eolbyte = eol; /* Set up unibyte/multibyte flags, based on MB_CUR_MAX, which depends on the current locale. We capture this information here as the locale may change later. At present, we don't capture MB_CUR_MAX itself. */ if (MB_CUR_MAX > 1) { /* Multibyte locale: Prepare booleans to make code easier to read */ lexer->unibyte_locale = false; lexer->multibyte_locale = true; /* Discard any earlier storage we may have acquired. */ free (lexer->mbcsets); lexer->mbcsets = NULL; /* Set up an array of structures to hold multibyte character sets. */ lexer->nmbcsets = 0; lexer->mbcsets_alloc = 2; lexer->mbcsets = xzalloc (sizeof (*lexer->mbcsets) * lexer->mbcsets_alloc); } else { /* Unibyte locale: Prepare booleans to make code easier to read */ lexer->unibyte_locale = true; lexer->multibyte_locale = false; } /* Charclass guarantees that class index 0 is zeroclass, so we don't need to set it up here. */ /* Set up a character class to match anychar ('.'), tailored to accommodate options from the regex syntax. */ work_class = charclass_alloc (); charclass_notset (work_class); if (! (lexer->syntax_bits & RE_DOT_NEWLINE)) { charclass_clrbit (lexer->eolbyte, work_class); } if (lexer->syntax_bits & RE_DOT_NOT_NULL) { charclass_clrbit (0, work_class); } lexer->dotclass_index = charclass_completed (work_class); lexer->dotclass = charclass_get_pointer (lexer->dotclass_index); /* Testing for the absence of RE_NO_GNU_OPS in syntax_bits happens often, so set a direct flag variable: This makes code more readable. */ lexer->re_gnu_ops = ! (lexer->syntax_bits & RE_NO_GNU_OPS); /* Initialise cache and other tables that have syntax and/or locale influences. */ /* Set up the wchar_desc fields of the predicate table. */ for (pred = lexer->predicates; pred->name != NULL; pred++) pred->wchar_desc = wctype (pred->name); /* Record the name of the current locale. Note that setlocale can return NULL for the name. */ lexer->locale_name = NULL; locale_name = setlocale (LC_ALL, NULL); if (locale_name != NULL) lexer->locale_name = xstrdup (locale_name); /* Determine if the locale's treatment of codes 0-127 matches ASCII, as some operations (e.g. "[c-v]") become easier to handle. */ lexer->ascii_7bit_encoding = char_encoding_7bits_is_ascii (); /* Further optimisations are possible if the locale is an unibyte ASCII 7-bit C/POSIX environment, and this is a common case, so spend some effort looking for this situation. */ lexer->using_simple_locale = false; if (lexer->unibyte_locale && lexer->ascii_7bit_encoding) { lexer->using_simple_locale = ! lexer->locale_name || STREQ (lexer->locale_name, "C") || STREQ (lexer->locale_name, "POSIX"); } /* Initialise cache that distinguishes between unibyte and multibyte (wide) characters based on the leading octet. */ initialise_uchar_to_wc_cache (lexer); /* ?? If a complete copy of the locale needs to be stored within this instance, see uselocale(3) and duplocale(3) (and free the copy with freelocale(3)). */ } /* Define external function to do non-core data exchanges between the lexer and the parser. This function must conform to proto_lexparse_exchange_fn_t. This interface lets two instances communicate without requiring help from an outside party. */ int fsalex_exchange (fsalex_ctxt_t *lexer, proto_lexparse_opcode_t opcode, void *param) { switch (opcode) { case PROTO_LEXPARSE_OP_GET_IS_MULTIBYTE_LOCALE: return (int) lexer->multibyte_locale; case PROTO_LEXPARSE_OP_GET_REPMN_MIN: return lexer->minrep; case PROTO_LEXPARSE_OP_GET_REPMN_MAX: return lexer->maxrep; case PROTO_LEXPARSE_OP_GET_WIDE_CHAR_LIST_MAX: /* Original token, plus upper/lowercase, plus extended variants. */ return 1 + CASE_FOLDED_BUFSIZE; case PROTO_LEXPARSE_OP_GET_WIDE_CHARS: { wchar_t *p_wc = (wchar_t *) param; /* Record the original token (wctok) implicitly associated with WCHAR. */ *p_wc++ = lexer->wctok; /* Add in direct and/or indirect case equivalents, as appropriate. */ if (lexer->case_fold) p_wc += case_folded_counterparts (lexer, lexer->wctok, p_wc); /* Return how many wide chars (at least 1) are in the list. */ return p_wc - (wchar_t *) param; } break; case PROTO_LEXPARSE_OP_GET_DOTCLASS: *((charclass_t **) param) = lexer->dotclass; break; case PROTO_LEXPARSE_OP_GET_MBCSET: /* Return the set associated with the most recent MBCSET token. */ *((mbcsets_set_t **) param) = lexer->mbcsets[lexer->nmbcsets - 1]; break; default: /* ?? Not sure if we should complain/assert or merely ignore an opcode that we don't recognise here. */ assert (!"unrecognised PROTO_LEXPARSE opcode"); /* NOTREACHED */ break; } /* If we reach here, return value is unimportant, so just say 0. */ return 0; } /* Receive functions to deal with exceptions detected by the lexer: Warnings and errors. Internally, we add the _Noreturn attribute to the error callback, to help the compiler with code flow analysis. */ void fsalex_exception_fns (fsalex_ctxt_t *lexer, fsalex_warn_callback_fn *warningfn, fsalex_error_callback_fn *errorfn) { /* Record the provided functions in the lexer's context. */ lexer->warn_client = warningfn; lexer->abandon_with_error = errorfn; } /* Add "not provided!" stub function that gets called if the client fails to provide proper resources. This is a hack, merely to get the module started; better treatment needs to be added later. */ static void no_function_provided (void *unused) { assert (!"fsalex: Plug-in function required, but not provided."); } /* Generate a new instance of an FSA lexer. */ fsalex_ctxt_t * fsalex_new (void) { fsalex_ctxt_t *new_context; /* Acquire zeroed memory for new lexer context. */ new_context = XZALLOC (fsalex_ctxt_t); /* ?? Point warning and error functions to a "you need to tell me these first!" function? */ new_context->warn_client = (fsalex_warn_callback_fn *) no_function_provided; new_context->abandon_with_error = (fsalex_error_callback_fn *) no_function_provided; /* Default to working in a non-multibyte locale. In some cases, FETCH_WC never sets this variable (as it's assumed to be 1), so fulfil this expectation here. */ new_context->cur_mb_len = 1; /* Copy the template predicate list into this context, so that we can have lexer-specific named predicate classes. */ memcpy (new_context->predicates, template_predicate_list, sizeof (new_context->predicates)); /* Default to unibyte locale at first; the final locale setting is made according to what's in force when fsalex_syntax () is called. */ new_context->unibyte_locale = true; new_context->multibyte_locale = false; /* Many things depend on decisions made in fsalex_syntax (), so note here that it hasn't been called yet, and fail gracefully later if the client hasn't called the function before commencing work. */ new_context->syntax_initialised = false; /* Add this instance at the head of the module's list. */ new_context->next_instance = fsalex_instance_list_head; fsalex_instance_list_head = new_context; return new_context; } /* Internal function to free all resources directly or indirectly used by an instance. The pointer is no longer valid after this call. */ static void free_instance (fsalex_ctxt_t *lexer) { /* Free top-level structures. */ free (lexer->locale_name); free (lexer->mbcsets); free (lexer); /* Note that multibyte sets are no longer stored explicitly in the instance; apart from the pointer list (freed above), the set contents are freed separately when mbcsets module is destroyed. */ } /* Destroy all lexer instances, plus any associated resources owned by the module. */ void fsalex_destroy_module (void) { fsalex_ctxt_t *p_list; fsalex_ctxt_t *p_next; /* Move the global list head into a local variable, and immediately clear the global. This is a half-hearted attempt to avoid race conditions; to do things properly, a system-wide atomic operation (locked, including multi-CPU cache coherency) operation should be used. */ p_list = fsalex_instance_list_head; fsalex_instance_list_head = NULL; /* Traverse the list of instances, releasing all resources associated with each one. */ while (p_list) { p_next = p_list->next_instance; free_instance (p_list); p_list = p_next; } } /* Prepare module for operation. */ void fsalex_initialise (void) { /* Initialise the linked list of instances created by this module. */ fsalex_instance_list_head = NULL; atexit (fsalex_destroy_module); } /* vim:set shiftwidth=2: */