/* fsatoken - Create tokens for a compact, coherent regular expression
   language

   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software
   Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc.,
   51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */

/* Written June, 1988 by Mike Haertel
   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */

/* 2014: Repackaged by "untangle" script, written by behoffski.  */

/* Regular expression patterns are presented as text, possibly ASCII; the
   format is very expressive, but this comes at the cost of being somewhat
   expensive to interpret (including identifying invalid patterns).  By
   tokenising the pattern, we make life much easier for the parser and
   other search machinery that follows.

   This file defines the tokens that we use, both for the benefit of the
   lexer/parser/dfa analyser that share this information, and for other
   machinery (such as the C compiler) that may need to store and/or
   manipulate these items.  */

#ifndef FSATOKEN_H
#define FSATOKEN_H 1

/* Always import environment-specific configuration items first.
   NOTE(review): the header name was missing from this directive; the
   configuration header is assumed to be <config.h>, and it is only
   pulled in when the build announces it via HAVE_CONFIG_H -- confirm
   against the project's build setup.  */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* Obtain defn. of ptrdiff_t from stddef.h, and CHAR_BIT from limits.h  */
#include <stddef.h>
#include <limits.h>

/* C stream octets, and non-stream EOF, are self-representing tokens.
   We need to include stdio.h to obtain the definition of EOF.  */
#include <stdio.h>

/* First integer value that is greater than any character code.  */
enum
{
  FSATOKEN_NOTCHAR = 1 << CHAR_BIT
};

/* The regexp is parsed into an array of tokens in postfix form.  Some
   tokens are operators and others are terminal symbols.  Most (but not
   all) of these codes are returned by the lexical analyzer.  */
typedef ptrdiff_t fsatoken_token_t;

/* Predefined token values.  Each enumerator is described by the comment
   that precedes it.  */
enum
{
  /* FSATOKEN_TK_END is a terminal symbol that matches the end of input;
     any value of FSATOKEN_TK_END or less in the parse tree is such a
     symbol.  Accepting states of the DFA are those that would have a
     transition on FSATOKEN_TK_END.  */
  FSATOKEN_TK_END = -1,

  /* Ordinary character values are terminal symbols that match
     themselves.  */

  /* FSATOKEN_TK_EMPTY is a terminal symbol that matches the empty
     string.  */
  FSATOKEN_TK_EMPTY = FSATOKEN_NOTCHAR,

  /* FSATOKEN_TK_BACKREF is generated by \<digit> or by any other
     construct that is not completely handled.  If the scanner detects a
     transition on backref, it returns a kind of "semi-success"
     indicating that the match will have to be verified with a
     backtracking matcher.  */
  FSATOKEN_TK_BACKREF,

  /* FSATOKEN_TK_BEGLINE is a terminal symbol that matches the empty
     string at the beginning of a line.  */
  FSATOKEN_TK_BEGLINE,

  /* FSATOKEN_TK_ENDLINE is a terminal symbol that matches the empty
     string at the end of a line.  */
  FSATOKEN_TK_ENDLINE,

  /* FSATOKEN_TK_BEGWORD is a terminal symbol that matches the empty
     string at the beginning of a word.  */
  FSATOKEN_TK_BEGWORD,

  /* FSATOKEN_TK_ENDWORD is a terminal symbol that matches the empty
     string at the end of a word.  */
  FSATOKEN_TK_ENDWORD,

  /* FSATOKEN_TK_LIMWORD is a terminal symbol that matches the empty
     string at the beginning or the end of a word.  */
  FSATOKEN_TK_LIMWORD,

  /* FSATOKEN_TK_NOTLIMWORD is a terminal symbol that matches the empty
     string not at the beginning or end of a word.  */
  FSATOKEN_TK_NOTLIMWORD,

  /* FSATOKEN_TK_QMARK is an operator of one argument that matches zero
     or one occurrences of its argument.  */
  FSATOKEN_TK_QMARK,

  /* FSATOKEN_TK_STAR is an operator of one argument that matches the
     Kleene closure (zero or more occurrences) of its argument.  */
  FSATOKEN_TK_STAR,

  /* FSATOKEN_TK_PLUS is an operator of one argument that matches the
     positive closure (one or more occurrences) of its argument.  */
  FSATOKEN_TK_PLUS,

  /* FSATOKEN_TK_REPMN is a lexical token corresponding to the {m,n}
     construct.  FSATOKEN_TK_REPMN never appears in the compiled token
     vector.  */
  FSATOKEN_TK_REPMN,

  /* FSATOKEN_TK_CAT is an operator of two arguments that matches the
     concatenation of its arguments.  FSATOKEN_TK_CAT is never returned
     by the lexical analyzer.  */
  FSATOKEN_TK_CAT,

  /* FSATOKEN_TK_OR is an operator of two arguments that matches either
     of its arguments.  */
  FSATOKEN_TK_OR,

  /* FSATOKEN_TK_LPAREN never appears in the parse tree, it is only a
     lexeme.  */
  FSATOKEN_TK_LPAREN,

  /* FSATOKEN_TK_RPAREN never appears in the parse tree.  */
  FSATOKEN_TK_RPAREN,

  /* FSATOKEN_TK_ANYCHAR is a terminal symbol that matches a valid
     multibyte (or single byte) character.  It is used only if
     MB_CUR_MAX > 1.  */
  FSATOKEN_TK_ANYCHAR,

  /* FSATOKEN_TK_MBCSET is similar to FSATOKEN_TK_CSET, but for
     multibyte characters.  */
  FSATOKEN_TK_MBCSET,

  /* Only returned by lex.  wctok contains the wide character
     representation.  */
  FSATOKEN_TK_WCHAR,

  /* FSATOKEN_TK_CSET (and any value greater) is a terminal symbol that
     matches any of a class of characters.  This must remain the last
     enumerator, since every larger value is also treated as a CSET.  */
  FSATOKEN_TK_CSET
};

/* prtok - Display token name (for debugging) */
#ifdef DEBUG
extern void fsatoken_prtok (fsatoken_token_t t);
#endif /* DEBUG */

#endif /* FSATOKEN_H */

/* vim:set shiftwidth=2: */