/* -*- mode: C -*- Time-stamp: "2009-01-05 08:53:02 mgold" * * File: pdf-tokeniser.h * Date: Mon Dec 29 00:45:09 2008 * * GNU PDF Library - Stream tokeniser * */ /* Copyright (C) 2008 Michael Gold */ /* This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef PDF_TOKENISER #define PDF_TOKENISER #include #include enum pdf_tokeniser_state_e { PDF_TOKENISER_STATE_NONE = 0, PDF_TOKENISER_STATE_WSPACE, PDF_TOKENISER_STATE_COMMENT, PDF_TOKENISER_STATE_KEYWORD, PDF_TOKENISER_STATE_NAME, PDF_TOKENISER_STATE_STRING, PDF_TOKENISER_STATE_HEXSTRING, PDF_TOKENISER_STATE_DICTEND, PDF_TOKENISER_STATE_PENDING }; /* Tokeniser states (from pdf_tokeniser_state_e): * NONE - Initial state: not reading a token. * WSPACE - Reading whitespace into buffer. * COMMENT - Reading a comment. buffer collects the comment bytes, including * the initial '%'. * KEYWORD - Reading some regular characters into buffer; this could result * in a symbol like "null", or a number. * NAME - Reading a name (which starts with '/'). * Substates: * 0 - normal state * 1 - just read a '#' (escape prefix) * 2 - read the first hex digit after '#'; the value is in charparam * buffer collects the name, excluding the initial '/'. * STRING - Reading a literal string (enclosed in '(', ')'). * Substates: * 0 - normal state * 1 - ignore the next byte if its value is 10 (ASCII LF; * this is used to treat CRLF as a single line ending) * 2 - just saw a backslash (escape prefix) * 3 - read 1 octal digit; the value is in charparam * 4 - read 2 octal digits; the value is in charparam * intparam is the bracket nesting level; ')' at level 0 ends the string. * buffer collects the string. * HEXSTRING - Reading a hex string. * Substates: * 0 - initial state: we just saw the opening '<', and if the next byte is * also '<' this is the start of a dictionary rather than a string * 1 - normal state (the next hex digit will be the first in a pair) * 2 - read the first hex digit; its value is in charparam * 3 - end state; saw the closing '>' * buffer collects the string. * DICTEND - Just got a '>'; expecting another. * Substates: * 0 - starting state * 1 - saw the second '>' * PENDING - Need to emit a token (determined by charparam) ASAP. */ enum pdf_tokeniser_flag_e { PDF_TOKENISER_FLAG_RET_COMMENTS = 1, /* return comments as tokens */ PDF_TOKENISER_FLAG_PDF11 = 2, /* disallow '#' escapes in names */ }; /* Internal state */ struct pdf_tokeniser_s { int flags; /* miscellaneous settings (from pdf_tokeniser_flag_e) */ int state; int substate; pdf_char_t charparam; int intparam; pdf_stm_buffer_t buffer; }; //TODO: use pdf_obj structures for tokens enum pdf_token_type_e { PDF_TOKEN_TYPE_WSPACE, PDF_TOKEN_TYPE_COMMENT, PDF_TOKEN_TYPE_KEYWORD, PDF_TOKEN_TYPE_INTEGER, PDF_TOKEN_TYPE_REAL, PDF_TOKEN_TYPE_NAME, PDF_TOKEN_TYPE_STRING, PDF_TOKEN_TYPE_DICT_START, PDF_TOKEN_TYPE_DICT_END, PDF_TOKEN_TYPE_ARRAY_START, PDF_TOKEN_TYPE_ARRAY_END, PDF_TOKEN_TYPE_PROC_START, PDF_TOKEN_TYPE_PROC_END, }; struct pdf_token_s { char type; char _priv_flags; union { pdf_stm_buffer_t buffer; double real; int integer; } value; }; /* BEGIN PUBLIC */ typedef struct pdf_tokeniser_s *pdf_tokeniser_t; pdf_status_t pdf_tokeniser_new(pdf_tokeniser_t *context); pdf_status_t pdf_tokeniser_destroy(pdf_tokeniser_t context); typedef struct pdf_token_s *pdf_token_t; pdf_status_t read_token(pdf_tokeniser_t context, pdf_stm_buffer_t in, pdf_token_t *token, pdf_bool_t finish_p); /* END PUBLIC */ #endif /* End of pdf_tokeniser.h */