/* -*- mode: C -*- Time-stamp: "2009-01-05 08:53:02 mgold"
*
* File: pdf-tokeniser.h
* Date: Mon Dec 29 00:45:09 2008
*
* GNU PDF Library - Stream tokeniser
*
*/
/* Copyright (C) 2008 Michael Gold */
/* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef PDF_TOKENISER
#define PDF_TOKENISER
#include
#include
/* Top-level states of the tokeniser state machine.  The numeric values are
 * spelled out explicitly; they are the same ones the compiler would assign
 * by default. */
enum pdf_tokeniser_state_e
{
  PDF_TOKENISER_STATE_NONE      = 0,  /* initial state: not reading a token */
  PDF_TOKENISER_STATE_WSPACE    = 1,  /* reading whitespace */
  PDF_TOKENISER_STATE_COMMENT   = 2,  /* reading a comment (starts with '%') */
  PDF_TOKENISER_STATE_KEYWORD   = 3,  /* reading regular characters: a symbol
                                         such as "null", or a number */
  PDF_TOKENISER_STATE_NAME      = 4,  /* reading a name (starts with '/') */
  PDF_TOKENISER_STATE_STRING    = 5,  /* reading a literal string in '(' ')' */
  PDF_TOKENISER_STATE_HEXSTRING = 6,  /* reading a hex string */
  PDF_TOKENISER_STATE_DICTEND   = 7,  /* just got a '>'; expecting another */
  PDF_TOKENISER_STATE_PENDING   = 8   /* a token must be emitted ASAP */
};
/* Tokeniser states (from pdf_tokeniser_state_e):
* NONE - Initial state: not reading a token.
* WSPACE - Reading whitespace into buffer.
* COMMENT - Reading a comment. buffer collects the comment bytes, including
* the initial '%'.
* KEYWORD - Reading some regular characters into buffer; this could result
* in a symbol like "null", or a number.
* NAME - Reading a name (which starts with '/').
* Substates:
* 0 - normal state
* 1 - just read a '#' (escape prefix)
* 2 - read the first hex digit after '#'; the value is in charparam
* buffer collects the name, excluding the initial '/'.
* STRING - Reading a literal string (enclosed in '(', ')').
* Substates:
* 0 - normal state
* 1 - ignore the next byte if its value is 10 (ASCII LF;
* this is used to treat CRLF as a single line ending)
* 2 - just saw a backslash (escape prefix)
* 3 - read 1 octal digit; the value is in charparam
* 4 - read 2 octal digits; the value is in charparam
* intparam is the bracket nesting level; ')' at level 0 ends the string.
* buffer collects the string.
* HEXSTRING - Reading a hex string.
* Substates:
* 0 - initial state: we just saw the opening '<', and if the next byte is
* also '<' this is the start of a dictionary rather than a string
* 1 - normal state (the next hex digit will be the first in a pair)
* 2 - read the first hex digit; its value is in charparam
* 3 - end state; saw the closing '>'
* buffer collects the string.
* DICTEND - Just got a '>'; expecting another.
* Substates:
* 0 - starting state
* 1 - saw the second '>'
* PENDING - Need to emit a token (determined by charparam) ASAP.
*/
/* Setting bits for the `flags` member of struct pdf_tokeniser_s.  Each
 * constant occupies its own bit. */
enum pdf_tokeniser_flag_e
{
  PDF_TOKENISER_FLAG_RET_COMMENTS = 1 << 0,  /* return comments as tokens */
  PDF_TOKENISER_FLAG_PDF11        = 1 << 1   /* disallow '#' escapes in names */
};
/* Internal state of a tokeniser.  The meaning of substate, charparam and
 * intparam depends on the current value of state. */
struct pdf_tokeniser_s {
int flags; /* miscellaneous settings (from pdf_tokeniser_flag_e) */
/* current state (from pdf_tokeniser_state_e) */
int state;
/* state-specific substate number; 0 is the normal/starting substate */
int substate;
/* state-specific byte: the value of a partially read hex or octal escape,
 * or (in the PENDING state) what determines the token to emit */
pdf_char_t charparam;
/* state-specific integer: the bracket nesting level in the STRING state */
int intparam;
/* collects the bytes of the token currently being read */
pdf_stm_buffer_t buffer;
};
//TODO: use pdf_obj structures for tokens
/* Kinds of token.  The numeric values are spelled out explicitly; they are
 * the same ones the compiler would assign by default. */
enum pdf_token_type_e
{
  PDF_TOKEN_TYPE_WSPACE      = 0,   /* whitespace */
  PDF_TOKEN_TYPE_COMMENT     = 1,   /* comment */
  PDF_TOKEN_TYPE_KEYWORD     = 2,   /* keyword */
  PDF_TOKEN_TYPE_INTEGER     = 3,   /* integer number */
  PDF_TOKEN_TYPE_REAL        = 4,   /* real number */
  PDF_TOKEN_TYPE_NAME        = 5,   /* name */
  PDF_TOKEN_TYPE_STRING      = 6,   /* string */
  PDF_TOKEN_TYPE_DICT_START  = 7,   /* start of a dictionary */
  PDF_TOKEN_TYPE_DICT_END    = 8,   /* end of a dictionary */
  PDF_TOKEN_TYPE_ARRAY_START = 9,   /* start of an array */
  PDF_TOKEN_TYPE_ARRAY_END   = 10,  /* end of an array */
  PDF_TOKEN_TYPE_PROC_START  = 11,  /* start of a procedure */
  PDF_TOKEN_TYPE_PROC_END    = 12   /* end of a procedure */
};
/* A single token: a type tag, a private flags byte and a value union.
 * NOTE(review): `type` presumably holds a pdf_token_type_e value that
 * selects the active member of `value` -- confirm against the
 * implementation. */
struct pdf_token_s
{
char type;
/* private flags; their meaning is not visible in this header */
char _priv_flags;
union {
/* byte-buffer value; NOTE(review): presumably used by the token types
 * that carry text (names, strings, keywords, ...) -- confirm */
pdf_stm_buffer_t buffer;
/* NOTE(review): presumably the value of a PDF_TOKEN_TYPE_REAL token */
double real;
/* NOTE(review): presumably the value of a PDF_TOKEN_TYPE_INTEGER token */
int integer;
} value;
};
/* BEGIN PUBLIC */
/* Tokeniser handle: a pointer to struct pdf_tokeniser_s, which is defined
 * outside the BEGIN PUBLIC / END PUBLIC section of this header. */
typedef struct pdf_tokeniser_s *pdf_tokeniser_t;
/* NOTE(review): presumably creates a tokeniser and stores its handle in
 * *context, returning a status code -- confirm against the implementation. */
pdf_status_t pdf_tokeniser_new(pdf_tokeniser_t *context);
/* NOTE(review): presumably releases a tokeniser obtained from
 * pdf_tokeniser_new -- confirm against the implementation. */
pdf_status_t pdf_tokeniser_destroy(pdf_tokeniser_t context);
/* Token handle: a pointer to struct pdf_token_s. */
typedef struct pdf_token_s *pdf_token_t;
/* NOTE(review): presumably reads the next token from `in` using the
 * tokeniser state in `context` and hands it back through *token; finish_p
 * presumably tells the tokeniser that no further input will follow.
 * Confirm both against the implementation.
 * Unlike the other functions declared in this header, this name does not
 * carry the pdf_tokeniser_ prefix. */
pdf_status_t read_token(pdf_tokeniser_t context,
pdf_stm_buffer_t in, pdf_token_t *token, pdf_bool_t finish_p);
/* END PUBLIC */
#endif
/* End of pdf-tokeniser.h */