/* -*- mode: C -*- Time-stamp: "2009-01-05 08:53:02 mgold"
 *
 *       File:         pdf-tokeniser.h
 *       Date:         Mon Dec 29 00:45:09 2008
 *
 *       GNU PDF Library - Stream tokeniser
 *
 */

/* Copyright (C) 2008 Michael Gold */

/* This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef PDF_TOKENISER
#define PDF_TOKENISER
#include <pdf-types.h>
#include <pdf-stm-buffer.h>

/* States of the tokeniser's state machine.  Values are consecutive,
 * with 0 being the initial state. */
enum pdf_tokeniser_state_e {
  PDF_TOKENISER_STATE_NONE      = 0,  /* not reading a token */
  PDF_TOKENISER_STATE_WSPACE    = 1,  /* reading whitespace */
  PDF_TOKENISER_STATE_COMMENT   = 2,  /* reading a comment ('%' ...) */
  PDF_TOKENISER_STATE_KEYWORD   = 3,  /* reading regular characters */
  PDF_TOKENISER_STATE_NAME      = 4,  /* reading a name ('/' ...) */
  PDF_TOKENISER_STATE_STRING    = 5,  /* reading a literal string */
  PDF_TOKENISER_STATE_HEXSTRING = 6,  /* reading a hex string */
  PDF_TOKENISER_STATE_DICTEND   = 7,  /* got one '>', expecting another */
  PDF_TOKENISER_STATE_PENDING   = 8   /* a token must be emitted ASAP */
};

/* Tokeniser states (from pdf_tokeniser_state_e):
 * NONE - Initial state: not reading a token.
 * WSPACE - Reading whitespace into buffer.
 * COMMENT - Reading a comment. buffer collects the comment bytes, including
 *           the initial '%'.
 * KEYWORD - Reading some regular characters into buffer; this could result
 *           in a symbol like "null", or a number.
 * NAME - Reading a name (which starts with '/').
 *   Substates:
 *     0 - normal state
 *     1 - just read a '#' (escape prefix)
 *     2 - read the first hex digit after '#'; the value is in charparam
 *   buffer collects the name, excluding the initial '/'.
 * STRING - Reading a literal string (enclosed in '(', ')').
 *   Substates:
 *     0 - normal state
 *     1 - ignore the next byte if its value is 10 (ASCII LF;
 *         this is used to treat CRLF as a single line ending)
 *     2 - just saw a backslash (escape prefix)
 *     3 - read 1 octal digit; the value is in charparam
 *     4 - read 2 octal digits; the value is in charparam
 *   intparam is the bracket nesting level; ')' at level 0 ends the string.
 *   buffer collects the string.
 * HEXSTRING - Reading a hex string.
 *   Substates:
 *     0 - initial state: we just saw the opening '<', and if the next byte is
 *         also '<' this is the start of a dictionary rather than a string
 *     1 - normal state (the next hex digit will be the first in a pair)
 *     2 - read the first hex digit; its value is in charparam
 *     3 - end state; saw the closing '>'
 *   buffer collects the string.
 * DICTEND - Just got a '>'; expecting another.
 *   Substates:
 *     0 - starting state
 *     1 - saw the second '>'
 * PENDING - Need to emit a token (determined by charparam) ASAP.
 */

/* Tokeniser option flags.  Each flag is a distinct single bit. */
enum pdf_tokeniser_flag_e {
  /* return comments as tokens */
  PDF_TOKENISER_FLAG_RET_COMMENTS = 1 << 0,
  /* disallow '#' escapes in names */
  PDF_TOKENISER_FLAG_PDF11 = 1 << 1
};

/* Internal state of a tokeniser.
 *
 * The interpretation of substate, charparam and intparam depends on the
 * current state; see the description of the tokeniser states that follows
 * pdf_tokeniser_state_e. */
struct pdf_tokeniser_s {
  int flags;  /* miscellaneous settings (from pdf_tokeniser_flag_e) */
  int state;  /* current state (from pdf_tokeniser_state_e) */
  /* state-specific substate number (used by the NAME, STRING, HEXSTRING
   * and DICTEND states) */
  int substate;
  /* state-specific byte value: e.g. the partial value of a '#' escape, an
   * octal escape or a hex digit pair; in the PENDING state it determines
   * the token to emit */
  pdf_char_t charparam;
  /* state-specific integer; in the STRING state this is the bracket
   * nesting level */
  int intparam;
  /* collects the bytes of the token being read */
  pdf_stm_buffer_t buffer;
};

/* TODO: use pdf_obj structures for tokens */

/* Kinds of token; values are consecutive, starting from 0. */
enum pdf_token_type_e {
  PDF_TOKEN_TYPE_WSPACE      = 0,
  PDF_TOKEN_TYPE_COMMENT     = 1,
  PDF_TOKEN_TYPE_KEYWORD     = 2,
  PDF_TOKEN_TYPE_INTEGER     = 3,
  PDF_TOKEN_TYPE_REAL        = 4,
  PDF_TOKEN_TYPE_NAME        = 5,
  PDF_TOKEN_TYPE_STRING      = 6,
  PDF_TOKEN_TYPE_DICT_START  = 7,
  PDF_TOKEN_TYPE_DICT_END    = 8,
  PDF_TOKEN_TYPE_ARRAY_START = 9,
  PDF_TOKEN_TYPE_ARRAY_END   = 10,
  PDF_TOKEN_TYPE_PROC_START  = 11,
  PDF_TOKEN_TYPE_PROC_END    = 12
};

/* A single token.
 *
 * NOTE(review): which member of `value' is meaningful presumably depends
 * on `type' (e.g. `integer' for INTEGER tokens, `real' for REAL tokens,
 * `buffer' for tokens that carry bytes) -- confirm against the
 * implementation. */
struct pdf_token_s
{
  char type;         /* token kind; presumably a pdf_token_type_e value */
  char _priv_flags;  /* private flags; meaning is not visible in this header */
  union {
    pdf_stm_buffer_t buffer;  /* token data held in a stream buffer */
    double real;              /* floating-point value */
    int integer;              /* integer value */
  } value;
};

/* BEGIN PUBLIC */
/* Handle to a tokeniser (pointer to struct pdf_tokeniser_s). */
typedef struct pdf_tokeniser_s *pdf_tokeniser_t;

/* Create a tokeniser; the handle is passed back through `context'.
 * Returns a pdf_status_t status code.
 * NOTE(review): presumably allocates the context, which must later be
 * released with pdf_tokeniser_destroy -- confirm in the implementation. */
pdf_status_t pdf_tokeniser_new(pdf_tokeniser_t *context);

/* Destroy the tokeniser `context'.  Returns a pdf_status_t status code. */
pdf_status_t pdf_tokeniser_destroy(pdf_tokeniser_t context);

/* Handle to a token (pointer to struct pdf_token_s). */
typedef struct pdf_token_s *pdf_token_t;

/* Read a token using the tokeniser `context', taking input from the stream
 * buffer `in' and passing the result back through `token'.
 * Returns a pdf_status_t status code.
 * NOTE(review): `finish_p' presumably tells the tokeniser that no more
 * input will follow, so a partially read token should be completed --
 * confirm in the implementation.
 * NOTE(review): unlike the other public functions, this name lacks the
 * pdf_tokeniser_ prefix. */
pdf_status_t read_token(pdf_tokeniser_t context,
    pdf_stm_buffer_t in, pdf_token_t *token, pdf_bool_t finish_p);
/* END PUBLIC */


#endif

/* End of pdf-tokeniser.h */