/* fsalex - Repackage pattern text as compact, expressive tokens

   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software
   Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc.,
   51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */

/* Written June, 1988 by Mike Haertel
   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */

/* 2014: Repackaged by "untangle" script, written by behoffski.  */

/* Always import environment-specific configuration items first.  */
#include <config.h>    /* define _GNU_SOURCE for regex extensions.  */

#include <assert.h>
#include "charclass.h"
#include <ctype.h>
#include "fsalex.h"
#include "fsatoken.h"
#include <limits.h>
#include <locale.h>
#include "mbcsets.h"
#include "proto-lexparse.h"
#include <regex.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>
#include "xalloc.h"

#include "gettext.h"
#define _(str) gettext (str)

/* ISASCIIDIGIT differs from isdigit, as follows:
   - Its arg may be any int or unsigned int; it need not be an unsigned char.
   - It's guaranteed to evaluate its argument exactly once.
   - It's typically faster.
   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to isdigit unless
   it's important to use the locale's definition of "digit" even when the
   host does not conform to Posix.  */
#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)

#define STREQ(a, b) (strcmp (a, b) == 0)

#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif

/* The following list maps the names of the Posix named character
   classes to predicate functions that determine whether a given
   character is in the class.  The leading [ has already been eaten by
   the lexical analyzer.  Additional objects are provided to assist
   the client: wchar_desc for multibyte matching, and class for octet
   matching.  Lazy evaluation and caching are used to minimise
   processing costs, so these additional items are only valid after a
   class has been located using find_pred ().  */
typedef int predicate_t (wint_t, wctype_t);
typedef struct predicate_entry_struct
{
  const char *name;
  wctype_t wchar_desc;
  charclass_t *class;
  bool may_be_multibyte;
} predicate_entry_t;

/* This list is a template, copied into each lexer's state, and
   interrogated and updated from there.  The membership of a class can
   vary due to locale and other settings, so each lexer must maintain
   its own list.  Duplicate class sharing across different lexer
   instances is facilitated by checks in charclass_completed.  */
/* Locale portability note: We may need to record the entire explicit
   locale when syntax () is called, and then use isalpha_l () etc.
   here, as otherwise the wchar_desc may be interpreted at runtime in
   the context of the current locale.  If a complete copy of the locale
   needs to be stored locally, see uselocale(3) and duplocale(3) (and
   free the copy with freelocale(3)).  */
static predicate_entry_t template_predicate_list[] = {
  {"alpha",  0, NULL, true},
  {"alnum",  0, NULL, true},
  {"blank",  0, NULL, true},
  {"cntrl",  0, NULL, true},
  {"digit",  0, NULL, false},
  {"graph",  0, NULL, true},
  {"lower",  0, NULL, true},
  {"print",  0, NULL, true},
  {"punct",  0, NULL, true},
  {"space",  0, NULL, true},
  {"upper",  0, NULL, true},
  {"xdigit", 0, NULL, true},
  {NULL, 0, NULL}
};

#define PREDICATE_TEMPLATE_ITEMS \
    (sizeof template_predicate_list / sizeof *template_predicate_list)

/* Flesh out the opaque instance context type given in the header.  */
struct fsalex_ctxt_struct
{
  /* Singly-linked list of all lexer instances, so destroy_module can
     release all resources by traversing the list.  */
  fsalex_ctxt_t *next_instance;

  /* Using the lexer without setting the syntax is a fatal error, so
     use a flag so we can report such errors in a direct fashion.  */
  bool syntax_initialised;

  /* Syntax flags/characters directing how to interpret the pattern.  */
  reg_syntax_t syntax_bits;
  bool case_fold;
  unsigned char eolbyte;

  /* Exception handling is done by explicit callbacks.  */
  fsalex_warn_callback_fn *warn_client;
  fsalex_error_callback_fn *abandon_with_error;

  /* Pattern pointer/length, updated as pattern is consumed.  */
  char const *lexptr;
  size_t lexleft;

  /* Break out some regex syntax bits into boolean vars.  Do this for
     the ones that are heavily used, and/or where the nature of the
     bitmask flag test tends to clutter the lexer code.  */
  bool re_gnu_ops;             /* GNU regex operators are allowed.  */

  /* Carry dotclass here, as it's easier for clients (UTF-8) to perform
     class operations with this class, rather than to know intimate
     details of the regex syntax configuration bits and items such as
     eolbyte.  */
  charclass_t *dotclass;         /* ".": All chars except eolbyte and/or
                                        NUL, depending on syntax flags.  */
  charclass_index_t dotclass_index;

  /* Work variables to help organise lexer operation.  */
  fsatoken_token_t lasttok;
  bool laststart;
  size_t parens;

  /* Character class predicate mapping/caching table.  */
  predicate_entry_t predicates[PREDICATE_TEMPLATE_ITEMS];

  /* Minrep and maxrep are actually associated with the REPMN token,
     and need to be accessible outside this module (by the parser),
     perhaps by an explicit interface call.  In the far, far future,
     a completely-reworked token list may see these values properly
     become integrated into the token stream (perhaps by a pair of
     "Parameter" tokens?  Perhaps by a MINREP token with 1 parameter,
     followed by a MAXREP token with a corresponding parameter?)  */
  int minrep, maxrep;

  /* Booleans to simplify unibyte/multibyte code selection paths.  In
     addition, other flags are placed here to summarise properties of
     the locale in a concise fashion that can useful when
     implementing optimisations.  There may be some overlap and/or
     redundancy here; flag names are chosen to let the user be direct
     and concise, but we only provide names for the more popular cases,
     not all.  */
  char *locale_name;
  bool unibyte_locale;
  bool multibyte_locale;
  bool ascii_7bit_encoding;
  bool using_simple_locale;

  /* REVIEWME: Wide-character support variables.  */
  int cur_mb_len;       /* Length (in bytes) of the last character
                           fetched; this is needed when backing up
                           during lexing.  In a non-multibyte situation
                           (locale?), this variable remains at 1;
                           otherwise, it is updated as required by
                           FETCH_WC.  */

  /* These variables are used only if in a multibyte locale.  */
  wchar_t wctok;         /* Storage for a single multibyte character,
                            used both during lexing, and as the implied
                            parameter of a WCHAR token returned by the
                            lexer.  */
  mbstate_t mbrtowc_state; /* State management area for mbrtowc to
                              use. */

  /* A table indexed by byte values that contains the corresponding wide
     character (if any) for that byte.  WEOF means the byte is not a
     valid single-byte character.  */
  wint_t mbrtowc_cache[FSATOKEN_NOTCHAR];

  mbcsets_set_t **mbcsets;
  size_t nmbcsets;
  size_t mbcsets_alloc;
};

/* Linked list of all instances created by this module.  */
static fsalex_ctxt_t *fsalex_instance_list_head = NULL;

/* Ensure that the array addressed by PTR holds at least NITEMS +
   (PTR || !NITEMS) items.  Either return PTR, or reallocate the array
   and return its new address.  Although PTR may be null, the returned
   value is never null.

   The array holds *NALLOC items; *NALLOC is updated on reallocation.
   ITEMSIZE is the size of one item.  Avoid O(N**2) behavior on arrays
   growing linearly.  */
static void *
maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
{
  if (nitems < *nalloc)
    return ptr;
  *nalloc = nitems;
  return x2nrealloc (ptr, nalloc, itemsize);
}

/* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
   is represented by a multi-byte sequence.  Even in unibyte locales,
   this may happen when folding case in weird Turkish locales where
   dotless i/dotted I are not included in the chosen character set.
   Return whether a bit was set in the charclass.  */
/* Set a bit for B and its case variants in the charclass C.
  We must be in an unibyte locale.  */
static void
setbit_case_fold_c (int b, charclass_t *c)
{
  int ub = toupper (b);
  int i;
  for (i = 0; i < FSATOKEN_NOTCHAR; i++)
    if (toupper (i) == ub)
      charclass_setbit (i, c);
}

/* Convert a possibly-signed character to an unsigned character.  This is
   a bit safer than casting to unsigned char, since it catches some type
   errors that the cast doesn't.  */
static unsigned char
to_uchar (char ch)
{
  return ch;
}

static void
initialise_uchar_to_wc_cache (fsalex_ctxt_t *lexer)
{
  int i;
  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
    {
      char c = i;
      unsigned char uc = i;
      mbstate_t s = { 0 };
      wchar_t wc;
      lexer->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
    }
}

/* This function is intimately connected with multibyte (wide-char)
   handling in the macro FETCH_WC below, in the case where
   FETCH_SINGLE_CHAR has run but the result has been found to be
   inconclusive.  It works by unwinding the FETCH_SINGLE_CHAR
   side-effects (lexptr/lexleft), then calling mbrtowc on the pattern
   space, and communicates mbrtowc's understanding of the octet
   stream back to the caller:
     - If a valid multibyte octet sequence is next, then the wide
       character associated with this sequence is written back to
       *p_wchar, and the number of octets consumed is returned; or
     - If the sequence is invalid for any reason, the mbrtowc working
       state is reset (zeroed), *p_wchar is not modified, and 1 is
       returned.
   Lexer state variables, including cur_mb_len, mbs, lexleft and
   lexptr, are updated as appropriate by this function (mainly if
   mbrtowc succeeds).  The wide char NUL is unusual as it is a
   1-octet sequence, the length returned is 0, we report it as length
   1, but write the converted wide character in temp_wchar to the
   caller.  */
/* Additional notes:  This code, in partnership with the macro
   FETCH_WC, is closely related to mbs_to_wchar in dfa.c.  There is
   documentation there (e.g. pattern must end in a sentinel, shift
   encodings not supported, plus other comments/guarantees) that is
   important, but I'm deferring writing up anything at present until
   I see how this code is received.  */
static size_t
fetch_offset_wide_char (fsalex_ctxt_t *lexer, wchar_t *p_wchar)
{
  size_t nbytes;
  wchar_t temp_wchar;

  nbytes = mbrtowc (&temp_wchar,
                    lexer->lexptr - 1, lexer->lexleft + 1,
                    &lexer->mbrtowc_state);
  switch (nbytes)
    {
    case (size_t) -2:
    case (size_t) -1:
      /* Conversion failed: Incomplete (-2) or invalid (-1)
         sequence.  */
      memset (&lexer->mbrtowc_state, 0, sizeof (lexer->mbrtowc_state));
      return 1;

    case (size_t) 0:
      /* This is the wide NUL character, actually 1 byte long. */
      nbytes = 1;
      break;

    default:
      /* Converted character is in temp_wchar, and nbytes is a byte
         count.  */
      break;
    }
  /* We converted 1 or more bytes, tell result to caller.  */
  *p_wchar = temp_wchar;

  /* Update the number of bytes consumed (offset by 1 since
     FETCH_SINGLE_CHAR grabbed one earlier).  */
  lexer->lexptr  += nbytes - 1;
  lexer->lexleft -= nbytes - 1;

  return nbytes;
}

/* Single-character input fetch, with EOF/error handling.  Note that
   characters become unsigned here.  If no characters are available,
   the macro either returns END or reports an error, depending on
   eoferr.  Otherwise, one character is consumed (lexptr/lexleft),
   the char is converted into an unsigned char, and is written into
   the parameter c.  */
#define FETCH_SINGLE_CHAR(lexer, c, eoferr)                  \
  do {                                                       \
    if (! (lexer)->lexleft)                                  \
      {                                                      \
        if ((eoferr) != 0)                                   \
          (lexer)->abandon_with_error (eoferr);              \
        else                                                 \
          return FSATOKEN_TK_END;                            \
      }                                                      \
    (c) = to_uchar (*(lexer)->lexptr++);                     \
    (lexer)->lexleft--;                                      \
  } while (0)

/* Do the fetch in stages: Single char, octet+multibyte cache
   check, and possible wide char fetch if the cache result indicates
   that the input sequence is longer than a single octet.  The first
   fetch handles end-of-input cases (if this happens, control never
   reaches the rest of the macro); otherwise, it returns temp_uchar
   which is used in the cache lookup, and may be the single-octet
   result.  A cache result of WEOF means that the octet is not a
   complete sequence by itself, so a second fetch tweaks lexptr/lexleft
   to undo the single-char-fetch side-effects, and, depending on
   mbrtowc valid/invalid result, propagates either the multichar fetch
   or the single-char fetch back to the caller.  */
# define FETCH_WC(lexer, c, wc, eoferr)                      \
  do {                                                       \
    wchar_t temp_wc;                                         \
    unsigned char temp_uchar;                                \
    (lexer)->cur_mb_len = 1;                                 \
    FETCH_SINGLE_CHAR ((lexer), temp_uchar, (eoferr));       \
    temp_wc = (lexer)->mbrtowc_cache[temp_uchar];            \
    if (temp_wc != WEOF)                                     \
      {                                                      \
        (c)  = temp_uchar;                                   \
        (wc) = temp_wc;                                      \
      }                                                      \
    else                                                     \
      {                                                      \
        size_t nbytes;                                       \
        temp_wc = temp_uchar;                                \
        nbytes = fetch_offset_wide_char ((lexer), &temp_wc); \
        (wc) = temp_wc;                                      \
        (c) = nbytes == 1 ? temp_uchar : EOF;                \
        (lexer)->cur_mb_len = nbytes;                        \
      }                                                      \
  } while (0)

/* Given a predicate name, find it in a list, and report the list
   entry to the caller.  If the name is not recognised, the function
   returns NULL.  The list entry includes a charclass set and (if
   relevant) a wide-char descriptor for testing for the predicate.
   Lazy evaluation and caching are used to keep processing costs
   down.  */
static predicate_entry_t *
find_pred (fsalex_ctxt_t *lexer, const char *str)
{
  predicate_entry_t *p_entry;
  charclass_t *work_class;

  for (p_entry = lexer->predicates; p_entry->name; p_entry++)
    {
      if (STREQ (str, p_entry->name))
        break;
    }

  /* If there was no matching predicate name found, return NULL.  */
  if (! p_entry->name)
    return NULL;

  /* Is the charclass pointer NULL for this entry? */
  if (p_entry->class == NULL)
    {
      /* Yes, allocate, set up and cache a charclass for this
         predicate.  Note that the wchar_desc entries were set up in
         fsalex_syntax ().  */
      int i;
      charclass_index_t index;
      wctype_t wctype_desc;

      wctype_desc = p_entry->wchar_desc;
      work_class = charclass_alloc ();
      for (i = 0; i < FSATOKEN_NOTCHAR; i++)
        {
          wchar_t wc;

          /* Try integer->unsigned char->wide char using lexer's
             mbrtowc_cache array, and, if successful, test for class
             membership, and set the bit in the class if the value is
             a member.  */
          /* FIXME: iswctype depends on the *current* locale
             (LC_CTYPE); this breaks the long-term goal of having each
             lexer instance use only the locale in force when
             fsalex_syntax was called.  We can fix this by using
             isalnum_l () etc here, but carrying around a full copy of
             the locale in the lexer instance may be expensive or
             possibly non-portable, so it's being avoided while this
             code is still in an experimental/proof-of-concept
             form.  */
          wc = lexer->mbrtowc_cache[i];
          if (iswctype (wc, wctype_desc))
            charclass_setbit (i, work_class);
        }

      /* Mark the class as completed, and obtain a persistent class
         pointer.  */
      index = charclass_completed (work_class);
      p_entry->class = charclass_get_pointer (index);

    }

  /* Return predicate entry to the caller.  */
  return p_entry;
}

/* The set of wchar_t values C such that there's a useful locale
   somewhere where C != towupper (C) && C != towlower (towupper (C)).
   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
static short const lonesome_lower[] =
  {
    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
    0x03F2,

    0x03F5, 0x1E9B, 0x1FBE,
  };

/* Maximum number of wide characters that can be the case-folded
   counterparts of a single wide character, not counting the character
   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
   entry in lonesome_lower[].  */
enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };

/* Given a wide character c, and working to a search specification
   that demands that case-folded variants of the character must match
   wherever that character could match, generate a list of all
   possible match counterparts to the specified character, and return
   that list, the characters via a parameter pointer, and the list
   length (possibly 0) via the function return value.  */
/* ?? CHECKME: Should parameter "c" in, and/or "folded" out, be wint_t?  */
static int
case_folded_counterparts (fsalex_ctxt_t *lexer,
                          wchar_t c,
                          wchar_t folded[CASE_FOLDED_BUFSIZE])
{
  int i;
  int n = 0;
  wint_t uc = towupper (c);
  wint_t lc = towlower (uc);
  if (uc != c)
    folded[n++] = uc;
  if (lc != uc && lc != c && towupper (lc) == uc)
    folded[n++] = lc;
  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
    {
      wint_t li = lonesome_lower[i];
      if (li != lc && li != uc && li != c && towupper (li) == uc)
        folded[n++] = li;
    }
  return n;
}

/* Multibyte character handling sub-routine for lex.
   Parse a bracket expression and build a struct mb_char_classes.  */
static fsatoken_token_t
parse_bracket_exp (fsalex_ctxt_t *lexer)
{
  bool invert;
  int c, c1, c2;
  charclass_t *ccl;
  mbcsets_set_t *work_mbc = NULL;


  /* This is a bracket expression that dfaexec is known to
     process correctly.  */
  bool known_bracket_exp = true;

  /* Used to warn about [:space:].
     Bit 0 = first character is a colon.
     Bit 1 = last character is a colon.
     Bit 2 = includes any other character but a colon.
     Bit 3 = includes ranges, char/equiv classes or collation elements.  */
  int colon_warning_state;

  wint_t wc;
  wint_t wc2;
  wint_t wc1 = 0;

  if (lexer->multibyte_locale)
    {
      /* Ensure we have space to store the new set.  */
      lexer->mbcsets = maybe_realloc (lexer->mbcsets,
                                      lexer->nmbcsets,
                                      &lexer->mbcsets_alloc,
                                      sizeof work_mbc);

      /* Initialize work area.  */
      work_mbc = mbcsets_new ();
      lexer->mbcsets[lexer->nmbcsets++] = work_mbc;
    }

  ccl = charclass_alloc ();
  FETCH_WC (lexer, c, wc, _("unbalanced ["));
  if (c == '^')
    {
      FETCH_WC (lexer, c, wc, _("unbalanced ["));
      invert = true;
      known_bracket_exp = lexer->using_simple_locale;
    }
  else
    invert = false;

  colon_warning_state = (c == ':');
  do
    {
      c1 = FSATOKEN_NOTCHAR;	/* Mark c1 as not initialized.  */
      colon_warning_state &= ~2;

      /* Note that if we're looking at some other [:...:] construct,
         we just treat it as a bunch of ordinary characters.  We can do
         this because we assume regex has checked for syntax errors before
         dfa is ever called.  */
      if (c == '[')
        {
          FETCH_WC (lexer, c1, wc1, _("unbalanced ["));

          if ((c1 == ':' && (lexer->syntax_bits & RE_CHAR_CLASSES))
              || c1 == '.' || c1 == '=')
            {
              enum { MAX_BRACKET_STRING_LEN = 32 };
              char str[MAX_BRACKET_STRING_LEN + 1];
              size_t len = 0;
              for (;;)
                {
                  FETCH_WC (lexer, c, wc, _("unbalanced ["));
                  if ((c == c1 && *lexer->lexptr == ']')
                          || lexer->lexleft == 0)
                    break;
                  if (len < MAX_BRACKET_STRING_LEN)
                    str[len++] = c;
                  else
                    /* This is in any case an invalid class name.  */
                    str[0] = '\0';
                }
              str[len] = '\0';

              /* Fetch bracket.  */
              FETCH_WC (lexer, c, wc, _("unbalanced ["));
              if (c1 == ':')
                /* Find and merge named character class.  POSIX allows
                   character classes to match multicharacter collating
                   elements, but the regex code does not support that,
                   so do not worry about that possibility.  */
                {
                  char const *class;
                  predicate_entry_t *pred;

                  class = str;
                  if (lexer->case_fold && (STREQ (class, "upper")
                                      || STREQ (class, "lower")))
                    class = "alpha";
                  pred = find_pred (lexer, class);
                  if (! pred)
                    lexer->abandon_with_error (_("invalid character class"));
                  charclass_unionset (pred->class, ccl);

                  /* Does this class have a wide-char type descriptor? */
                  if (lexer->multibyte_locale && pred->may_be_multibyte)
                    {
                      mbcsets_add_class(work_mbc, pred->wchar_desc);
                    }
                }
              else
                known_bracket_exp = false;

              colon_warning_state |= 8;

              /* Fetch new lookahead character.  */
              FETCH_WC (lexer, c1, wc1, _("unbalanced ["));
              continue;
            }

          /* We treat '[' as a normal character here.  c/c1/wc/wc1
             are already set up.  */
        }

      if (c == '\\' && (lexer->syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
        FETCH_WC (lexer, c, wc, _("unbalanced ["));

      if (c1 == FSATOKEN_NOTCHAR)
        FETCH_WC (lexer, c1, wc1, _("unbalanced ["));

      if (c1 == '-')
        /* build range characters.  */
        {
          FETCH_WC (lexer, c2, wc2, _("unbalanced ["));

          /* A bracket expression like [a-[.aa.]] matches an unknown
             set.  Treat it like [-a[.aa.]] while parsing it, and
             remember that the set is unknown.  */
          if (c2 == '[' && *lexer->lexptr == '.')
            {
              known_bracket_exp = false;
              c2 = ']';
            }

          if (c2 != ']')
            {
              if (c2 == '\\' && (lexer->syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
                FETCH_WC (lexer, c2, wc2, _("unbalanced ["));

              if (lexer->multibyte_locale)
                {
                  /* Merely record range as given; don't try to handle the
                     case-folding variants here for now.  */
                  if (wc != WEOF && wc2 != WEOF)
                    {
                      mbcsets_add_range(work_mbc, wc, wc2);

                      /* FIXME, part 2:  Since the canonical measure of case
                         equivalence is uppercase(A) == uppercase(B), perhaps
                         we could do this transformation above if lexer->case_fold
                         is true... but need to be careful not to throw away
                         information.  */
                    }
                }
              else if (lexer->using_simple_locale)
                {
                  charclass_setbit_range (c, c2, ccl);
                  if (lexer->case_fold)
                    {
                      int uc = toupper (c);
                      int uc2 = toupper (c2);
                      for (c1 = 0; c1 < FSATOKEN_NOTCHAR; c1++)
                        {
                          int uc1 = toupper (c1);
                          if (uc <= uc1 && uc1 <= uc2)
                            charclass_setbit (c1, ccl);
                        }
                    }
                }
              else
                known_bracket_exp = false;

              colon_warning_state |= 8;
              FETCH_WC (lexer, c1, wc1, _("unbalanced ["));
              continue;
            }

          /* In the case [x-], the - is an ordinary hyphen,
             which is left in c1, the lookahead character.  */
          lexer->lexptr  -= lexer->cur_mb_len;
          lexer->lexleft += lexer->cur_mb_len;
        }

      colon_warning_state |= (c == ':') ? 2 : 4;

      if (lexer->unibyte_locale)
        {
          if (lexer->case_fold)
            setbit_case_fold_c (c, ccl);
          else
            charclass_setbit (c, ccl);
          continue;
        }

      if (wc == WEOF)
        known_bracket_exp = false;
      else
        {
          wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
          int n = 1;

          folded[0] = wc;
          if (lexer->case_fold)
            n += case_folded_counterparts (lexer, wc, &folded[1]);
          mbcsets_add_wchar_list (work_mbc, n, folded);
        }
    }
  while ((wc = wc1, (c = c1) != ']'));

  if (colon_warning_state == 7)
    lexer->warn_client (_("character class syntax is [[:space:]], not [:space:]"));

  if (! known_bracket_exp)
    return FSATOKEN_TK_BACKREF;

  if (lexer->multibyte_locale)
    {
      mbcsets_set_match_sense (work_mbc, invert);
      mbcsets_receive_incomplete_charclass (work_mbc, ccl);
      mbcsets_completed (work_mbc);
      return FSATOKEN_TK_MBCSET;
    }

  if (invert)
    {
      charclass_notset (ccl);
      if (lexer->syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
        charclass_clrbit (lexer->eolbyte, ccl);
    }

  return FSATOKEN_TK_CSET + charclass_completed (ccl);
}

fsatoken_token_t
fsalex_lex (fsalex_ctxt_t *lexer)
{
  int c;
  bool backslash = false;
  int i;
  predicate_entry_t *predicate;
  charclass_t *work_class;
  charclass_index_t class_index;
  mbcsets_set_t *work_mbc;
  bool invert;

  /* Ensure that syntax () has been called on this lexer instance; many
     things will fail if this isn't done.  */
  assert (lexer->syntax_initialised);

  /* Basic plan: We fetch a character.  If it's a backslash,
     we set the backslash flag and go through the loop again.
     On the plus side, this avoids having a duplicate of the
     main switch inside the backslash case.  On the minus side,
     it means that just about every case begins with
     "if (backslash) ...".  */
  for (i = 0; i < 2; ++i)
    {
      FETCH_WC (lexer, c, lexer->wctok, NULL);

      switch (c)
        {
        case '\\':
          if (backslash)
            goto normal_char;
          if (lexer->lexleft == 0)
            lexer->abandon_with_error (_("unfinished \\ escape"));
          backslash = true;
          break;

        case '^':
          if (backslash)
            goto normal_char;
          if (lexer->syntax_bits & RE_CONTEXT_INDEP_ANCHORS
              || lexer->lasttok == FSATOKEN_TK_END || lexer->lasttok == FSATOKEN_TK_LPAREN || lexer->lasttok == FSATOKEN_TK_OR)
            return lexer->lasttok = FSATOKEN_TK_BEGLINE;
          goto normal_char;

        case '$':
          if (backslash)
            goto normal_char;
          if (lexer->syntax_bits & RE_CONTEXT_INDEP_ANCHORS
              || lexer->lexleft == 0
              || (lexer->syntax_bits & RE_NO_BK_PARENS
                  ? lexer->lexleft > 0 && *lexer->lexptr == ')'
                  : lexer->lexleft > 1 && lexer->lexptr[0] == '\\' && lexer->lexptr[1] == ')')
              || (lexer->syntax_bits & RE_NO_BK_VBAR
                  ? lexer->lexleft > 0 && *lexer->lexptr == '|'
                  : lexer->lexleft > 1 && lexer->lexptr[0] == '\\' && lexer->lexptr[1] == '|')
              || ((lexer->syntax_bits & RE_NEWLINE_ALT)
                  && lexer->lexleft > 0 && *lexer->lexptr == '\n'))
            return lexer->lasttok = FSATOKEN_TK_ENDLINE;
          goto normal_char;

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          if (backslash && !(lexer->syntax_bits & RE_NO_BK_REFS))
            {
              lexer->laststart = false;
              return lexer->lasttok = FSATOKEN_TK_BACKREF;
            }
          goto normal_char;

        case '`':
          if (backslash && lexer->re_gnu_ops)
            return lexer->lasttok = FSATOKEN_TK_BEGLINE; /* FIXME: should be beginning of string */
          goto normal_char;

        case '\'':
          if (backslash && lexer->re_gnu_ops)
            return lexer->lasttok = FSATOKEN_TK_ENDLINE;   /* FIXME: should be end of string */
          goto normal_char;

        case '<':
          if (backslash && lexer->re_gnu_ops)
            return lexer->lasttok = FSATOKEN_TK_BEGWORD;
          goto normal_char;

        case '>':
          if (backslash && lexer->re_gnu_ops)
            return lexer->lasttok = FSATOKEN_TK_ENDWORD;
          goto normal_char;

        case 'b':
          if (backslash && lexer->re_gnu_ops)
            return lexer->lasttok = FSATOKEN_TK_LIMWORD;
          goto normal_char;

        case 'B':
          if (backslash && lexer->re_gnu_ops)
            return lexer->lasttok = FSATOKEN_TK_NOTLIMWORD;
          goto normal_char;

        case '?':
          if (lexer->syntax_bits & RE_LIMITED_OPS)
            goto normal_char;
          if (backslash != ((lexer->syntax_bits & RE_BK_PLUS_QM) != 0))
            goto normal_char;
          if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart)
            goto normal_char;
          return lexer->lasttok = FSATOKEN_TK_QMARK;

        case '*':
          if (backslash)
            goto normal_char;
          if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart)
            goto normal_char;
          return lexer->lasttok = FSATOKEN_TK_STAR;

        case '+':
          if (lexer->syntax_bits & RE_LIMITED_OPS)
            goto normal_char;
          if (backslash != ((lexer->syntax_bits & RE_BK_PLUS_QM) != 0))
            goto normal_char;
          if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart)
            goto normal_char;
          return lexer->lasttok = FSATOKEN_TK_PLUS;

        case '{':
          if (!(lexer->syntax_bits & RE_INTERVALS))
            goto normal_char;
          if (backslash != ((lexer->syntax_bits & RE_NO_BK_BRACES) == 0))
            goto normal_char;
          if (!(lexer->syntax_bits & RE_CONTEXT_INDEP_OPS) && lexer->laststart)
            goto normal_char;

          /* Cases:
             {M} - exact count
             {M,} - minimum count, maximum is infinity
             {,N} - 0 through N
             {,} - 0 to infinity (same as '*')
             {M,N} - M through N */
          {
            char const *p = lexer->lexptr;
            char const *lim = p + lexer->lexleft;
            int minrep = -1;
            int maxrep = -1;
            for (; p != lim && ISASCIIDIGIT (*p); p++)
              {
                if (minrep < 0)
                  minrep = *p - '0';
                else
                  minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
              }
            if (p != lim)
              {
                if (*p != ',')
                  maxrep = minrep;
                else
                  {
                    if (minrep < 0)
                      minrep = 0;
                    while (++p != lim && ISASCIIDIGIT (*p))
                      {
                        if (maxrep < 0)
                          maxrep = *p - '0';
                        else
                          maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
                      }
                  }
              }
            if (! ((! backslash || (p != lim && *p++ == '\\'))
                   && p != lim && *p++ == '}'
                   && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
              {
                if (lexer->syntax_bits & RE_INVALID_INTERVAL_ORD)
                  goto normal_char;
                lexer->abandon_with_error (_("invalid content of \\{\\}"));
              }
            if (RE_DUP_MAX < maxrep)
              lexer->abandon_with_error (_("regular expression too big"));
            lexer->lexptr = p;
            lexer->lexleft = lim - p;
            lexer->minrep = minrep;
            lexer->maxrep = maxrep;
          }
          lexer->laststart = false;
          return lexer->lasttok = FSATOKEN_TK_REPMN;

        case '|':
          if (lexer->syntax_bits & RE_LIMITED_OPS)
            goto normal_char;
          if (backslash != ((lexer->syntax_bits & RE_NO_BK_VBAR) == 0))
            goto normal_char;
          lexer->laststart = true;
          return lexer->lasttok = FSATOKEN_TK_OR;

        case '\n':
          if (lexer->syntax_bits & RE_LIMITED_OPS
              || backslash || !(lexer->syntax_bits & RE_NEWLINE_ALT))
            goto normal_char;
          lexer->laststart = true;
          return lexer->lasttok = FSATOKEN_TK_OR;

        case '(':
          if (backslash != ((lexer->syntax_bits & RE_NO_BK_PARENS) == 0))
            goto normal_char;
          ++lexer->parens;
          lexer->laststart = true;
          return lexer->lasttok = FSATOKEN_TK_LPAREN;

        case ')':
          if (backslash != ((lexer->syntax_bits & RE_NO_BK_PARENS) == 0))
            goto normal_char;
          if (lexer->parens == 0 && lexer->syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
            goto normal_char;
          --lexer->parens;
          lexer->laststart = false;
          return lexer->lasttok = FSATOKEN_TK_RPAREN;

        case '.':
          if (backslash)
            goto normal_char;
          lexer->laststart = false;
          if (lexer->multibyte_locale)
            {
              /* In multibyte environment period must match with a single
                 character not a byte.  So we use FSATOKEN_TK_ANYCHAR.  */
              return lexer->lasttok = FSATOKEN_TK_ANYCHAR;
            }
          return lexer->lasttok = FSATOKEN_TK_CSET + lexer->dotclass_index;

        case 's':
        case 'S':
           /* "\s" == "[[:space:]]"; "\S" == "[^[:space:]]".  */
          if (! (backslash && lexer->re_gnu_ops))
            goto normal_char;
          lexer->laststart = false;
          invert = c == 'S';
          predicate = find_pred (lexer, "space");
          if (c == 's')
            class_index = charclass_get_index (predicate->class);
          else
            {
              /* Invert the predicate class in a work class.  */
              work_class = charclass_alloc ();
              charclass_copyset (predicate->class, work_class);
              charclass_notset (work_class);
              class_index = charclass_completed (work_class);
            }
          if (lexer->unibyte_locale)
            return lexer->lasttok = FSATOKEN_TK_CSET + class_index;

          /* FIXME: see if optimizing this, as is done with FSATOKEN_TK_ANYCHAR and
             add_utf8_anychar, makes sense.  */

          /* Multibyte locale:  Fill out an entire set description.  */
          lexer->mbcsets = maybe_realloc (lexer->mbcsets, lexer->nmbcsets,
                                          &lexer->mbcsets_alloc,
                                          sizeof work_mbc);
          work_mbc = mbcsets_new ();
          lexer->mbcsets[lexer->nmbcsets++] = work_mbc;
          mbcsets_set_match_sense (work_mbc, invert);
          mbcsets_add_class (work_mbc, predicate->wchar_desc);

          /* ?? REVIEWME: "invert" in parse_bracket_exp leads to FSATOKEN_TK_BACKREF
             (== "this is too hard for me") token via known_bracket_exp
             flag. */
          if ((c == 'S') && !lexer->using_simple_locale)
            return lexer->lasttok = FSATOKEN_TK_BACKREF;
          else
            return lexer->lasttok = FSATOKEN_TK_MBCSET;

         case 'w':
         case 'W':
           /* Can mean "[_[:alnum:]]" (\w) or its inverse (\W).  */
           if (! (backslash && lexer->re_gnu_ops))
             goto normal_char;
           lexer->laststart = false;
           predicate = find_pred (lexer, "alnum");
           work_class = charclass_alloc ();
           charclass_copyset (predicate->class, work_class);
           charclass_setbit ('_', work_class);
           if (c == 'W')
             charclass_notset (work_class);
           return lexer->lasttok = FSATOKEN_TK_CSET
                       + charclass_completed (work_class);

        case '[':
          if (backslash)
            goto normal_char;
          lexer->laststart = false;
          return lexer->lasttok = parse_bracket_exp (lexer);

        default:
        normal_char:
          lexer->laststart = false;
          /* For multibyte character sets, folding is done in atom, so
             always return FSATOKEN_TK_WCHAR.  */
          if (lexer->multibyte_locale)
            return lexer->lasttok = FSATOKEN_TK_WCHAR;

          if (lexer->case_fold && isalpha (c))
            {
              charclass_t *ccl = charclass_alloc ();
              setbit_case_fold_c (c, ccl);
              return lexer->lasttok = FSATOKEN_TK_CSET
                          + charclass_completed (ccl);
            }

          return lexer->lasttok = c;
        }
    }

  /* The above loop should consume at most a backslash
     and some other character.  */
  abort ();
  return FSATOKEN_TK_END;                   /* keeps pedantic compilers happy.  */
}

/* Receive the pattern, and reset the lexical analyser state.  The
   interpretation of the chars (octets?) in the pattern (ASCII chars?
   variable-length UTF-8 sequences?  Simplified Chinese?  etc.) depends on
   the locale that was in force when fsalex_syntax () was called.  NULs may
   be present amongst the codes, which is why the length is given
   explicitly, rather than relying on strlen(3).  */
void
fsalex_pattern (fsalex_ctxt_t *lexer,
                char const *pattern, size_t const pattern_len)

{
  /* Copy parameters to internal state variables.  */
  lexer->lexptr = pattern;
  lexer->lexleft = pattern_len;

  /* Reset lexical scanner state.  */
  lexer->lasttok = FSATOKEN_TK_END;
  lexer->laststart = 1;
  lexer->parens = 0;

  /* Reset multibyte parsing state.  */
  lexer->cur_mb_len = 1;
  memset (&lexer->mbrtowc_state, 0, sizeof (lexer->mbrtowc_state));
}

/* Report whether codes 0-127 conform to ASCII encoding.  This is handy
   for optimisation, particularly lower-case and upper-case characters,
   as the codes are contiguous, unlike EBCDIC.  ASCII is also a common
   denominator in many other unibyte code pages (locales), and also in
   the multibyte UTF-8 locale.  */
static bool _GL_ATTRIBUTE_PURE
char_encoding_7bits_is_ascii (void)
{
  /* True if the native character set is known to be compatible with
     the C locale.  The following test isn't perfect, but it's good
     enough in practice, as only ASCII and EBCDIC are in common use
     and this test correctly accepts ASCII and rejects EBCDIC.  */
  const bool is_ascii =
        '\b' ==   8 && '\t' ==   9 && '\n' ==  10 && '\v' ==  11
     && '\f' ==  12 && '\r' ==  13 && ' '  ==  32 && '!'  ==  33
     && '"'  ==  34 && '#'  ==  35 && '%'  ==  37 && '&'  ==  38
     && '\'' ==  39 && '('  ==  40 && ')'  ==  41 && '*'  ==  42
     && '+'  ==  43 && ','  ==  44 && '-'  ==  45 && '.'  ==  46
     && '/'  ==  47 && '0'  ==  48 && '9'  ==  57 && ':'  ==  58
     && ';'  ==  59 && '<'  ==  60 && '='  ==  61 && '>'  ==  62
     && '?'  ==  63 && 'A'  ==  65 && 'Z'  ==  90 && '['  ==  91
     && '\\' ==  92 && ']'  ==  93 && '^'  ==  94 && '_'  ==  95
     && 'a'  ==  97 && 'z'  == 122 && '{'  == 123 && '|'  == 124
     && '}'  == 125 && '~'  == 126;
  return is_ascii;
}

/* Receive syntax directives, and other pattern interpretation
   instructions such as case folding and end-of-line character.
   In addition, this function configures various internal structures
   based on the locale in force.  */
void
fsalex_syntax (fsalex_ctxt_t *lexer,
               reg_syntax_t bits, int fold, unsigned char eol)
{
  charclass_t *work_class;
  predicate_entry_t *pred;
  char const *locale_name;

  /* Set a flag noting that this lexer has had its syntax params
     set.  */
  lexer->syntax_initialised = true;

  /* Record the function parameters in our local context.  */
  lexer->syntax_bits = bits;
  lexer->case_fold = fold;
  lexer->eolbyte = eol;

  /* Set up unibyte/multibyte flags, based on MB_CUR_MAX, which
     depends on the current locale.  We capture this information here
     as the locale may change later.  At present, we don't capture
     MB_CUR_MAX itself.  */
  if (MB_CUR_MAX > 1)
    {
      /* Multibyte locale: Prepare booleans to make code easier to
         read */
      lexer->unibyte_locale = false;
      lexer->multibyte_locale = true;

      /* Discard any earlier storage we may have acquired.  */
      free (lexer->mbcsets);
      lexer->mbcsets = NULL;

      /* Set up an array of structures to hold multibyte character
         sets.  */
      lexer->nmbcsets = 0;
      lexer->mbcsets_alloc = 2;
      lexer->mbcsets = xzalloc (sizeof (*lexer->mbcsets)
                                      * lexer->mbcsets_alloc);
    }
    else
    {
      /* Unibyte locale: Prepare booleans to make code easier to
         read */
      lexer->unibyte_locale = true;
      lexer->multibyte_locale = false;
    }

  /* Charclass guarantees that class index 0 is zeroclass, so we don't
     need to set it up here.  */

 /* Set up a character class to match anychar ('.'), tailored to
    accommodate options from the regex syntax.  */
  work_class = charclass_alloc ();
  charclass_notset (work_class);
  if (! (lexer->syntax_bits & RE_DOT_NEWLINE))
    {
      charclass_clrbit (lexer->eolbyte, work_class);
    }
  if (lexer->syntax_bits & RE_DOT_NOT_NULL)
    {
      charclass_clrbit (0, work_class);
    }
  lexer->dotclass_index = charclass_completed (work_class);
  lexer->dotclass = charclass_get_pointer (lexer->dotclass_index);

  /* Testing for the absence of RE_NO_GNU_OPS in syntax_bits happens
     often, so set a direct flag variable:  This makes code more
     readable.  */
  lexer->re_gnu_ops = ! (lexer->syntax_bits & RE_NO_GNU_OPS);

  /* Initialise cache and other tables that have syntax and/or locale
     influences.  */

  /* Set up the wchar_desc fields of the predicate table.  */
  for (pred = lexer->predicates; pred->name != NULL; pred++)
    pred->wchar_desc = wctype (pred->name);

  /* Record the name of the current locale.  Note that setlocale can
     return NULL for the name.  */
  lexer->locale_name = NULL;
  locale_name = setlocale (LC_ALL, NULL);
  if (locale_name != NULL)
    lexer->locale_name = xstrdup (locale_name);

  /* Determine if the locale's treatment of codes 0-127 matches ASCII,
     as some operations (e.g. "[c-v]") become easier to handle.  */
  lexer->ascii_7bit_encoding = char_encoding_7bits_is_ascii ();

  /* Further optimisations are possible if the locale is an unibyte
     ASCII 7-bit C/POSIX environment, and this is a common case, so
     spend some effort looking for this situation.  */
  lexer->using_simple_locale = false;
  if (lexer->unibyte_locale && lexer->ascii_7bit_encoding)
    {
      lexer->using_simple_locale = ! lexer->locale_name
                                   || STREQ (lexer->locale_name, "C")
                                   || STREQ (lexer->locale_name, "POSIX");
    }

  /* Initialise cache that distinguishes between unibyte and multibyte
     (wide) characters based on the leading octet.  */
  initialise_uchar_to_wc_cache (lexer);

  /* ?? If a complete copy of the locale needs to be stored within
     this instance, see  uselocale(3) and duplocale(3) (and free the
     copy with freelocale(3)).  */

}

/* Define external function to do non-core data exchanges between the lexer
   and the parser.  This function must conform to
   proto_lexparse_exchange_fn_t.  This interface lets two instances
   communicate without requiring help from an outside party.  */
int
fsalex_exchange (fsalex_ctxt_t *lexer,
                 proto_lexparse_opcode_t opcode,
                 void *param)
{
  switch (opcode)
    {
    case PROTO_LEXPARSE_OP_GET_IS_MULTIBYTE_LOCALE:
      return (int) lexer->multibyte_locale;
    case PROTO_LEXPARSE_OP_GET_REPMN_MIN:
      return lexer->minrep;
    case PROTO_LEXPARSE_OP_GET_REPMN_MAX:
      return lexer->maxrep;
    case PROTO_LEXPARSE_OP_GET_WIDE_CHAR_LIST_MAX:
      /* Original token, plus upper/lowercase, plus extended variants.  */
      return 1 + CASE_FOLDED_BUFSIZE;
    case PROTO_LEXPARSE_OP_GET_WIDE_CHARS:
      {
        wchar_t *p_wc = (wchar_t *) param;

        /* Record the original token (wctok) implicitly associated with
           WCHAR.  */
        *p_wc++ = lexer->wctok;

        /* Add in direct and/or indirect case equivalents, as appropriate.  */
        if (lexer->case_fold)
          p_wc += case_folded_counterparts (lexer, lexer->wctok, p_wc);

        /* Return how many wide chars (at least 1) are in the list.  */
        return p_wc - (wchar_t *) param;
      }
      break;
    case PROTO_LEXPARSE_OP_GET_DOTCLASS:
      *((charclass_t **) param) = lexer->dotclass;
      break;
    case PROTO_LEXPARSE_OP_GET_MBCSET:
      /* Return the set associated with the most recent MBCSET token.  */
      *((mbcsets_set_t **) param) = lexer->mbcsets[lexer->nmbcsets - 1];
      break;
    default:
      /* ?? Not sure if we should complain/assert or merely ignore an
         opcode that we don't recognise here.  */
      assert (!"unrecognised PROTO_LEXPARSE opcode");
      /* NOTREACHED */
      break;
    }

    /* If we reach here, return value is unimportant, so just say 0.  */
    return 0;
}

/* Receive functions to deal with exceptions detected by the lexer:
   Warnings and errors.  Internally, we add the _Noreturn attribute
   to the error callback, to help the compiler with code flow
   analysis.  */
void
fsalex_exception_fns (fsalex_ctxt_t *lexer,
                      fsalex_warn_callback_fn *warningfn,
                      fsalex_error_callback_fn *errorfn)
{
  /* Record the provided functions in the lexer's context.  */
  lexer->warn_client        = warningfn;
  lexer->abandon_with_error = errorfn;
}

/* Add "not provided!" stub function that gets called if the client
   fails to provide proper resources.  This is a hack, merely to get
   the module started; better treatment needs to be added later.  */
static void
no_function_provided (void *unused)
{
 assert (!"fsalex: Plug-in function required, but not provided.");
}

/* Generate a new instance of an FSA lexer.  */
fsalex_ctxt_t *
fsalex_new (void)
{
  fsalex_ctxt_t *new_context;

  /* Acquire zeroed memory for new lexer context.  */
  new_context = XZALLOC (fsalex_ctxt_t);

  /* ?? Point warning and error functions to a "you need to tell me
     these first!" function? */
  new_context->warn_client        = (fsalex_warn_callback_fn *)
                                    no_function_provided;
  new_context->abandon_with_error = (fsalex_error_callback_fn *)
                                    no_function_provided;

  /* Default to working in a non-multibyte locale.  In some cases,
     FETCH_WC never sets this variable (as it's assumed to be 1), so
     fulfil this expectation here.  */
  new_context->cur_mb_len = 1;

  /* Copy the template predicate list into this context, so that we
     can have lexer-specific named predicate classes.  */
  memcpy (new_context->predicates, template_predicate_list,
         sizeof (new_context->predicates));

  /* Default to unibyte locale at first; the final locale setting is
     made according to what's in force when fsalex_syntax () is
     called.  */
  new_context->unibyte_locale = true;
  new_context->multibyte_locale = false;

  /* Many things depend on decisions made in fsalex_syntax (), so note
     here that it hasn't been called yet, and fail gracefully later if
     the client hasn't called the function before commencing work.  */
  new_context->syntax_initialised = false;

  /* Add this instance at the head of the module's list.  */
  new_context->next_instance = fsalex_instance_list_head;
  fsalex_instance_list_head = new_context;

  return new_context;
}

/* Internal function to free all resources directly or indirectly used
   by an instance.  The pointer is no longer valid after this call.  */
static void
free_instance (fsalex_ctxt_t *lexer)
{
  /* Free top-level structures.  */
  free (lexer->locale_name);
  free (lexer->mbcsets);
  free (lexer);

  /* Note that multibyte sets are no longer stored explicitly in
     the instance; apart from the pointer list (freed above), the
     set contents are freed separately when mbcsets module is
     destroyed.  */
}

/* Destroy all lexer instances, plus any associated resources owned by the
   module.  */
void
fsalex_destroy_module (void)
{
  fsalex_ctxt_t *p_list;
  fsalex_ctxt_t *p_next;

  /* Move the global list head into a local variable, and immediately
     clear the global.  This is a half-hearted attempt to avoid race
     conditions; to do things properly, a system-wide atomic
     operation (locked, including multi-CPU cache coherency) operation
     should be used.  */
  p_list = fsalex_instance_list_head;
  fsalex_instance_list_head = NULL;

  /* Traverse the list of instances, releasing all resources
     associated with each one.  */
  while (p_list)
    {
      p_next = p_list->next_instance;
      free_instance (p_list);
      p_list = p_next;
    }
}

/* Prepare module for operation.  */
void
fsalex_initialise (void)
{
  /* Initialise the linked list of instances created by this module.  */
  fsalex_instance_list_head = NULL;

  atexit (fsalex_destroy_module);
}

/* vim:set shiftwidth=2: */