/* mbcsets -- Handle multi-byte and/or locale-dependent sets of chars. Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ /* Written June, 1988 by Mike Haertel Modified July, 1988 by Arthur David Olson to assist BMG speedups */ /* 2014: Repackaged by "untangle" script, written by behoffski. */ /* ?? Need to document things here... */ /* Always import environment-specific configuration items first. */ #include #include "charclass.h" #include "mbcsets.h" #include #include #include #include #include #include "xalloc.h" /* Flesh out opaque type given in the header. */ struct mbcsets_set_struct { /* Singly-linked list of all instances, so destroy_module can release all resources by traversing the list. */ mbcsets_set_t *next_instance; charclass_t *charclass; bool invert; wchar_t *chars; /* Normal characters. */ size_t nchars; size_t chars_alloc; wctype_t *ch_classes; /* Character classes. */ size_t nch_classes; size_t ch_classes_alloc; struct /* Range characters. */ { wchar_t beg; /* Range start. */ wchar_t end; /* Range end. */ } *ranges; size_t nranges; size_t ranges_alloc; char **equivs; /* Equivalence classes. */ size_t nequivs; size_t equivs_alloc; char **coll_elems; size_t ncoll_elems; /* Collating elements. */ size_t coll_alloc; }; /* Linked list of all instances created by this module. */ static mbcsets_set_t *mbcsets_instances_list_head = NULL; /* Ensure that the array addressed by PTR holds at least NITEMS + (PTR || !NITEMS) items. Either return PTR, or reallocate the array and return its new address. Although PTR may be null, the returned value is never null. The array holds *NALLOC items; *NALLOC is updated on reallocation. ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays growing linearly. */ static void * maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) { if (nitems < *nalloc) return ptr; *nalloc = nitems; return x2nrealloc (ptr, nalloc, itemsize); } /* By default, classes match the specified characters. Regular expressions allow this sense to be inverted, usually by the convention of "^" being the first character of a bracketed class. By default, positive sense is selected; this function lets the user specify the sense, probably to specify inverted matching. */ void mbcsets_set_match_sense (mbcsets_set_t *mbc, bool invert) { mbc->invert = invert; } /* Individual wide characters. */ void mbcsets_add_wchar (mbcsets_set_t *mbc, wint_t wc) { /* ?? Quietly ignore WEOF... is this reasonable? */ if (wc == WEOF) return; /* Could this character fit into a charclass set? */ /* ?? We don't know directly the signedness of wint_t; in gnulib, it is unsigned, and so testing for "wc >= 0" is flagged as an error as it is redundant. Have removed the test here for now, but not confident that this is the best treatment. */ if (/* wc >= 0 && */ wc < CHARCLASS_NOTCHAR) { /* Yes, does this wide char have a valid unichar representation? */ /* ?? The documentation states that wctob should not be used; perhaps the unibyte cache mbrtowc_cache in fsalex, built by initialise_uchar_to_wc_cache (), might be relevant here? */ int b = wctob (wc); if (b != EOF) { /* Yes, add the char (byte?) (octet?) to the charclass set. */ charclass_setbit (b, mbc->charclass); return; } } /* Ensure we have space to store another character. */ mbc->chars = maybe_realloc(mbc->chars, mbc->nchars, &mbc->chars_alloc, sizeof *mbc->chars); /* Add the character to the list. */ mbc->chars[mbc->nchars++] = wc; } /* Add a list of wide characters (note: not wide integers). */ void mbcsets_add_wchar_list (mbcsets_set_t *mbc, size_t len, wchar_t *wc_list) { size_t i; /* Ensure we have space to store the incoming list element(s). */ mbc->chars = maybe_realloc(mbc->chars, mbc->nchars + len, &mbc->chars_alloc, sizeof *mbc->chars); /* Add all the characters to the list. (?? use memcpy here?) */ for (i = 0; i < len; i++) mbc->chars[mbc->nchars++] = wc_list[i]; } /* Common character classes, e.g. alpha, digit, punct etc. */ void mbcsets_add_class (mbcsets_set_t *mbc, wctype_t wchar_class) { /* ?? We don't check that the descriptor is valid. */ /* Ensure we have space to store another class descriptor. */ mbc->ch_classes = maybe_realloc(mbc->ch_classes, mbc->nch_classes, &mbc->ch_classes_alloc, sizeof *mbc->ch_classes); /* Add the class descriptor to the list. */ mbc->ch_classes[mbc->nch_classes++] = wchar_class; } /* Explicit character ranges. */ void mbcsets_add_range (mbcsets_set_t *mbc, wint_t beg, wint_t end) { /* ?? We don't check that the begin/end chars are valid. */ /* Ensure we have space to store another begin/end char pair. */ mbc->ranges = maybe_realloc(mbc->ranges, mbc->nranges + 1, &mbc->ranges_alloc, sizeof *mbc->ranges); /* Add the range to the list. */ mbc->ranges[mbc->nranges].beg = beg; mbc->ranges[mbc->nranges++].end = end; } /* Receive an "in-work" character class, which may or may not have members. Mbcset takes ownership of this set, and, depending on the circumstances, either maintains it internally, or else copies its contents (if any) to its internals, and releases (abandons) the supplied set. This function must not applied to a set that has been completed. */ void mbcsets_receive_incomplete_charclass (mbcsets_set_t *mbc, charclass_t *ccl) { charclass_unionset (ccl, mbc->charclass); charclass_abandon (ccl); } /* Mark a set as completed; the implementation may also analyse and optimise the set at this point (e.g. use charclasses to represent unibyte characters; merge overlapping ranges; remove the individual listing of a character if it is covered by a range, etc.) In addition, note that no further changes (e.g. receive another incomplete charclass) are allowed for this set, once "completed" is called. */ void mbcsets_completed (mbcsets_set_t *mbc) { charclass_t *zeroset; /* Did we end up putting anything into the charclass? */ zeroset = charclass_get_pointer (0); if (charclass_equal (mbc->charclass, zeroset)) { /* No, abandon the class, and use NULL as our sentinel. */ charclass_abandon (mbc->charclass); mbc->charclass = NULL; } else { /* Yes, complete the class, and obtain a persistent pointer. */ charclass_index_t index; index = charclass_completed (mbc->charclass); mbc->charclass = charclass_get_pointer (index); } } /* Retrieve high-level information about the class, which is useful (in fsaparse) for deciding on how to deal with it. We are forced to provide significant query resources since we demand that the type internal remain opaque (even though the initial implementation may do a poor job of this effort). */ void mbcsets_get_characteristics (mbcsets_set_t *mbc, bool *p_invert, charclass_t **pp_charclass, size_t *p_nchars, size_t *p_nch_classes, size_t *p_nranges, size_t *p_nequivs, size_t *p_ncoll_elems) { *p_invert = mbc->invert; *pp_charclass = mbc->charclass; *p_nchars = mbc->nchars; *p_nch_classes = mbc->nch_classes; *p_nranges = mbc->nranges; *p_nequivs = mbc->nequivs; *p_ncoll_elems = mbc->ncoll_elems; } /* Copy wide char list to caller's work area. */ void mbcsets_get_chars (mbcsets_set_t *mbc, wchar_t *char_list) { memcpy (char_list, mbc->chars, mbc->nchars * sizeof(*char_list)); } /* Prepare module for operation. */ void mbcsets_initialise (void) { /* Initialise the linked list of instances created by this module. */ mbcsets_instances_list_head = NULL; atexit (mbcsets_destroy_module); } /* Internal function to free all resources directly or indirectly used by an instance. The pointer is no longer valid after this call. */ static void free_instance (mbcsets_set_t *mbc) { size_t i; free (mbc->chars); free (mbc->ch_classes); free (mbc->ranges); for (i = 0; i < mbc->nequivs; ++i) free (mbc->equivs[i]); free (mbc->equivs); for (i = 0; i < mbc->ncoll_elems; ++i) free (mbc->coll_elems[i]); free (mbc->coll_elems); free (mbc); } /* Destroy all classes, plus any associated resources owned by the module. */ void mbcsets_destroy_module (void) { mbcsets_set_t *p_list; mbcsets_set_t *p_next; /* Move the global list head into a local variable, and immediately clear the global. This is a half-hearted attempt to avoid race conditions; to do things properly, a system-wide atomic operation (locked, including multi-CPU cache coherency) operation should be used. */ p_list = mbcsets_instances_list_head; mbcsets_instances_list_head = NULL; /* Traverse the list of instances, releasing all resources associated with each one. */ while (p_list) { p_next = p_list->next_instance; free_instance (p_list); p_list = p_next; } } /* Generate a new instance of a multibyte-character set descriptor. */ mbcsets_set_t * mbcsets_new (void) { mbcsets_set_t *new_set; /* Allocate memory for new instance. */ new_set = xzalloc (sizeof (*new_set)); /* Lint new instance into list of instances created by this module. */ new_set->next_instance = mbcsets_instances_list_head; mbcsets_instances_list_head = new_set; /* Allocate a charclass set to the instance, so we can easily push codes into that representation if possible. */ new_set->charclass = charclass_alloc (); /* Report created instance to the caller. */ return new_set; } /* vim:set shiftwidth=2: */