[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[ELPA-diffs] /srv/bzr/emacs/elpa r205: Add tNFA.el
From: |
Toby S. Cubitt |
Subject: |
[ELPA-diffs] /srv/bzr/emacs/elpa r205: Add tNFA.el |
Date: |
Sun, 29 Apr 2012 13:45:01 +0200 |
User-agent: |
Bazaar (2.3.1) |
------------------------------------------------------------
revno: 205
committer: Toby S. Cubitt <address@hidden>
branch nick: elpa
timestamp: Sun 2012-04-29 13:45:01 +0200
message:
Add tNFA.el
added:
packages/tNFA/
packages/tNFA/tNFA.el
=== added directory 'packages/tNFA'
=== added file 'packages/tNFA/tNFA.el'
--- a/packages/tNFA/tNFA.el 1970-01-01 00:00:00 +0000
+++ b/packages/tNFA/tNFA.el 2012-04-29 11:45:01 +0000
@@ -0,0 +1,1112 @@
+;;; tNFA.el --- tagged non-deterministic finite-state automata
+
+
+;; Copyright (C) 2008-2010, 2012 Free Software Foundation, Inc
+
+;; Author: Toby Cubitt <address@hidden>
+;; Version: 0.1.1
+;; Keywords: extensions, matching, data structures
+;; tNFA, NFA, DFA, finite state automata, automata, regexp
+;; Package-Requires: ((queue "0.1"))
+;; URL: http://www.dr-qubit.org/emacs.php
+;; Repository: http://www.dr-qubit.org/git/predictive.git
+
+;; This file is part of Emacs.
+;;
+;; GNU Emacs is free software: you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation, either version 3 of the License, or (at your option)
+;; any later version.
+;;
+;; GNU Emacs is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+;; more details.
+;;
+;; You should have received a copy of the GNU General Public License along
+;; with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
+
+
+;;; Commentary:
+;;
+;; A tagged, non-deterministic finite state automata (NFA) is an abstract
+;; computing machine that recognises regular languages. In layman's terms,
+;; they are used to decide whether a string matches a regular expression. The
+;; "tagged" part allows the NFA to do group-capture: it returns information
+;; about which parts of a string matched which subgroup of the regular
+;; expression.
+;;
+;; Why re-implement regular expression matching when Emacs comes with
+;; extensive built-in support for regexps? Primarily, because some algorithms
+;; require access to the NFA states produced part way through the regular
+;; expression matching process (see the trie.el package for an
+;; example). Secondarily, because Emacs regexps only work on strings, whereas
+;; regular expressions can usefully be used in Elisp code to match other
+;; sequence types, not just strings.
+;;
+;; A tagged NFA can be created from a regular expression using
+;; `tNFA-from-regexp', and its state can be updated using
+;; `tNFA-next-state'. You can discover whether a state is a matching state
+;; using `tNFA-match-p', extract subgroup capture data from it using
+;; `tNFA-group-data', check whether a state has any wildcard transitions using
+;; `tNFA-wildcard-p', and get a list of non-wildcard transitions using
+;; `tNFA-transitions'. Finally, `tNFA-regexp-match' uses tagged NFAs to decide
+;; whether a regexp matches a given string.
+;;
+;; Note that Emacs' regexps are not regular expressions in the original
+;; meaning of that phrase. Emacs regexps implement additional features (in
+;; particular, back-references) that allow them to match far more than just
+;; regular languages. This comes at a cost: regexp matching can potentially be
+;; very slow (NP-hard in fact, though the hard cases rarely crop up in
+;; practise), whereas there are efficient (polynomial-time) algorithms for
+;; matching regular expressions (in the original sense). Therefore, this
+;; package only supports a subset of the full Emacs regular expression
+;; syntax. See the function docstrings for more information.
+;;
+;; This package essentially implements Laurikari's algorithm, as described in
+;; his master's thesis, but it builds the corresponding tagged deterministic
+;; finite state automaton (DFA) on-the-fly as needed.
+;;
+;; This package uses the queue package queue.el.
+
+
+;;; Change Log:
+;;
+;; Version 0.1.1
+;; * work-around mysterious byte-compiler bug by defining
+;; `tNFA--NFA-state-create' and `tNFA--NFA-state-create-tag' via `defun'
+;; instead of directly in `defstruct'
+;;
+;; Version 0.1
+;; * initial version
+
+
+
+;;; Code:
+
+(eval-when-compile (require 'cl))
+(require 'queue)
+
+
+
+;;; ================================================================
+;;; Replcements for CL functions
+
+(defun* tNFA--assoc (item alist &key (test 'eq))
+ ;; Return first cons cell in ALIST whose CAR matches ITEM according to
+ ;; :test function (defaulting to `eq')
+ (while (and alist
+ (or (not (consp (car alist)))
+ (not (funcall test item (caar alist)))))
+ (setq alist (cdr alist)))
+ (car alist))
+
+
+
+;;; ================================================================
+;;; Data structures
+
+;;; ----------------------------------------------------------------
+;;; tagged NFA states
+
+(defstruct
+ (tNFA--state
+ (:constructor nil)
+ (:constructor tNFA--state-create-initial
+ (NFA-state num-tags min-tags max-tags
+ &aux
+ (tags (tNFA--tags-create num-tags min-tags max-tags))))
+ (:constructor tNFA--state-create (NFA-state tags))
+ (:copier nil))
+ NFA-state tags)
+
+(defmacro tNFA--state-id (state)
+ `(tNFA--NFA-state-id (tNFA--state-NFA-state ,state)))
+
+(defmacro tNFA--state-type (state)
+ `(tNFA--NFA-state-type (tNFA--state-NFA-state ,state)))
+
+(defmacro tNFA--state-label (state)
+ `(tNFA--NFA-state-label (tNFA--state-NFA-state ,state)))
+
+(defmacro tNFA--state-in-degree (state)
+ `(tNFA--NFA-state-in-degree (tNFA--state-NFA-state ,state)))
+
+(defmacro tNFA--state-next (state)
+ `(tNFA--NFA-state-next (tNFA--state-NFA-state ,state)))
+
+(defmacro tNFA--state-count (state)
+ `(tNFA--NFA-state-count (tNFA--state-NFA-state ,state)))
+
+
+
+;;; ----------------------------------------------------------------
+;;; NFA states
+
+(declare (special NFA--state-id))
+
+(defstruct
+ (tNFA--NFA-state
+ (:type vector)
+ (:constructor nil)
+ (:constructor tNFA---NFA-state-create
+ (&optional type label next
+ &aux
+ (in-degree 0)
+ (count 0)
+ (id (incf NFA--state-id))
+ ;; (dummy
+ ;; (when next
+ ;; (setf (tNFA--NFA-state-count next)
+ ;; (incf (tNFA--NFA-state-in-degree next)))))
+ ))
+ (:constructor tNFA--NFA-state-create-branch
+ (&rest next
+ &aux
+ (type 'branch)
+ (in-degree 0)
+ (count 0)
+ (id (incf NFA--state-id))))
+ (:constructor tNFA---NFA-state-create-tag
+ (tag &optional next
+ &aux
+ (type 'tag)
+ (label tag)
+ (in-degree 0)
+ (count 0)
+ (id (incf NFA--state-id))
+ ;; (dummy
+ ;; (when next
+ ;; (setf (tNFA--NFA-state-count next)
+ ;; (incf (tNFA--NFA-state-in-degree next)))))
+ ))
+ (:copier nil))
+ id type label in-degree
+ count tNFA-state ; used internally in NFA evolution algorithms
+ next)
+
+
+;; Define these via defun instead of using the dummy argument in the
+;; above defstruct to work around a mysterious byte-compiler bug.
+
+(defun tNFA--NFA-state-create (&optional type label next)
+ (when next
+ (setf (tNFA--NFA-state-count next)
+ (incf (tNFA--NFA-state-in-degree next))))
+ (tNFA---NFA-state-create type label next))
+
+(defun tNFA--NFA-state-create-tag (tag &optional next)
+ (when next
+ (setf (tNFA--NFA-state-count next)
+ (incf (tNFA--NFA-state-in-degree next))))
+ (tNFA---NFA-state-create-tag tag next))
+
+
+;; tag number for a tagged epsilon transition is stored in label slot
+(defalias 'tNFA--NFA-state-tag 'tNFA--NFA-state-label)
+
+(defmacro tNFA--NFA-state-tags (state)
+ `(tNFA--state-tags (tNFA--NFA-state-tNFA-state ,state)))
+
+
+(defun tNFA--NFA-state-patch (attach state)
+ ;; patch STATE onto ATTACH. Return value is meaningless
+ (setf
+ (tNFA--NFA-state-type attach)
+ (tNFA--NFA-state-type state)
+ (tNFA--NFA-state-label attach)
+ (tNFA--NFA-state-label state)
+ (tNFA--NFA-state-next attach)
+ (tNFA--NFA-state-next state)
+ (tNFA--NFA-state-count state)
+ (incf (tNFA--NFA-state-in-degree state))))
+
+
+(defun tNFA--NFA-state-make-epsilon (state next)
+ ;; create an epsilon transition from STATE to NEXT
+ (setf
+ (tNFA--NFA-state-type state) 'epsilon
+ (tNFA--NFA-state-label state) nil
+ (tNFA--NFA-state-next state) next
+ (tNFA--NFA-state-count next)
+ (incf (tNFA--NFA-state-in-degree next))))
+
+
+(defun tNFA--NFA-state-make-branch (state next)
+ ;; create a branch from STATE to all states in NEXT list
+ (setf (tNFA--NFA-state-type state) 'branch
+ (tNFA--NFA-state-label state) nil
+ (tNFA--NFA-state-next state) next)
+ (dolist (n next)
+ (setf (tNFA--NFA-state-count n)
+ (incf (tNFA--NFA-state-in-degree n)))))
+
+
+(defun tNFA--NFA-state-copy (state)
+ ;; Return a copy of STATE. The next link is *not* copied, it is `eq'
+ ;; to the original next link. Use `tNFA--fragment-copy' if you want to
+ ;; recursively copy a chain of states. Note: NFA--state-id must be
+ ;; bound to something appropriate when this function is called.
+ (let ((copy (copy-sequence state)))
+ (setf (tNFA--NFA-state-id copy) (incf NFA--state-id))
+ copy))
+
+
+
+;;; ----------------------------------------------------------------
+;;; NFA fragments
+
+(defstruct
+ (tNFA--fragment
+ (:type vector)
+ (:constructor nil)
+ (:constructor tNFA--fragment-create (initial final))
+ (:copier nil))
+ initial final)
+
+
+(defun tNFA--fragment-patch (frag1 frag2)
+ ;; patch FRAG2 onto end of FRAG1; return value is meaningless
+ (tNFA--NFA-state-patch (tNFA--fragment-final frag1)
+ (tNFA--fragment-initial frag2))
+ (setf (tNFA--fragment-final frag1) (tNFA--fragment-final frag2)))
+
+
+(defun tNFA--fragment-copy (fragment)
+ ;; return a copy of FRAGMENT.
+ (declare (special copied-states))
+ (let (copied-states)
+ (tNFA--fragment-create
+ (tNFA--do-fragment-copy (tNFA--fragment-initial fragment))
+ (cdr (assq (tNFA--fragment-final fragment) copied-states)))))
+
+
+(defun tNFA--do-fragment-copy (state)
+ ;; return a copy of STATE, recursively following and copying links
+ ;; (note: NFA--state-id must be bound to something appropriate when
+ ;; this is called)
+ (declare (special copied-states))
+ (let ((copy (tNFA--NFA-state-copy state)))
+ (push (cons state copy) copied-states)
+
+ ;; if STATE is a branch, copy all links
+ (cond
+ ((eq (tNFA--NFA-state-type copy) 'branch)
+ (setf (tNFA--NFA-state-next copy)
+ (mapcar (lambda (next)
+ (or (cdr (assq next copied-states))
+ (tNFA--do-fragment-copy next)))
+ (tNFA--NFA-state-next copy))))
+
+ ;; if state doesn't have a next link, return
+ ((or (eq (tNFA--NFA-state-type copy) 'match)
+ (null (tNFA--NFA-state-type copy))))
+
+ ;; otherwise, copy next link
+ ((tNFA--NFA-state-type copy)
+ ;; for a non-branch STATE, copy next link
+ (setf (tNFA--NFA-state-next copy)
+ ;; if we've already copied next state, set next link to that
+ (or (cdr (assq (tNFA--NFA-state-next copy) copied-states))
+ ;; otherwise, recursively copy next state
+ (tNFA--do-fragment-copy (tNFA--NFA-state-next copy))))))
+ copy))
+
+
+
+;;; ----------------------------------------------------------------
+;;; DFA states
+
+(defstruct
+ (tNFA--DFA-state
+ :named
+ (:constructor nil)
+ (:constructor tNFA--DFA-state--create
+ (list pool
+ &key
+ (test 'eq)
+ &aux
+ (transitions ())))
+ (:copier nil))
+ list transitions test wildcard match pool)
+
+
+(defun* tNFA--DFA-state-create (state-list state-pool &key (test 'eq))
+ ;; create DFA state and add it to the state pool
+ (let ((DFA-state (tNFA--DFA-state--create
+ state-list state-pool :test test))
+ tmp-list)
+ (puthash state-list DFA-state (tNFA--DFA-state-pool DFA-state))
+
+ (dolist (state state-list)
+ ;; if state in state list is...
+ (cond
+ ;; literal state: add literal transition
+ ((eq (tNFA--state-type state) 'literal)
+ (setq tmp-list (tNFA--DFA-state-transitions DFA-state))
+ (add-to-list 'tmp-list (cons (tNFA--state-label state) t))
+ (setf (tNFA--DFA-state-transitions DFA-state) tmp-list))
+
+ ;; character alternative: add transitions for all alternatives
+ ((eq (tNFA--state-type state) 'char-alt)
+ (dolist (c (tNFA--state-label state))
+ (setq tmp-list (tNFA--DFA-state-transitions DFA-state))
+ (add-to-list 'tmp-list (cons c t))
+ (setf (tNFA--DFA-state-transitions DFA-state) tmp-list)))
+
+ ;; wildcard or negated character alternative: add wildcard
+ ;; transistion
+ ((or (eq (tNFA--state-type state) 'wildcard)
+ (eq (tNFA--state-type state) 'neg-char-alt))
+ (setf (tNFA--DFA-state-wildcard DFA-state) t))
+
+ ;; match state: set match tags
+ ((eq (tNFA--state-type state) 'match)
+ (setf (tNFA--DFA-state-match DFA-state)
+ (tNFA--state-tags state)))))
+
+ ;; return constructed state
+ DFA-state))
+
+
+(defun* tNFA--DFA-state-create-initial (state-list &key (test 'eq))
+ ;; create initial DFA state from initial tNFA state INITIAL-STATE
+ (tNFA--DFA-state-create state-list
+ (make-hash-table :test 'equal)
+ :test test))
+
+
+(defalias 'tNFA-match-p 'tNFA--DFA-state-match
+ "Return non-nil if STATE is a matching state, otherwise return nil.")
+
+
+(defalias 'tNFA-wildcard-p 'tNFA--DFA-state-wildcard
+ "Return non-nil if STATE has a wildcard transition,
+ otherwise return nil.")
+
+
+(defun tNFA-transitions (state)
+ "Return list of literal transitions from tNFA state STATE."
+ (mapcar 'car (tNFA--DFA-state-transitions state)))
+
+
+
+;;; ----------------------------------------------------------------
+;;; tag tables
+
+(defun tNFA--tags-create (num-tags min-tags max-tags)
+ ;; construct a new tags table
+ (let ((vec (make-vector num-tags nil)))
+ (dolist (tag min-tags)
+ (aset vec tag (cons -1 'min)))
+ (dolist (tag max-tags)
+ (aset vec tag (cons -1 'max)))
+ vec))
+
+
+(defun tNFA--tags-copy (tags)
+ ;; return a copy of TAGS table
+ (let* ((len (length tags))
+ (vec (make-vector len nil)))
+ (dotimes (i len)
+ (aset vec i (cons (car (aref tags i))
+ (cdr (aref tags i)))))
+ vec))
+
+
+(defmacro tNFA--tags-set (tags tag val)
+ ;; set value of TAG in TAGS table to VAL
+ `(setcar (aref ,tags ,tag) ,val))
+
+
+(defmacro tNFA--tags-get (tags tag)
+ ;; get value of TAG in TAGS table
+ `(car (aref ,tags ,tag)))
+
+
+(defmacro tNFA--tags-type (tags tag)
+ ;; return tag type ('min or 'max)
+ `(cdr (aref ,tags ,tag)))
+
+
+(defun tNFA--tags< (val tag tags)
+ ;; return non-nil if VAL takes precedence over the value of TAG in
+ ;; TAGS table, nil otherwise
+ (setq tag (aref tags tag))
+ (or (and (eq (cdr tag) 'min)
+ (< val (car tag)))
+ ;;(and (eq (cdr tag) 'max)
+ (> val (car tag));)
+ ))
+
+
+(defun tNFA--tags-to-groups (tags)
+ ;; Convert TAGS table to a list of indices of group matches. The n'th
+ ;; element of the list is a cons cell, whose car is the starting index
+ ;; of the nth group and whose cdr is its end index. If a group didn't
+ ;; match, the corresponding list element will be null."
+ (let ((groups (make-list (/ (length tags) 2) nil))
+ group-stack
+ (grp 0))
+ (dotimes (i (length tags))
+ (if (eq (tNFA--tags-type tags i) 'max)
+ (unless (= (tNFA--tags-get tags i) -1)
+ (setf (nth (caar group-stack) groups)
+ (cons (cdr (pop group-stack))
+ (tNFA--tags-get tags i))))
+ (unless (= (tNFA--tags-get tags i) -1)
+ (push (cons grp (tNFA--tags-get tags i)) group-stack))
+ (incf grp)))
+ groups))
+
+
+
+
+;;; ================================================================
+;;; Regexp -> tNFA
+
+(defun* tNFA-from-regexp (regexp &key (test 'eq))
+ "Create a tagged NFA that recognizes the regular expression REGEXP.
+The return value is the initial state of the tagged NFA.
+
+REGEXP can be any sequence type (vector, list, or string); it
+need not be an actual string. Special characters in REGEXP are
+still just that: elements of the sequence that are characters
+which have a special meaning in regexps.
+
+The :test keyword argument specifies how to test whether two
+individual elements of STRING are identical. The default is `eq'.
+
+Only a subset of the full Emacs regular expression syntax is
+supported. There is no support for regexp constructs that are
+only meaningful for strings (character ranges and character
+classes inside character alternatives, and syntax-related
+backslash constructs). Back-references and non-greedy postfix
+operators are not supported, so `?' after a postfix operator
+loses its special meaning. Also, matches are always anchored, so
+`$' and `^' lose their special meanings (use `.*' at the
+beginning and end of the regexp to get an unanchored match)."
+
+ ;; convert regexp to list, build NFA, and return initial state
+ (declare (special NFA--state-id))
+ (destructuring-bind (fragment num-tags min-tags max-tags regexp)
+ (let ((NFA--state-id -1))
+ (tNFA--from-regexp (append regexp nil) 0 '() '() 'top-level))
+ (if regexp
+ (error "Syntax error in regexp: missing \"(\"")
+ (setf (tNFA--NFA-state-type (tNFA--fragment-final fragment))
+ 'match)
+ (tNFA--DFA-state-create-initial
+ (tNFA--epsilon-boundary
+ (list
+ (tNFA--state-create-initial
+ (tNFA--fragment-initial fragment) num-tags min-tags max-tags))
+ 0)
+ :test test)
+ )))
+
+
+(defmacro tNFA--regexp-postfix-p (regexp)
+ ;; return t if next token in REGEXP is a postfix operator, nil
+ ;; otherwise
+ `(or (eq (car ,regexp) ?*)
+ (eq (car ,regexp) ?+)
+ (eq (car ,regexp) ??)
+ (and (eq (car ,regexp) ?\\)
+ (cdr ,regexp)
+ (eq (cadr ,regexp) ?{))))
+
+
+(defun tNFA--from-regexp (regexp num-tags min-tags max-tags
+ &optional top-level shy-group)
+ ;; Construct a tagged NFA fragment from REGEXP, up to first end-group
+ ;; character or end of REGEXP. The TAGS arguments are used to pass the
+ ;; tags created so far. A non-nil TOP-LEVEL indicates that REGEXP is
+ ;; the complete regexp, so we're constructing the entire tNFA. A
+ ;; non-nil SHY-GROUP indicates that we're constructing a shy subgroup
+ ;; fragment. (Both optional arguments are only used for spotting
+ ;; syntax errors in REGEXP.)
+ ;;
+ ;; Returns a list: (FRAGMENT NUM-TAGS MIN-TAGS MAX-TAGS
+ ;; REGEXP). FRAGMENT is the constructed tNFA fragment, REGEXP is the
+ ;; remaining, unused portion of the regexp, and the TAGS return values
+ ;; give the tags created so far.
+
+ (let* ((new (tNFA--NFA-state-create))
+ (fragment-stack (list (tNFA--fragment-create new new)))
+ fragment copy attach token type group-end-tag)
+
+ (catch 'constructed
+ (while t
+ (setq regexp (tNFA--regexp-next-token regexp)
+ type (nth 0 regexp)
+ token (nth 1 regexp)
+ regexp (nth 2 regexp))
+ (setq fragment nil
+ group-end-tag nil)
+
+ ;; ----- construct new fragment -----
+ (cond
+ ;; syntax error: missing )
+ ((and (null type) (not top-level))
+ (error "Syntax error in regexp:\
+ extra \"(\" or missing \")\""))
+
+ ;; syntax error: extra )
+ ((and (eq type 'shy-group-end) top-level)
+ (error "Syntax error in regexp:\
+ extra \")\" or missing \"(\""))
+
+ ;; syntax error: ) ending a shy group
+ ((and (eq type 'shy-group-end) (not shy-group))
+ (error "Syntax error in regexp: \"(\" matched with \")?\""))
+
+ ;; syntax error: )? ending a group
+ ((and (eq type 'group-end) shy-group)
+ (error "Syntax error in regexp: \"(?\" matched with \")\""))
+
+ ;; syntax error: postfix operator not after atom
+ ((eq type 'postfix)
+ (error "Syntax error in regexp: unexpected \"%s\""
+ (char-to-string token)))
+
+
+ ;; regexp atom: construct new literal fragment
+ ((or (eq type 'literal) (eq type 'wildcard)
+ (eq type 'char-alt) (eq type 'neg-char-alt))
+ (setq new (tNFA--NFA-state-create
+ type token (tNFA--NFA-state-create))
+ fragment (tNFA--fragment-create
+ new (tNFA--NFA-state-next new))))
+
+ ;; shy subgroup start: recursively construct subgroup fragment
+ ((eq type 'shy-group-start)
+ (setq new (tNFA--from-regexp
+ regexp num-tags min-tags max-tags nil t)
+ num-tags (nth 1 new)
+ min-tags (nth 2 new)
+ max-tags (nth 3 new)
+ regexp (nth 4 new)
+ fragment (nth 0 new)))
+
+ ;; subgroup start: add minimize tag to current fragment, and
+ ;; recursively construct subgroup fragment
+ ((eq type 'group-start)
+ (setq new (tNFA--NFA-state-create))
+ (setq fragment
+ (tNFA--fragment-create
+ (tNFA--NFA-state-create-tag
+ (car (push (1- (incf num-tags)) min-tags))
+ new)
+ new))
+ (tNFA--fragment-patch (car fragment-stack) fragment)
+ ;; reserve next tag number for subgroup end tag
+ (setq group-end-tag num-tags)
+ (incf num-tags)
+ ;; recursively construct subgroup fragment
+ (setq new (tNFA--from-regexp
+ regexp num-tags min-tags max-tags)
+ num-tags (nth 1 new)
+ min-tags (nth 2 new)
+ max-tags (nth 3 new)
+ regexp (nth 4 new)
+ fragment (nth 0 new)))
+
+
+ ;; end of regexp or subgroup: ...
+ ((or (null type) (eq type 'shy-group-end) (eq type 'group-end))
+
+ ;; if fragment-stack contains only one fragment, throw
+ ;; fragment up to recursion level above
+ (cond
+ ((null (nth 1 fragment-stack))
+ (throw 'constructed
+ (list (car fragment-stack)
+ num-tags min-tags max-tags regexp)))
+
+ ;; if fragment-stack contains multiple alternation fragments,
+ ;; attach them all together
+ ;;
+ ;; .--fragment--.
+ ;; / \
+ ;; /----fragment----\
+ ;; / \
+ ;; ---o------fragment------o--->
+ ;; \ . /
+ ;; \ . /
+ ;; .
+ (t
+ ;; create a new fragment containing start and end of
+ ;; alternation
+ (setq fragment
+ (tNFA--fragment-create
+ (tNFA--NFA-state-create-branch)
+ (tNFA--NFA-state-create)))
+ ;; patch alternation fragments into new fragment
+ (dolist (frag fragment-stack)
+ (push (tNFA--fragment-initial frag)
+ (tNFA--NFA-state-next
+ (tNFA--fragment-initial fragment)))
+ (setf (tNFA--NFA-state-count
+ (tNFA--fragment-initial frag))
+ (incf (tNFA--NFA-state-in-degree
+ (tNFA--fragment-initial frag))))
+ (tNFA--NFA-state-make-epsilon (tNFA--fragment-final frag)
+ (tNFA--fragment-final fragment)))
+ ;; throw constructed fragment up to recursion level above
+ (throw 'constructed
+ (list fragment num-tags min-tags max-tags regexp)))
+ ))
+
+ ;; | alternation: start new fragment
+ ((eq type 'alternation)
+ (setq new (tNFA--NFA-state-create))
+ (push (tNFA--fragment-create new new) fragment-stack)))
+
+
+ ;; ----- attach new fragment -----
+ (when fragment
+ ;; if next token is not a postfix operator, attach new
+ ;; fragment onto end of current NFA fragment
+ (if (not (tNFA--regexp-postfix-p regexp))
+ (tNFA--fragment-patch (car fragment-stack) fragment)
+
+ ;; if next token is a postfix operator, splice new fragment
+ ;; into NFA as appropriate
+ (when (eq type 'alternation)
+ (error "Syntax error in regexp: unexpected \"%s\""
+ (char-to-string token)))
+ (setq regexp (tNFA--regexp-next-token regexp)
+ type (nth 0 regexp)
+ token (nth 1 regexp)
+ regexp (nth 2 regexp))
+
+ (while fragment
+ (setq attach (tNFA--fragment-final (car fragment-stack)))
+ (setq new (tNFA--NFA-state-create))
+ (cond
+
+ ;; * postfix = \{0,\}:
+ ;;
+ ;; .--fragment--.
+ ;; / \
+ ;; \ ______/
+ ;; \ /
+ ;; ---attach-----new---
+ ;;
+ ((and (eq (car token) 0) (null (cdr token)))
+ (tNFA--NFA-state-make-branch
+ attach (list (tNFA--fragment-initial fragment) new))
+ (tNFA--NFA-state-make-epsilon
+ (tNFA--fragment-final fragment) attach)
+ (setf (tNFA--fragment-final (car fragment-stack)) new)
+ (setq fragment nil))
+
+ ;; + postfix = \{1,\}:
+ ;;
+ ;; .----.
+ ;; / \
+ ;; / \
+ ;; \ /
+ ;; ---fragment-----new---
+ ;;
+ ((and (eq (car token) 1) (null (cdr token)))
+ (tNFA--NFA-state-patch
+ attach (tNFA--fragment-initial fragment))
+ (tNFA--NFA-state-make-branch
+ (tNFA--fragment-final fragment) (list attach new))
+ (setf (tNFA--fragment-final (car fragment-stack)) new)
+ (setq fragment nil))
+
+ ;; \{0,n\} (note: ? postfix = \{0,1\}):
+ ;;
+ ;; .--fragment--.
+ ;; / \
+ ;; ---attach new---
+ ;; \______________/
+ ;;
+ ((eq (car token) 0)
+ ;; ? postfix = \{0,1\}: after this we're done
+ (if (eq (cdr token) 1)
+ (setq copy nil)
+ (setq copy (tNFA--fragment-copy fragment)))
+ ;; attach fragment
+ (tNFA--NFA-state-make-branch
+ attach (list (tNFA--fragment-initial fragment) new))
+ (tNFA--NFA-state-make-epsilon
+ (tNFA--fragment-final fragment) new)
+ (setf (tNFA--fragment-final (car fragment-stack)) new)
+ ;; prepare for next iteration
+ (decf (cdr token))
+ (setq fragment copy))
+
+ ;; \{n,\} or \{n,m\}:
+ ;;
+ ;; ---attach----fragment----new---
+ ;;
+ (t
+ (setq copy (tNFA--fragment-copy fragment))
+ (tNFA--fragment-patch (car fragment-stack) fragment)
+ ;; prepare for next iteration
+ (decf (car token))
+ (when (cdr token) (decf (cdr token)))
+ (if (eq (cdr token) 0)
+ (setq fragment nil)
+ (setq fragment copy)))
+ )))
+
+
+ ;; if ending a group, add a maximize tag to end
+ (when group-end-tag
+ (setq new (tNFA--NFA-state-create)
+ fragment (tNFA--fragment-create
+ (tNFA--NFA-state-create-tag
+ group-end-tag new)
+ new))
+ (push group-end-tag max-tags)
+ (tNFA--fragment-patch (car fragment-stack) fragment)))
+ )) ; end of infinite loop and catch
+ ))
+
+
+
+;; Note: hard-coding the parsing like this is ugly, though sufficient
+;; for our purposes. Perhaps it would be more elegant to implement
+;; this in terms of a proper parser...
+
+(defun tNFA--regexp-next-token (regexp)
+ ;; if regexp is empty, return null values for next token type, token
+ ;; and remaining regexp
+ (if (null regexp)
+ (list nil nil nil)
+
+ (let ((token (pop regexp))
+ (type 'literal)) ; assume token is literal initially
+ (cond
+
+ ;; [: gobble up to closing ]
+ ((eq token ?\[)
+ ;; character alternatives are stored in lists
+ (setq token '())
+ (cond
+ ;; gobble ] appearing straight after [
+ ((eq (car regexp) ?\]) (push (pop regexp) token))
+ ;; gobble ] appearing straight after [^
+ ((and (eq (car regexp) ?^) (eq (nth 1 regexp) ?\]))
+ (push (pop regexp) token)
+ (push (pop regexp) token)))
+ ;; gobble everything up to closing ]
+ (while (not (eq (car regexp) ?\]))
+ (push (pop regexp) token)
+ (unless regexp
+ (error "Syntax error in regexp: missing \"]\"")))
+ (pop regexp) ; dump closing ]
+ (if (not (eq (car (last token)) ?^))
+ (setq type 'char-alt)
+ (setq type 'neg-char-alt)
+ (setq token (butlast token))))
+
+ ;; ]: syntax error (always gobbled when parsing [)
+ ((eq token ?\])
+ (error "Syntax error in regexp: missing \"[\""))
+
+ ;; . * + ?: set appropriate type
+ ((eq token ?*) (setq type 'postfix token (cons 0 nil)))
+ ((eq token ?+) (setq type 'postfix token (cons 1 nil)))
+ ((eq token ??) (setq type 'postfix token (cons 0 1)))
+ ((eq token ?.) (setq type 'wildcard))
+
+ ;; \: look at next character
+ ((eq token ?\\)
+ (unless (setq token (pop regexp))
+ (error "Syntax error in regexp:\
+ missing character after \"\\\""))
+ (cond
+ ;; |: alternation
+ ((eq token ?|) (setq type 'alternation))
+ ;; \(?: shy group start
+ ((and (eq token ?\() (eq (car regexp) ??))
+ (setq type 'shy-group-start)
+ (pop regexp))
+ ;; \)?: shy group end
+ ((and (eq token ?\)) (eq (car regexp) ??))
+ (setq type 'shy-group-end)
+ (pop regexp))
+ ;; \(: group start
+ ((eq token ?\() (setq type 'group-start))
+ ;; \): group end
+ ((eq token ?\)) (setq type 'group-end))
+
+ ;; \{: postfix repetition operator
+ ((eq token ?{)
+ (setq type 'postfix token (cons nil nil))
+ ;; extract first number from repetition operator
+ (while (if (null regexp)
+ (error "Syntax error in regexp:\
+ malformed \\{...\\}")
+ (not (or (eq (car regexp) ?,)
+ (eq (car regexp) ?\\))))
+ (setcar token
+ (concat (car token) (char-to-string (pop regexp)))))
+ (if (null (car token))
+ (setcar token 0)
+ (unless (string-match "[0-9]+" (car token))
+ (error "Syntax error in regexp: malformed \\{...\\}"))
+ (setcar token (string-to-number (car token))))
+ (cond
+ ;; if next character is "\", we expect "}" to follow
+ ((eq (car regexp) ?\\)
+ (pop regexp)
+ (unless (eq (car regexp) ?})
+ (error "Syntax error in regexp: expected \"}\""))
+ (pop regexp)
+ (unless (car token)
+ (error "Syntax error in regexp: malformed \\{...\\}"))
+ (setcdr token (car token)))
+ ;; if next character is ",", we expect a second number to
+ ;; follow
+ ((eq (car regexp) ?,)
+ (pop regexp)
+ (while (if (null regexp)
+ (error "Syntax error in regexp:\
+ malformed \\{...\\}")
+ (not (eq (car regexp) ?\\)))
+ (setcdr token
+ (concat (cdr token)
+ (char-to-string (pop regexp)))))
+ (unless (null (cdr token))
+ (unless (string-match "[0-9]+" (cdr token))
+ (error "Syntax error in regexp: malformed \\{...\\}"))
+ (setcdr token (string-to-number (cdr token))))
+ (pop regexp)
+ (unless (eq (car regexp) ?})
+ (error "Syntax error in regexp: expected \"}\""))
+ (pop regexp))))
+ ))
+ )
+
+ ;; return first token type, token, and remaining regexp
+ (list type token regexp))))
+
+
+
+;;; ================================================================
+;;; tNFA evolution
+
+(defun tNFA-next-state (tNFA chr pos)
+ "Evolve tNFA according to CHR, which corresponds to position
+POS in a string."
+ (let (elem state)
+ ;; if there is a transition for character CHR...
+ (cond
+ ((setq elem (tNFA--assoc chr (tNFA--DFA-state-transitions tNFA)
+ :test (tNFA--DFA-state-test tNFA)))
+ ;; if next state has not already been computed, do so
+ (unless (tNFA--DFA-state-p (setq state (cdr elem)))
+ (setq state (tNFA--DFA-next-state tNFA chr pos nil))
+ (setcdr elem state)))
+
+ ;; if there's a wildcard transition...
+ ((setq state (tNFA--DFA-state-wildcard tNFA))
+ ;; if next state has not already been computed, do so
+ (unless (tNFA--DFA-state-p state)
+ (setq state (tNFA--DFA-next-state tNFA chr pos t))
+ (setf (tNFA--DFA-state-wildcard tNFA) state))))
+ state))
+
+
+
+(defun tNFA--DFA-next-state (DFA-state chr pos wildcard)
+ (let (state-list state)
+ ;; add all states reached by a CHR transition from DFA-STATE to
+ ;; state list
+ (if wildcard
+ (dolist (state (tNFA--DFA-state-list DFA-state))
+ (when (or (eq (tNFA--state-type state) 'wildcard)
+ (and (eq (tNFA--state-type state) 'neg-char-alt)
+ (not (memq chr (tNFA--state-label state)))))
+ (push (tNFA--state-create
+ (tNFA--state-next state)
+ (tNFA--tags-copy (tNFA--state-tags state)))
+ state-list)))
+ (dolist (state (tNFA--DFA-state-list DFA-state))
+ (when (or (and (eq (tNFA--state-type state) 'literal)
+ (eq chr (tNFA--state-label state)))
+ (and (eq (tNFA--state-type state) 'char-alt)
+ (memq chr (tNFA--state-label state)))
+ (and (eq (tNFA--state-type state) 'neg-char-alt)
+ (not (memq chr (tNFA--state-label state))))
+ (eq (tNFA--state-type state) 'wildcard))
+ (push (tNFA--state-create
+ (tNFA--state-next state)
+ (tNFA--tags-copy (tNFA--state-tags state)))
+ state-list))))
+
+ ;; if state list is empty, return empty, failure DFA state
+ (when state-list
+ ;; otherwise, construct new DFA state and add it to the pool if
+ ;; it's not already there
+ (setq state-list (tNFA--epsilon-boundary state-list (1+ pos)))
+ (setq state
+ (or (gethash state-list (tNFA--DFA-state-pool DFA-state))
+ (tNFA--DFA-state-create
+ state-list
+ (tNFA--DFA-state-pool DFA-state)
+ :test (tNFA--DFA-state-test DFA-state))))
+ ;; return next state
+ state)))
+
+
+
+(defun tNFA--epsilon-boundary (state-set pos)
+ ;; Return the tagged epsilon-boundary of the NFA states listed in
+ ;; STATE-SET, that is the set of all states that can be reached via
+ ;; epsilon transitions from some state in STATE-SET (not including
+ ;; states in STATE-SET itself).
+ (let ((queue (queue-create))
+ (result '())
+ (reset '())
+ state next tags)
+ ;; temporarily link the NFA states to their corresponding tNFA
+ ;; states, and add them to the queue
+ (dolist (t-state state-set)
+ (setf state (tNFA--state-NFA-state t-state)
+ (tNFA--NFA-state-tNFA-state state) t-state)
+ (push state reset)
+ (queue-enqueue queue state))
+
+ (while (setq state (queue-dequeue queue))
+ (cond
+ ;; branch or epsilon: add next states as necessary, copying tags
+ ;; across
+ ((or (eq (tNFA--NFA-state-type state) 'branch)
+ (eq (tNFA--NFA-state-type state) 'epsilon))
+ (dolist (next (if (eq (tNFA--NFA-state-type state) 'epsilon)
+ (list (tNFA--NFA-state-next state))
+ (tNFA--NFA-state-next state)))
+ (unless (tNFA--NFA-state-tNFA-state next)
+ (setf (tNFA--NFA-state-tNFA-state next)
+ (tNFA--state-create
+ next (tNFA--tags-copy (tNFA--NFA-state-tags state))))
+ (push next reset)
+ ;; if next state hasn't already been seen in-degree times,
+ ;; add it to the end of the queue
+ (if (/= (decf (tNFA--NFA-state-count next)) 0)
+ (queue-enqueue queue next)
+ ;; if it has now been seen in-degree times, reset count
+ ;; and add it back to the front of the queue
+ (setf (tNFA--NFA-state-count next)
+ (tNFA--NFA-state-in-degree next))
+ (queue-prepend queue next)))))
+
+ ;; tag: add next state if necessary, updating tags if necessary
+ ((eq (tNFA--NFA-state-type state) 'tag)
+ (setq next (tNFA--NFA-state-next state))
+ ;; if next state is not already in results list, or it is
+ ;; already in results but new tag value takes precedence...
+ (when (or (not (tNFA--NFA-state-tNFA-state next))
+ (tNFA--tags< pos (tNFA--NFA-state-tag state)
+ (tNFA--NFA-state-tags next)))
+ ;; if next state is already in results, update tag value
+ (if (tNFA--NFA-state-tNFA-state next)
+ (tNFA--tags-set (tNFA--NFA-state-tags next)
+ (tNFA--NFA-state-tag state) pos)
+ ;; if state is not already in results, copy tags, updating
+ ;; tag value, and add next state to results list
+ (setq tags (tNFA--tags-copy (tNFA--NFA-state-tags state)))
+ (tNFA--tags-set tags (tNFA--NFA-state-tag state) pos)
+ (setf (tNFA--NFA-state-tNFA-state next)
+ (tNFA--state-create next tags))
+ (push next reset))
+ ;; if next state hasn't already been seen in-degree times, add
+ ;; it to the end of the queue
+ (if (/= (decf (tNFA--NFA-state-count next)) 0)
+ (queue-enqueue queue next)
+ ;; if it has now been seen in-degree times, reset count and
+ ;; add it back to the front of the queue
+ (setf (tNFA--NFA-state-count next)
+ (tNFA--NFA-state-in-degree next))
+ (queue-prepend queue next))))
+
+ ;; anything else is a non-epsilon-transition state, so add it to
+ ;; result
+ (t (push (tNFA--NFA-state-tNFA-state state) result))
+ ))
+
+ ;; reset temporary NFA state link and count
+ (dolist (state reset)
+ (setf (tNFA--NFA-state-tNFA-state state) nil
+ (tNFA--NFA-state-count state)
+ (tNFA--NFA-state-in-degree state)))
+ ;; sort result states
+ (sort result
+ (lambda (a b) (< (tNFA--state-id a) (tNFA--state-id b))))
+ ))
+
+
+
+;;; ================================================================
+;;; tNFA matching
+
+(defun* tNFA-regexp-match (regexp string &key (test 'eq))
+ "Return non-nil if STRING matches REGEXP, nil otherwise.
+Sets the match data if there was a match; see `match-beginning',
+`match-end' and `match-string'.
+
+REGEXP and STRING can be any sequence type (vector, list, or
+string); they need not be actual strings. Special characters in
+REGEXP are still just that: elements of the sequence that are
+characters which have a special meaning in regexps.
+
+The :test keyword argument specifies how to test whether two
+individual elements of STRING are identical. The default is `eq'.
+
+Only a subset of the full Emacs regular expression syntax is
+supported. There is no support for regexp constructs that are
+only meaningful for strings (character ranges and character
+classes inside character alternatives, and syntax-related
+backslash constructs). Back-references and non-greedy postfix
+operators are not supported, so `?' after a postfix operator
+loses its special meaning. Also, matches are always anchored, so
+`$' and `^' lose their special meanings (use `.*' at the
+beginning and end of the regexp to get an unanchored match)."
+
+ (let ((tNFA (tNFA-from-regexp regexp :test test))
+ (i -1) tags match-data group-stack (grp 0))
+
+ ;; evolve tNFA according to characters of STRING
+ (catch 'fail
+ (dolist (chr (append string nil))
+ (unless (setq tNFA (tNFA-next-state tNFA chr (incf i)))
+ (throw 'fail nil)))
+
+ ;; if REGEXP matched...
+ (when (setq tags (tNFA--DFA-state-match tNFA))
+ (setq match-data (make-list (+ (length tags) 2) nil))
+ ;; set match data
+ (setf (nth 0 match-data) 0
+ (nth 1 match-data) (length string))
+ ;; set group match data if there were any groups
+ (dotimes (i (length tags))
+ (if (eq (tNFA--tags-type tags i) 'max)
+ (unless (= (tNFA--tags-get tags i) -1)
+ (setf (nth (1+ (* 2 (pop group-stack))) match-data)
+ (tNFA--tags-get tags i)))
+ (incf grp)
+ (unless (= (tNFA--tags-get tags i) -1)
+ (push grp group-stack)
+ (setf (nth (* 2 grp) match-data)
+ (tNFA--tags-get tags i)))))
+ (set-match-data match-data)
+ tags))))
+
+
+(defun tNFA-group-data (tNFA)
+ "Return the group match data associated with a tNFA state."
+ (tNFA--tags-to-groups (tNFA--DFA-state-match tNFA)))
+
+
+
+(provide 'tNFA)
+
+;;; tNFA.el ends here
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [ELPA-diffs] /srv/bzr/emacs/elpa r205: Add tNFA.el,
Toby S. Cubitt <=