>From bd59d72dca9da87ac11d4a0e77eefd1adb5c9147 Mon Sep 17 00:00:00 2001
From: Peter Bex
Date: Sun, 16 Sep 2012 20:04:14 +0200
Subject: [PATCH 2/4] Irregex: Implement Laurikari's algorithm for
tNFA->(t)DFA compilation. (upstream changesets
6ec98fa4f5a9 and 71c42f9974ce)
---
irregex-core.scm | 938 ++++++++++++++++++++++++------------------------
irregex.scm | 11 +-
tests/test-irregex.scm | 4 +-
types.db | 34 +-
4 files changed, 500 insertions(+), 487 deletions(-)
diff --git a/irregex-core.scm b/irregex-core.scm
index 017e090..e0fc210 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -101,24 +101,21 @@
(define (irregex-dfa/search x)
(internal "##sys#check-structure" x 'regexp 'irregex-dfa/search)
(internal "##sys#slot" x 2))
- (define (irregex-dfa/extract x)
- (internal "##sys#check-structure" x 'regexp 'irregex-dfa/extract)
- (internal "##sys#slot" x 3))
(define (irregex-nfa x)
(internal "##sys#check-structure" x 'regexp 'irregex-nfa)
- (internal "##sys#slot" x 4))
+ (internal "##sys#slot" x 3))
(define (irregex-flags x)
(internal "##sys#check-structure" x 'regexp 'irregex-flags)
- (internal "##sys#slot" x 5))
+ (internal "##sys#slot" x 4))
(define (irregex-num-submatches x)
(internal "##sys#check-structure" x 'regexp 'irregex-num-submatches)
- (internal "##sys#slot" x 6))
+ (internal "##sys#slot" x 5))
(define (irregex-lengths x)
(internal "##sys#check-structure" x 'regexp 'irregex-lengths)
- (internal "##sys#slot" x 7))
+ (internal "##sys#slot" x 6))
(define (irregex-names x)
(internal "##sys#check-structure" x 'regexp 'irregex-names)
- (internal "##sys#slot" x 8))
+ (internal "##sys#slot" x 7))
;; make-irregex-match defined elsewhere
(define (irregex-new-matches irx)
(make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
@@ -174,22 +171,19 @@
(else
(begin
(define irregex-tag '*irregex-tag*)
- (define (make-irregex dfa dfa/search dfa/extract nfa flags
- submatches lengths names)
- (vector irregex-tag dfa dfa/search dfa/extract nfa flags
- submatches lengths names))
+ (define (make-irregex dfa dfa/search nfa flags submatches lengths names)
+ (vector irregex-tag dfa dfa/search nfa flags submatches lengths names))
(define (irregex? obj)
(and (vector? obj)
- (= 9 (vector-length obj))
+ (= 8 (vector-length obj))
(eq? irregex-tag (vector-ref obj 0))))
(define (irregex-dfa x) (vector-ref x 1))
(define (irregex-dfa/search x) (vector-ref x 2))
- (define (irregex-dfa/extract x) (vector-ref x 3))
- (define (irregex-nfa x) (vector-ref x 4))
- (define (irregex-flags x) (vector-ref x 5))
- (define (irregex-num-submatches x) (vector-ref x 6))
- (define (irregex-lengths x) (vector-ref x 7))
- (define (irregex-names x) (vector-ref x 8))
+ (define (irregex-nfa x) (vector-ref x 3))
+ (define (irregex-flags x) (vector-ref x 4))
+ (define (irregex-num-submatches x) (vector-ref x 5))
+ (define (irregex-lengths x) (vector-ref x 6))
+ (define (irregex-names x) (vector-ref x 7))
(define (irregex-new-matches irx)
(make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
(define (irregex-reset-matches! m)
@@ -255,6 +249,14 @@
(define (irregex-match-end-index-set! m n end)
(vector-set! m (+ 6 (* n 4)) end))
+;; Tags use indices that are aligned to start/end positions just like the
+;; match vectors. ie, a tag 0 is a start tag, 1 is its corresponding end tag.
+;; They start at 0, which requires us to map them to submatch index 1.
+;; Sorry for the horrible name ;)
+(define (irregex-match-chunk&index-from-tag-set! m t chunk index)
+ (vector-set! m (+ 7 (* t 2)) chunk)
+ (vector-set! m (+ 8 (* t 2)) index))
+
;; Helper procedure to convert any type of index from a rest args list
;; to a numeric index. Named submatches are converted to their corresponding
;; numeric index, and numeric submatches are checked for validity.
@@ -1610,8 +1612,6 @@
(nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
(else #f)))
(submatches (sre-count-submatches sre-dfa))
- (extractor
- (and dfa dfa/search (sre-match-extractor sre-dfa submatches)))
(names (sre-names sre-dfa 1 '()))
(lens (sre-length-ranges sre-dfa names))
(flags (flag-join
@@ -1619,10 +1619,10 @@
(and (sre-consumer? sre) ~consumer?))))
(cond
(dfa
- (make-irregex dfa dfa/search extractor #f flags submatches lens names))
+ (make-irregex dfa dfa/search #f flags submatches lens names))
(else
(let ((f (sre->procedure sre pat-flags names)))
- (make-irregex #f #f #f f flags submatches lens names))))))
+ (make-irregex #f #f f flags submatches lens names))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; SRE Analysis
@@ -1968,11 +1968,6 @@
((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0)
(irregex-match-start-chunk-set! matches 0 src)
(irregex-match-start-index-set! matches 0 i)
- ((irregex-dfa/extract irx)
- cnk src i
- (%irregex-match-end-chunk matches 0)
- (%irregex-match-end-index matches 0)
- matches)
matches)
(else
#f)))
@@ -1989,11 +1984,6 @@
((dfa-match/longest dfa cnk src i #f #f matches 0)
(irregex-match-start-chunk-set! matches 0 src)
(irregex-match-start-index-set! matches 0 i)
- ((irregex-dfa/extract irx)
- cnk src i
- (%irregex-match-end-chunk matches 0)
- (%irregex-match-end-index matches 0)
- matches)
matches)
((>= i end)
(let ((next (get-next src)))
@@ -2063,11 +2053,6 @@
(irregex-match-start-index-set! matches
0
((chunker-get-start cnk) src))
- ((irregex-dfa/extract irx)
- cnk src ((chunker-get-start cnk) src)
- (%irregex-match-end-chunk matches 0)
- (%irregex-match-end-index matches 0)
- matches)
matches)))
(else
(let* ((matcher (irregex-nfa irx))
@@ -2100,8 +2085,10 @@
(define (dfa-init-state dfa)
(vector-ref dfa 0))
(define (dfa-next-state dfa node)
- (vector-ref dfa (cdr node)))
-(define (dfa-final-state? dfa state)
+ (vector-ref dfa (cadr node)))
+(define (dfa-cell-commands dfa node)
+ (cddr node))
+(define (dfa-finalizer dfa state)
(car state))
;; this searches for the first end index for which a match is possible
@@ -2109,15 +2096,17 @@
(let ((get-str (chunker-get-str cnk))
(get-start (chunker-get-start cnk))
(get-end (chunker-get-end cnk))
- (get-next (chunker-get-next cnk)))
- (let lp1 ((src src) (start start) (state (dfa-init-state dfa)))
+ (get-next (chunker-get-next cnk))
+ ;; Skip the "set-up" state, we don't need to set tags.
+ (start-state (dfa-next-state dfa (cadr (dfa-init-state dfa)))))
+ (let lp1 ((src src) (start start) (state start-state))
(and
src
(let ((str (get-str src))
(end (get-end src)))
(let lp2 ((i start) (state state))
(cond
- ((dfa-final-state? dfa state)
+ ((dfa-finalizer dfa state)
(cond
(index
(irregex-match-end-chunk-set! matches index src)
@@ -2135,28 +2124,60 @@
(let ((next (get-next src)))
(and next (lp1 next (get-start next) state)))))))))))
+(define (finalize! finalizer memory matches)
+ (for-each
+ (lambda (tag&slot)
+ (let* ((tag (car tag&slot))
+ (slot (vector-ref memory (cdr tag&slot)))
+ (chunk&pos (vector-ref slot tag)))
+ (irregex-match-chunk&index-from-tag-set!
+ matches tag
+ (and chunk&pos (car chunk&pos))
+ (and chunk&pos (cdr chunk&pos)))))
+ finalizer))
+(define (make-initial-memory slots matches)
+ (let ((size (* (irregex-match-num-submatches matches) 2))
+ (memory (make-vector slots)))
+ (do ((i 0 (+ i 1)))
+ ((= i slots) memory)
+ (vector-set! memory i (make-vector size #f)))))
+
;; this finds the longest match starting at a given index
(define (dfa-match/longest dfa cnk src start end-src end matches index)
- (let ((get-str (chunker-get-str cnk))
- (get-start (chunker-get-start cnk))
- (get-end (chunker-get-end cnk))
- (get-next (chunker-get-next cnk))
- (start-is-final? (dfa-final-state? dfa (dfa-init-state dfa))))
+ (let* ((get-str (chunker-get-str cnk))
+ (get-start (chunker-get-start cnk))
+ (get-end (chunker-get-end cnk))
+ (get-next (chunker-get-next cnk))
+ (initial-state (dfa-init-state dfa))
+ (memory-size (car initial-state))
+ (submatches? (not (zero? memory-size)))
+ ;; A vector of vectors, each of size
+ (memory (make-initial-memory memory-size matches))
+ (init-cell (cadr initial-state))
+ (start-state (dfa-next-state dfa init-cell))
+ (start-finalizer (dfa-finalizer dfa start-state)))
(cond
(index
(irregex-match-end-chunk-set! matches index #f)
(irregex-match-end-index-set! matches index #f)))
+ (cond (submatches?
+ (for-each (lambda (s)
+ (let ((slot (vector-ref memory (cdr s))))
+ (vector-set! slot (car s) (cons src start))))
+ (cdr (dfa-cell-commands dfa init-cell)))))
(let lp1 ((src src)
(start start)
- (state (dfa-init-state dfa))
- (res-src (and start-is-final? src))
- (res-index (and start-is-final? start)))
+ (state start-state)
+ (res-src (and start-finalizer src))
+ (res-index (and start-finalizer start))
+ (finalizer start-finalizer))
(let ((str (get-str src))
(end (if (eq? src end-src) end (get-end src))))
(let lp2 ((i start)
(state state)
(res-src res-src)
- (res-index res-index))
+ (res-index res-index)
+ (finalizer finalizer))
(cond
((>= i end)
(cond
@@ -2165,9 +2186,11 @@
(irregex-match-end-index-set! matches index res-index)))
(let ((next (and (not (eq? src end-src)) (get-next src))))
(if next
- (lp1 next (get-start next) state res-src res-index)
+ (lp1 next (get-start next) state res-src res-index finalizer)
(and index
(%irregex-match-end-chunk matches index)
+ (or (not submatches?)
+ (finalize! finalizer memory matches))
#t))))
(else
(let* ((ch (string-ref str i))
@@ -2178,17 +2201,37 @@
(cdr state))))
(cond
(cell
+ (cond
+ (submatches?
+ (let ((cmds (dfa-cell-commands dfa cell)))
+ (for-each (lambda (s)
+ (let ((slot (vector-ref memory (cdr s)))
+ (chunk&position (cons src (+ i 1))))
+ (vector-set! slot (car s) chunk&position)))
+ (cdr cmds))
+ (for-each (lambda (c)
+ (let* ((tag (vector-ref c 0))
+ (ss (vector-ref memory (vector-ref c 1)))
+ (ds (vector-ref memory (vector-ref c 2))))
+ (vector-set! ds tag (vector-ref ss tag))))
+ (car cmds)))))
(let ((next (dfa-next-state dfa cell)))
- (if (dfa-final-state? dfa next)
- (lp2 (+ i 1) next src (+ i 1))
- (lp2 (+ i 1) next res-src res-index))))
+ (cond
+ ((dfa-finalizer dfa next) =>
+ (lambda (new-finalizer)
+ (lp2 (+ i 1) next src (+ i 1) new-finalizer)))
+ (else (lp2 (+ i 1) next res-src res-index finalizer)))))
(res-src
(cond
(index
(irregex-match-end-chunk-set! matches index res-src)
(irregex-match-end-index-set! matches index res-index)))
+ (cond (submatches?
+ (finalize! finalizer memory matches)))
#t)
((and index (%irregex-match-end-chunk matches index))
+ (cond (submatches?
+ (finalize! finalizer memory matches)))
#t)
(else
#f))))))))))
@@ -2329,6 +2372,10 @@
(define (nfa-num-tags nfa)
(vector-ref nfa 0))
+(define (nfa-highest-map-index nfa)
+ (vector-ref nfa 1))
+(define (nfa-set-highest-map-index! nfa idx)
+ (vector-set! nfa 1 idx))
(define (nfa-get-state-trans nfa i)
(if (= i 0) '() (vector-ref nfa (* i *nfa-num-fields*))))
@@ -2336,7 +2383,7 @@
(vector-set! nfa (* i *nfa-num-fields*) x))
(define (nfa-get-epsilons nfa i)
- (vector-ref nfa (+ (* i *nfa-num-fields*) 1)))
+ (if (= i 0) '() (vector-ref nfa (+ (* i *nfa-num-fields*) 1))))
(define (nfa-set-epsilons! nfa i x)
(vector-set! nfa (+ (* i *nfa-num-fields*) 1) x))
(define (nfa-add-epsilon! nfa i x t)
@@ -2344,21 +2391,24 @@
(if (not (assv x eps))
(nfa-set-epsilons! nfa i (cons (cons x t) eps)))))
-(define (nfa-get-state-closure nfa i)
- (vector-ref nfa (+ (* i *nfa-num-fields*) 2)))
-(define (nfa-set-state-closure! nfa i x)
- (vector-set! nfa (+ (* i *nfa-num-fields*) 2) x))
+(define (nfa-get-reorder-commands nfa mst)
+ (cond ((assoc mst
+ (vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst)
+ *nfa-num-fields*) 2)))
+ => cdr)
+ (else #f)))
+(define (nfa-set-reorder-commands! nfa mst x)
+ (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*) 2)))
+ (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
(define (nfa-get-closure nfa mst)
(cond ((assoc mst
(vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst)
- *nfa-num-fields*)
- (- *nfa-num-fields* 1))))
+ *nfa-num-fields*) 3)))
=> cdr)
(else #f)))
(define (nfa-add-closure! nfa mst x)
- (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*)
- (- *nfa-num-fields* 1))))
+ (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*) 3)))
(vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
;; Compile and return the vector of NFA states (in groups of
@@ -2377,6 +2427,9 @@
;; We abuse the transitions slot for state 0 (the final state,
;; which can have no transitions) to store the number of tags.
(vector-set! buf 0 (* max 2))
+ ;; We abuse the epsilons slot for state 0 to store the highest
+ ;; encountered memory slot mapping index. Initialize to -1.
+ (vector-set! buf 1 -1)
res)
((pair? (car sre))
;; The appends here should be safe (are they?)
@@ -2615,445 +2668,400 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; NFA multi-state representation
-;; Cache closures in a simple hash-table keyed on the smallest state
-;; (define (nfa-multi-state-hash nfa mst)
-;; (car mst))
-
-;; Original sorted list-based representation
-
-;; (define (make-nfa-multi-state nfa)
-;; '())
-
-;; (define (nfa-state->multi-state nfa state)
-;; (list state))
-
-;; (define (nfa-multi-state-copy mst)
-;; (map (lambda (x) x) mst))
-
-;; (define (list->nfa-multi-state nfa ls)
-;; (nfa-multi-state-copy ls))
-
-;; (define (nfa-multi-state-contains? mst i)
-;; (memq i mst))
-
-;; (define (nfa-multi-state-fold mst kons knil)
-;; (fold kons knil mst))
-
-;; (define (nfa-multi-state-add! mst i)
-;; (insert-sorted i mst))
-
-;; (define (nfa-multi-state-add mst i)
-;; (insert-sorted i mst))
-
-;; (define (nfa-multi-state-union a b)
-;; (merge-sorted a b))
-
-;; Sorted List Utilities
-
-;; (define (insert-sorted n ls)
-;; (cond
-;; ((null? ls)
-;; (cons n '()))
-;; ((<= n (car ls))
-;; (if (= n (car ls))
-;; ls
-;; (cons n ls)))
-;; (else
-;; (cons (car ls) (insert-sorted n (cdr ls))))))
-
-;; (define (insert-sorted! n ls)
-;; (cond
-;; ((null? ls)
-;; (cons n '()))
-;; ((<= n (car ls))
-;; (if (= n (car ls))
-;; ls
-;; (cons n ls)))
-;; (else
-;; (let lp ((head ls) (tail (cdr ls)))
-;; (cond ((or (null? tail) (< n (car tail)))
-;; (set-cdr! head (cons n tail)))
-;; ((> n (car tail))
-;; (lp tail (cdr tail)))))
-;; ls)))
-
-;; (define (merge-sorted a b)
-;; (cond ((null? a) b)
-;; ((null? b) a)
-;; ((< (car a) (car b))
-;; (cons (car a) (merge-sorted (cdr a) b)))
-;; ((> (car a) (car b))
-;; (cons (car b) (merge-sorted a (cdr b))))
-;; (else (merge-sorted (cdr a) b))))
-
-;; ========================================================= ;;
-
-;; Presized bit-vector based
-
(define (nfa-multi-state-hash nfa mst)
- (modulo (vector-ref mst 0) (nfa-num-states nfa)))
+ ;; We could do (modulo X (nfa-num-states nfa)) here which would be faster,
+ ;; but we can't assume a full numerical tower (and updating *could*
+ ;; produce a bignum), so we do it each time when updating the hash.
+ (vector-ref mst 2))
+
+;; Returns #f if NFA state does not occur in multi-state
+(define (nfa-state-mappings mst state)
+ (vector-ref mst (+ state 3)))
+
+(define (nfa-multi-state-mappings-summary mst)
+ (vector-ref mst 0))
+
+;; A multi-state holds a set of states with their tag-to-slot mappings.
+;; Slot 0 contains a summary of all mappings for all states in the multi-state.
+;; Slot 1 contains the total number of states in the multi-state.
+;; Slot 2 contains a hash value, which is used for quick lookup of cached
+;; reorder-commands or epsilon-closure in the NFA. This is the sum of all
+;; state numbers plus each tag value (once per occurrence). This is a silly
+;; hashing calculation, but it seems to produce a well-spread out hash table and
+;; it has the added advantage that we can use the value as a quick check if the
+;; state is definitely NOT equivalent to another in nfa-multi-state-same-states?
+;; The other slots contain mappings for each corresponding state.
(define (make-nfa-multi-state nfa)
- (make-vector (quotient (+ (nfa-num-states nfa) 24 -1) 24) 0))
-
-(define (nfa-state->multi-state nfa state)
- (nfa-multi-state-add! (make-nfa-multi-state nfa) state))
-
-(define (nfa-multi-state-copy mst)
- (let ((res (make-vector (vector-length mst))))
- (do ((i (- (vector-length mst) 1) (- i 1)))
- ((< i 0) res)
- (vector-set! res i (vector-ref mst i)))))
-
-(define (nfa-multi-state-contains? mst i)
- (let ((cell (quotient i 24))
- (bit (remainder i 24)))
- (not (zero? (bit-and (vector-ref mst cell) (bit-shl 1 bit))))))
-
-(define (nfa-multi-state-contains-only? mst i)
- (let ((cell (quotient i 24))
- (bit (remainder i 24)))
- (= (vector-ref mst cell) (bit-shl 1 bit))))
-
-(define (nfa-multi-state-add! mst i)
- (let ((cell (quotient i 24))
- (bit (remainder i 24)))
- (vector-set! mst cell (bit-ior (vector-ref mst cell) (bit-shl 1 bit)))
+ (let ((mst (make-vector (+ (nfa-num-states nfa) 3) #f)))
+ (vector-set! mst 0 (make-vector (nfa-num-tags nfa) '())) ; tag summary
+ (vector-set! mst 1 0) ; total number of states
+ (vector-set! mst 2 0) ; states and tags hash
mst))
-(define (nfa-multi-state-add mst i)
- (nfa-multi-state-add! (nfa-multi-state-copy mst) i))
+;; NOTE: This doesn't do a deep copy of the mappings. Don't mutate them!
+(define (nfa-multi-state-copy mst)
+ (let ((v (vector-copy mst)))
+ (vector-set! v 0 (vector-copy (vector-ref mst 0)))
+ v))
-(define (nfa-multi-state-union! a b)
- (do ((i (- (vector-length a) 1) (- i 1)))
- ((< i 0) a)
- (vector-set! a i (bit-ior (vector-ref a i) (vector-ref b i)))))
+(define (nfa-state->multi-state nfa state mappings)
+ (let ((mst (make-nfa-multi-state nfa)))
+ (nfa-multi-state-add! nfa mst state mappings)
+ mst))
-(define (nfa-multi-state-union a b)
- (nfa-multi-state-union! (nfa-multi-state-copy a) b))
+;; Extend multi-state with a state and add its tag->slot mappings.
+(define (nfa-multi-state-add! nfa mst state mappings)
+ (let ((hash-value (vector-ref mst 2)))
+ (cond ((not (vector-ref mst (+ state 3))) ; Update state hash & count?
+ (set! hash-value (+ (vector-ref mst 2) state))
+ (vector-set! mst 1 (+ (vector-ref mst 1) 1))))
+ (vector-set! mst (+ state 3) mappings)
+ (let ((all-mappings (vector-ref mst 0)))
+ (for-each
+ (lambda (tag&slot)
+ (let* ((t (car tag&slot))
+ (s (cdr tag&slot))
+ (m (vector-ref all-mappings t)))
+ (cond ((not (memv s m))
+ (set! hash-value (+ hash-value t))
+ (vector-set! all-mappings t (cons s m))))))
+ mappings))
+ (vector-set! mst 2 (modulo hash-value (nfa-num-states nfa)))))
+
+;; Same as above, but skip updating mappings summary.
+;; Called when we know all the tag->slot mappings are already in the summary.
+(define (nfa-multi-state-add/fast! nfa mst state mappings)
+ (cond ((not (vector-ref mst (+ state 3))) ; Update state hash & count?
+ (vector-set! mst 2 (modulo (+ (vector-ref mst 2) state)
+ (nfa-num-states nfa)))
+ (vector-set! mst 1 (+ (vector-ref mst 1) 1))))
+ (vector-set! mst (+ state 3) mappings))
+
+;; Same as above, assigning a new slot for a tag. This slot is then
+;; added to the summary, if it isn't in there yet. This is more efficient
+;; than looping through all the mappings.
+(define (nfa-multi-state-add-tagged! nfa mst state mappings tag slot)
+ (let* ((mappings-summary (vector-ref mst 0))
+ (summary-tag-slots (vector-ref mappings-summary tag))
+ (new-mappings (let lp ((m mappings)
+ (res '()))
+ (cond ((null? m) (cons (cons tag slot) res))
+ ((= (caar m) tag)
+ (append res (cons (cons tag slot) (cdr m))))
+ (else (lp (cdr m) (cons (car m) res))))))
+ (hash-value (vector-ref mst 2)))
+ (cond ((not (vector-ref mst (+ state 3))) ; Update state hash & count?
+ (set! hash-value (+ hash-value state))
+ (vector-set! mst 1 (+ (vector-ref mst 1) 1))))
+ (vector-set! mst (+ state 3) new-mappings)
+ (cond ((not (memv slot summary-tag-slots)) ; Update tag/slot summary
+ (set! hash-value (+ hash-value tag))
+ (vector-set! mappings-summary tag (cons slot summary-tag-slots))))
+ (vector-set! mst 2 (modulo hash-value (nfa-num-states nfa)))
+ new-mappings))
+
+(define (nfa-multi-state-same-states? a b)
+ ;; First check if hash and state counts match, then check each state
+ (and (= (vector-ref a 2) (vector-ref b 2))
+ (= (vector-ref a 1) (vector-ref b 1))
+ (let ((len (vector-length a)))
+ (let lp ((i 3))
+ (or (= i len)
+ (and (equal? (not (vector-ref a i))
+ (not (vector-ref b i)))
+ (lp (+ i 1))))))))
(define (nfa-multi-state-fold mst kons knil)
(let ((limit (vector-length mst)))
- (let lp1 ((i 0)
- (acc knil))
- (if (>= i limit)
+ (let lp ((i 3)
+ (acc knil))
+ (if (= i limit)
acc
- (let lp2 ((n (vector-ref mst i))
- (acc acc))
- (if (zero? n)
- (lp1 (+ i 1) acc)
- (let* ((n2 (bit-and n (- n 1)))
- (n-tail (- n n2))
- (bit (+ (* i 24) (integer-log n-tail))))
- (lp2 n2 (kons bit acc)))))))))
+ (let ((m (vector-ref mst i)))
+ (lp (+ i 1) (if m (kons (- i 3) m acc) acc)))))))
+
+;; Find the lowest fresh index for this tag that's unused
+;; in the multi-state. This also updates the nfa's highest
+;; tag counter if a completely new slot number was assigned.
+(define (next-index-for-tag! nfa tag mst)
+ (let* ((highest (nfa-highest-map-index nfa))
+ (tag-slots (vector-ref (vector-ref mst 0) tag))
+ (new-index (do ((slot 0 (+ slot 1)))
+ ((not (memv slot tag-slots)) slot))))
+ (cond ((> new-index highest)
+ (nfa-set-highest-map-index! nfa new-index)))
+ new-index))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; NFA->DFA compilation
-;;
+;;;; tNFA->DFA compilation
;; During processing, the DFA is a list of the form:
;;
-;; ((NFA-states ...) accepting-state? transitions ...)
+;; ((annotated-tNFA-states ...) finalizer transitions ...)
;;
;; where the transitions are as in the NFA, except there are no
;; epsilons, duplicate characters or overlapping char-set ranges, and
;; the states moved to are closures (sets of NFA states). Multiple
-;; DFA states may be accepting states.
+;; DFA states may be accepting states. If the state is an accepting state,
+;; the finalizer is a list of (tag . memory-slot) retrieval commands.
+;; tNFA-states are annotated with mappings which store the tag values of
+;; memory slots, if any. There is always at most one slot for a tag.
+;;
+;; The DFA itself simulates a NFA by representing all the simultaneous
+;; states the NFA can be in at any given point in time as one DFA state.
+;; The tag values are ambiguous since each NFA transition can set a tag.
+;; To solve this we keep a bank of memory slots around which tracks tag
+;; values for each distinct path through the NFA.
+;;
+;; Once we get to a final state we can pluck the tag values from the
+;; memory slots corresponding to the path through which the NFA could have
+;; reached the final state. To resolve ambiguities, states are assigned
+;; priorities, and the path to the final state is chosen correspondingly.
+;;
+;; For a more detailed explanation about this process, see
+;; Ville Laurikari; ``NFAs with Tagged Transitions, their Conversion to
+;; Deterministic Automata and Application to Regular Expressions'' (2000).
+;; Laurikari also wrote a master's thesis about this approach which is
+;; less terse but the algorithms are not exactly the same.
+;; ``Efficient submatch addressing for regular expressions'' (2001).
+;; This implementation follows the 2000 paper where they differ.
(define (nfa->dfa nfa . o)
- (let ((max-states (and (pair? o) (car o))))
- (let lp ((ls (list (nfa-cache-state-closure! nfa (nfa-start-state nfa))))
- (i 0)
- (res '()))
+ (let* ((max-states (and (pair? o) (car o)))
+ (start (nfa-state->multi-state nfa (nfa-start-state nfa) '()))
+ (start-closure (nfa-epsilon-closure nfa start))
+ ;; Set up a special "initializer" state from which we reach the
+ ;; start-closure to ensure that leading tags are set properly.
+ (init-set (tag-set-commands-for-closure nfa start start-closure '()))
+ (dummy (make-nfa-multi-state nfa))
+ (init-state (list dummy #f `((,start-closure #f () . ,init-set)))))
+ ;; Unmarked states are just sets of NFA states with tag-maps, marked states
+ ;; are sets of NFA states with transitions to sets of NFA states
+ (let lp ((unmarked-states (list start-closure))
+ (marked-states (list init-state))
+ (dfa-size 0))
(cond
- ((null? ls)
- (dfa-renumber nfa (reverse res)))
- ((assoc (car ls) res) ;; already seen this combination of states
- (lp (cdr ls) i res))
- ((and max-states (> i max-states)) ;; too many DFA states
+ ((null? unmarked-states)
+ ;; Abuse finalizer slot for storing the number of memory slots we need
+ (set-car! (cdr init-state) (+ (nfa-highest-map-index nfa) 1))
+ (dfa-renumber (reverse marked-states)))
+ ((and max-states (> dfa-size max-states)) ; Too many DFA states
#f)
+ ((assoc (car unmarked-states) marked-states) ; Seen set of NFA-states?
+ (lp (cdr unmarked-states) marked-states dfa-size))
(else
- (let* ((states (car ls))
- (trans (nfa-state-transitions nfa states))
- (accept? (and (nfa-multi-state-contains? states 0) #t)))
- (lp (append (map cdr trans) (cdr ls))
- (+ i 1)
- `((,states ,accept? ,@trans) ,@res))))))))
-
-;; When the conversion is complete we renumber the DFA sets-of-states
-;; in order and convert the result to a vector for fast lookup.
-;; Charsets containing single characters are converted to those characters
-;; for quick matching of the literal parts in a regex.
-(define (dfa-renumber nfa dfa)
- (let* ((len (length dfa))
- (states (make-vector (nfa-num-states nfa) '()))
- (res (make-vector len)))
- (define (renumber mst)
- (cdr (assoc mst (vector-ref states (nfa-multi-state-hash nfa mst)))))
- (let lp ((ls dfa) (i 0))
- (cond ((pair? ls)
- (let ((j (nfa-multi-state-hash nfa (caar ls))))
- (vector-set! states j (cons (cons (caar ls) i)
- (vector-ref states j))))
- (lp (cdr ls) (+ i 1)))))
- (let lp ((ls dfa) (i 0))
- (cond ((pair? ls)
- (for-each
- (lambda (x)
- (set-car! x (maybe-cset->char (car x)))
- (set-cdr! x (renumber (cdr x))))
- (cddar ls))
- (vector-set! res i (cdar ls))
- (lp (cdr ls) (+ i 1)))))
- res))
+ (let ((dfa-state (car unmarked-states)))
+ (let lp2 ((trans (get-distinct-transitions nfa dfa-state))
+ (unmarked-states (cdr unmarked-states))
+ (dfa-trans '()))
+ (if (null? trans)
+ (let ((finalizer (nfa-state-mappings dfa-state 0)))
+ (lp unmarked-states
+ (cons (list dfa-state finalizer dfa-trans) marked-states)
+ (+ dfa-size 1)))
+ (let* ((closure (nfa-epsilon-closure nfa (cdar trans)))
+ (reordered (find-reorder-commands nfa closure marked-states))
+ (copy-cmds (if reordered (cdr reordered) '()))
+ ;; Laurikari doesn't mention what "k" is, but it seems it
+ ;; must be the mappings of the state's reach
+ (set-cmds (tag-set-commands-for-closure nfa (cdar trans) closure copy-cmds))
+ (trans-closure (if reordered (car reordered) closure)))
+ (lp2 (cdr trans)
+ (if reordered
+ unmarked-states
+ (cons trans-closure unmarked-states))
+ (cons `(,trans-closure ,(caar trans) ,copy-cmds . ,set-cmds)
+ dfa-trans)))))))))))
+
+(define (dfa-renumber states)
+ (let ((indexes (let lp ((i 0) (states states) (indexes '()))
+ (if (null? states)
+ indexes
+ (lp (+ i 1) (cdr states)
+ (cons (cons (caar states) i) indexes)))))
+ (dfa (make-vector (length states))))
+ (do ((i 0 (+ i 1))
+ (states states (cdr states)))
+ ((null? states) dfa)
+ (let ((maybe-finalizer (cadar states))
+ (transitions (caddar states)))
+ (vector-set!
+ dfa i
+ (cons maybe-finalizer
+ (map (lambda (tr)
+ `(,(and (cadr tr) (maybe-cset->char (cadr tr)))
+ ,(cdr (assoc (car tr) indexes)) . ,(cddr tr)))
+ transitions)))))))
;; Extract all distinct ranges and the potential states they can transition
;; to from a given set of states. Any ranges that would overlap with
;; distinct characters are split accordingly.
-(define (nfa-state-transitions nfa states)
- (let ((res (nfa-multi-state-fold
- states
- (lambda (st res)
- (let ((trans (nfa-get-state-trans nfa st)))
- (if (null? trans)
- res
- (nfa-join-transitions! nfa res (car trans) (cdr trans)))))
- '())))
- (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res)
- res))
-(define (nfa-join-transitions! nfa existing elt state)
+;; This function is like "reach" in Laurikari's papers, but for each
+;; possible distinct range of characters rather than per character.
+(define (get-distinct-transitions nfa annotated-states)
(define (csets-intersect? a b)
(let ((i (cset-intersection a b)))
(and (not (cset-empty? i)) i)))
- (let lp ((ls existing) (res '()))
- (cond
- ((null? ls)
- (cond
- ;; First try to find a group that includes *only* this state.
- ;; TRICKY!: If it contains other states too, we will end up in trouble
- ;; later on if the group needs to be broken up because of overlapping
- ;; csets, since then you don't know what parts of the overlap "belong"
- ;; to the state we are about to add or the one that was already there.
- ((find (lambda (x) (nfa-multi-state-contains-only? (cdr x) state)) existing) =>
- (lambda (existing-state) ; If found, merge charsets with it
- (set-car! existing-state (cset-union (car existing-state) elt))
- existing))
- ;; State not seen yet? Add a new state transition
- (else (cons (cons elt (nfa-state->multi-state nfa state)) existing))))
- ((cset=? elt (caar ls)) ; Add state to existing set for this charset
- (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
- existing)
- ((csets-intersect? elt (caar ls)) => ; overlapping charset, but diff state
- (lambda (intersection)
- (let* ((only-in-old (cset-difference (caar ls) elt))
- (states-for-old (and (not (cset-empty? only-in-old))
- (nfa-multi-state-copy (cdar ls))))
- (result (if states-for-old
- (cons (cons only-in-old states-for-old)
- (append res (cdr ls)))
- (append res (cdr ls))))
- (only-in-new (cset-difference elt (caar ls))))
- ;; Add this state to the states already here and restrict to
- ;; the overlapping charset
- (set-car! (car ls) intersection)
- (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
- ;; Continue with the remaining subset of the new cset (if nonempty)
- (cons (car ls)
- (if (cset-empty? only-in-new)
- result
- (nfa-join-transitions! nfa result only-in-new state))))))
- (else
- (lp (cdr ls) (cons (car ls) res))))))
-
-(define (nfa-cache-state-closure! nfa state)
- (let ((cached (nfa-get-state-closure nfa state)))
- (cond
- ((not (null? cached))
- cached)
- (else
- (let ((res (nfa-state-closure-internal nfa state)))
- (nfa-set-state-closure! nfa state res)
- res)))))
-
-;; The `closure' of a list of NFA states - all states that can be
-;; reached from any of them using any number of epsilon transitions.
-(define (nfa-state-closure-internal nfa state)
- (let lp ((ls (list state))
- (res (make-nfa-multi-state nfa)))
- (cond
- ((null? ls)
- res)
- ((nfa-multi-state-contains? res (car ls))
- (lp (cdr ls) res))
- (else
- ;; Ignore any epsilon tags for now
- (lp (append (map car (nfa-get-epsilons nfa (car ls))) (cdr ls))
- (nfa-multi-state-add! res (car ls)))))))
-
-(define (nfa-closure-internal nfa states)
(nfa-multi-state-fold
- states
- (lambda (st res)
- (nfa-multi-state-union! res (nfa-cache-state-closure! nfa st)))
- (make-nfa-multi-state nfa)))
+ annotated-states
+ (lambda (st mappings res)
+ (let ((trans (nfa-get-state-trans nfa st))) ; Always one state per trans
+ (if (null? trans)
+ res
+ (let lp ((ls res) (cs (car trans)) (state (cdr trans)) (res '()))
+ (cond
+ ;; State not seen yet? Add a new state transition
+ ((null? ls)
+ ;; TODO: We should try to find an existing DFA state with only
+ ;; this NFA state in it, and extend the cset with the current one.
+ ;; This produces smaller DFAs, but takes longer to compile.
+ (cons (cons cs (nfa-state->multi-state nfa state mappings))
+ res))
+ ((cset=? cs (caar ls)) ; Add state to existing set for this charset
+ (nfa-multi-state-add! nfa (cdar ls) state mappings)
+ (append ls res))
+ ((csets-intersect? cs (caar ls)) =>
+ (lambda (intersection)
+ (let* ((only-in-new (cset-difference cs (caar ls)))
+ (only-in-old (cset-difference (caar ls) cs))
+ (states-in-both (cdar ls))
+ (states-for-old (and (not (cset-empty? only-in-old))
+ (nfa-multi-state-copy states-in-both)))
+ (res (if states-for-old
+ (cons (cons only-in-old states-for-old) res)
+ res)))
+ (nfa-multi-state-add! nfa states-in-both state mappings)
+ ;; Add this state to the states already here and restrict to
+ ;; the overlapping charset and continue with the remaining subset
+ ;; of the new cset (if nonempty)
+ (if (cset-empty? only-in-new)
+ (cons (cons intersection states-in-both)
+ (append (cdr ls) res))
+ (lp (cdr ls) only-in-new state
+ (cons (cons intersection states-in-both) res))))))
+ (else
+ (lp (cdr ls) cs state (cons (car ls) res))))))))
+ '()))
+
+;; The epsilon-closure of a set of states is all the states reachable
+;; through epsilon transitions, with the tags encountered on the way.
+(define (nfa-epsilon-closure-internal nfa annotated-states)
+ ;; The stack _MUST_ be in this order for some reason I don't fully understand
+ (let lp ((stack (nfa-multi-state-fold annotated-states
+ (lambda (st m res)
+ (cons (cons st m) res))
+ '()))
+ (priorities (make-vector (nfa-num-states nfa) 0))
+ (closure (nfa-multi-state-copy annotated-states)))
+ (if (null? stack)
+ closure
+ (let ((prio/orig-state (caar stack)) ; priority is just the state nr.
+ (mappings (cdar stack)))
+ (let lp2 ((trans (nfa-get-epsilons nfa prio/orig-state))
+ (stack (cdr stack)))
+ (if (null? trans)
+ (lp stack priorities closure)
+ (let ((state (caar trans)))
+ (cond
+ ;; Our priorities are inverted because we start at
+ ;; the highest state number and go downwards to 0.
+ ((> prio/orig-state (vector-ref priorities state))
+ (vector-set! priorities state prio/orig-state)
+ (cond
+ ((cdar trans) => ; tagged transition?
+ (lambda (tag)
+ (let* ((index (next-index-for-tag! nfa tag closure))
+ (new-mappings (nfa-multi-state-add-tagged!
+ nfa closure state mappings tag index)))
+ (lp2 (cdr trans) (cons (cons state new-mappings) stack)))))
+ (else
+ (nfa-multi-state-add/fast! nfa closure state mappings)
+ (lp2 (cdr trans) (cons (cons state mappings) stack)))))
+ (else (lp2 (cdr trans) stack))))))))))
+
-(define (nfa-closure nfa states)
+(define (nfa-epsilon-closure nfa states)
(or (nfa-get-closure nfa states)
- (let ((res (nfa-closure-internal nfa states)))
+ (let ((res (nfa-epsilon-closure-internal nfa states)))
(nfa-add-closure! nfa states res)
res)))
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Match Extraction
-;;
-;; DFAs don't give us match information, so once we match and
-;; determine the start and end, we need to recursively break the
-;; problem into smaller DFAs to get each submatch.
-;;
-;; See http://compilers.iecc.com/comparch/article/07-10-026
-
-(define (match-vector-ref v i) (vector-ref v (+ 3 i)))
-
-(define (match-vector-set! v i x) (vector-set! v (+ 3 i) x))
+;; Generate "set" commands for all tags in the closure that are
+;; not present in the original state.
+(define (tag-set-commands-for-closure nfa orig-state closure copy-cmds)
+ (let ((num-tags (nfa-num-tags nfa))
+ (closure-summary (nfa-multi-state-mappings-summary closure))
+ (state-summary (nfa-multi-state-mappings-summary orig-state)))
+ (let lp ((t 0) (cmds '()))
+ (if (= t num-tags)
+ cmds
+ (let lp2 ((s1 (vector-ref closure-summary t))
+ (s2 (vector-ref state-summary t))
+ (cmds cmds))
+ (cond ((null? s1) (lp (+ t 1) cmds))
+ ((or (memv (car s1) s2) ; Tag in original state?
+ ;; Try to avoid generating set-commands for any slots
+ ;; that will be overwritten by copy commands, but only
+ ;; if that slot isn't copied to another slot.
+ (and (not (null? copy-cmds)) ; null check for performance
+ ;; Look for copy command overwriting this tag-slot
+ (any (lambda (c)
+ (and (= (vector-ref c 0) t)
+ (= (vector-ref c 2) (car s1))))
+ copy-cmds)
+ ;; Ensure it's not copied to another slot before
+ ;; discarding the set-command.
+ (not (any (lambda (c)
+ (and (= (vector-ref c 0) t)
+ (= (vector-ref c 1) (car s1))))
+ copy-cmds))))
+ (lp2 (cdr s1) s2 cmds))
+ (else (lp2 (cdr s1) s2
+ (cons (cons t (car s1)) cmds)))))))))
+
+;; Look in dfa-states for an already existing state which matches
+;; closure, but has different tag value mappings.
+;; If found, calculate reordering commands so we can map the closure
+;; to that state instead of adding a new DFA state.
+;; This is completely handwaved away in Laurikari's paper (it basically
+;; says "insert reordering algorithm here"), so this code was constructed
+;; after some experimentation. In other words, bugs be here.
+(define (find-reorder-commands-internal nfa closure dfa-states)
+ (let ((num-states (nfa-num-states nfa))
+ (num-tags (nfa-num-tags nfa))
+ (closure-summary (nfa-multi-state-mappings-summary closure)))
+ (let lp ((dfa-states dfa-states))
+ (if (null? dfa-states)
+ #f
+ (if (not (nfa-multi-state-same-states? (caar dfa-states) closure))
+ (lp (cdr dfa-states))
+ (let lp2 ((state-summary (nfa-multi-state-mappings-summary
+ (caar dfa-states)))
+ (t 0) (cmds '()))
+ (if (= t num-tags)
+ (cons (caar dfa-states) cmds)
+ (let lp3 ((closure-slots (vector-ref closure-summary t))
+ (state-slots (vector-ref state-summary t))
+ (cmds cmds))
+ (cond ((null? closure-slots)
+ (if (null? state-slots)
+ (lp2 state-summary (+ t 1) cmds)
+ (lp (cdr dfa-states))))
+ ((null? state-slots) (lp (cdr dfa-states)))
+ (else (lp3 (cdr closure-slots)
+ (cdr state-slots)
+ (if (= (car closure-slots) (car state-slots))
+ cmds
+ (cons (vector t (car closure-slots) (car state-slots))
+ cmds)))))))))))))
+
+(define (find-reorder-commands nfa closure dfa-states)
+ (or (nfa-get-reorder-commands nfa closure)
+ (let ((res (find-reorder-commands-internal nfa closure dfa-states)))
+ (nfa-set-reorder-commands! nfa closure res)
+ res)))
-(define (sre-match-extractor sre num-submatches)
- (let* ((tmp (+ num-submatches 1))
- (tmp-end-src-offset (+ 2 (* tmp 4)))
- (tmp-end-index-offset (+ 3 (* tmp 4))))
- (let lp ((sre sre) (n 1) (submatch-deps? #f))
- (cond
- ((not (sre-has-submatches? sre))
- (if (not submatch-deps?)
- (lambda (cnk start i end j matches) #t)
- (let ((dfa (nfa->dfa (sre->nfa sre ~none))))
- (lambda (cnk start i end j matches)
- (dfa-match/longest dfa cnk start i end j matches tmp)))))
- ((pair? sre)
- (case (car sre)
- ((: seq)
- (let* ((right (sre-sequence (cddr sre)))
- (match-left (lp (cadr sre) n #t))
- (match-right
- (lp right (+ n (sre-count-submatches (cadr sre))) #t)))
- (lambda (cnk start i end j matches)
- (let lp1 ((end2 end) (j2 j) (best-src #f) (best-index #f))
- (let ((limit (if (eq? start end2)
- i
- ((chunker-get-start cnk) end2))))
- (let lp2 ((k j2) (best-src best-src) (best-index best-index))
- (if (< k limit)
- (cond
- ((not (eq? start end2))
- (let ((prev (chunker-prev-chunk cnk start end2)))
- (lp1 prev
- ((chunker-get-end cnk) prev)
- best-src
- best-index)))
- (best-src
- (match-vector-set! matches tmp-end-src-offset best-src)
- (match-vector-set! matches tmp-end-index-offset best-index)
- #t)
- (else
- #f))
- (if (and (match-left cnk start i end2 k matches)
- (eq? end2 (match-vector-ref matches
- tmp-end-src-offset))
- (eqv? k (match-vector-ref matches
- tmp-end-index-offset))
- (match-right cnk end2 k end j matches))
- (let ((right-src
- (match-vector-ref matches tmp-end-src-offset))
- (right
- (match-vector-ref matches tmp-end-index-offset)))
- (cond
- ((and (eq? end right-src) (eqv? j right))
- (match-vector-set! matches tmp-end-src-offset end)
- (match-vector-set! matches tmp-end-index-offset j)
- #t)
- ((or (not best-src)
- (if (eq? best-src right-src)
- (> right best-index)
- (chunk-before? cnk
- best-src
- right-src)))
- (lp2 (- k 1) right-src right))
- (else
- (lp2 (- k 1) best-src best-index))))
- (lp2 (- k 1) best-src best-index)))))))))
- ((or)
- (if (null? (cdr sre))
- (lambda (cnk start i end j matches) #f)
- (let* ((rest (sre-alternate (cddr sre)))
- (match-first
- (lp (cadr sre) n #t))
- (match-rest
- (lp rest
- (+ n (sre-count-submatches (cadr sre)))
- submatch-deps?)))
- (lambda (cnk start i end j matches)
- (or (and (match-first cnk start i end j matches)
- (eq? end (match-vector-ref matches tmp-end-src-offset))
- (eqv? j (match-vector-ref matches tmp-end-index-offset)))
- (match-rest cnk start i end j matches))))))
- ((* +)
- (letrec ((match-once
- (lp (sre-sequence (cdr sre)) n #t))
- (match-all
- (lambda (cnk start i end j matches)
- (if (match-once cnk start i end j matches)
- (let ((src (match-vector-ref matches tmp-end-src-offset))
- (k (match-vector-ref matches tmp-end-index-offset)))
- (if (and src (or (not (eq? start src)) (< i k)))
- (match-all cnk src k end j matches)
- #t))
- (begin
- (match-vector-set! matches tmp-end-src-offset start)
- (match-vector-set! matches tmp-end-index-offset i)
- #t)))))
- (if (eq? '* (car sre))
- match-all
- (lambda (cnk start i end j matches)
- (and (match-once cnk start i end j matches)
- (let ((src (match-vector-ref matches tmp-end-src-offset))
- (k (match-vector-ref matches tmp-end-index-offset)))
- (match-all cnk src k end j matches)))))))
- ((?)
- (let ((match-once (lp (sre-sequence (cdr sre)) n #t)))
- (lambda (cnk start i end j matches)
- (cond
- ((match-once cnk start i end j matches)
- #t)
- (else
- (match-vector-set! matches tmp-end-src-offset start)
- (match-vector-set! matches tmp-end-index-offset i)
- #t)))))
- (($ submatch => submatch-named)
- (let ((match-one
- (lp (sre-sequence (if (memq (car sre) '($ submatch))
- (cdr sre)
- (cddr sre)))
- (+ n 1)
- #t))
- (start-src-offset (* n 4))
- (start-index-offset (+ 1 (* n 4)))
- (end-src-offset (+ 2 (* n 4)))
- (end-index-offset (+ 3 (* n 4))))
- (lambda (cnk start i end j matches)
- (cond
- ((match-one cnk start i end j matches)
- (match-vector-set! matches start-src-offset start)
- (match-vector-set! matches start-index-offset i)
- (match-vector-set! matches end-src-offset
- (match-vector-ref matches tmp-end-src-offset))
- (match-vector-set! matches end-index-offset
- (match-vector-ref matches tmp-end-index-offset))
- #t)
- (else
- #f)))))
- (else
- (%irregex-error "unknown regexp operator" (car sre)))))
- (else
- (%irregex-error "unknown regexp" sre))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; Closure Compilation
diff --git a/irregex.scm b/irregex.scm
index 9349bcf..659d874 100644
--- a/irregex.scm
+++ b/irregex.scm
@@ -142,9 +142,9 @@
(define-compiler-syntax make-irregex
(syntax-rules ()
- ((_ dfa dfa/search dfa/extract nfa flags submatches lengths names)
+ ((_ dfa dfa/search nfa flags submatches lengths names)
(##sys#make-structure
- 'regexp dfa dfa/search dfa/extract nfa flags submatches lengths names))))
+ 'regexp dfa dfa/search nfa flags submatches lengths names))))
(define-compiler-syntax make-irregex-match
(syntax-rules ()
@@ -208,6 +208,13 @@
((_ m n end)
(vector-set! (##sys#slot m 1) (+ 3 (* n 4)) end))))
+(define-compiler-syntax irregex-match-chunk&index-from-tag-set!
+ (syntax-rules ()
+ ((_ m t chunk index)
+ (begin
+ (vector-set! (##sys#slot m 1) (+ 4 (* t 2)) chunk)
+ (vector-set! (##sys#slot m 1) (+ 5 (* t 2)) index)))))
+
(define-compiler-syntax %irregex-error
(syntax-rules ()
((_ args ...)
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index fd2cb97..cef431c 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -276,8 +276,8 @@
(test-group "predicates"
(test-assert (irregex? (irregex "a.*b")))
(test-assert (irregex? (irregex '(: "a" (* any) "b"))))
- (test-assert (not (irregex? (vector '*irregex-tag* #f #f #f #f #f #f #f))))
- (test-assert (not (irregex? (vector #f #f #f #f #f #f #f #f #f))))
+ (test-assert (not (irregex? (vector '*irregex-tag* #f #f #f #f #f #f))))
+ (test-assert (not (irregex? (vector #f #f #f #f #f #f #f #f))))
(test-assert (irregex-match-data? (irregex-search "a.*b" "axxxb")))
(test-assert (irregex-match-data? (irregex-match "a.*b" "axxxb")))
(test-assert (not (irregex-match-data? (vector '*irregex-match-tag* #f #f #f #f #f #f #f #f #f))))
diff --git a/types.db b/types.db
index 84dbab0..a9a8791 100644
--- a/types.db
+++ b/types.db
@@ -1300,23 +1300,31 @@
(irregex-dfa (#(procedure #:clean #:enforce) irregex-dfa ((struct regexp)) *)
(((struct regexp)) (##sys#slot #(1) '1)))
-(irregex-dfa/extract (#(procedure #:clean #:enforce) irregex-dfa/extract ((struct regexp)) *)
- (((struct regexp)) (##sys#slot #(1) '3)))
-
(irregex-dfa/search (#(procedure #:clean #:enforce) irregex-dfa/search ((struct regexp)) *)
(((struct regexp)) (##sys#slot #(1) '2)))
-(irregex-extract (#(procedure #:enforce) irregex-extract (* string #!optional fixnum fixnum) list)) ;XXX specialize?
+(irregex-nfa (#(procedure #:clean #:enforce) irregex-nfa ((struct regexp)) *)
+ (((struct regexp)) (##sys#slot #(1) '3)))
+
(irregex-flags (#(procedure #:clean #:enforce) irregex-flags ((struct regexp)) *)
- (((struct regexp)) (##sys#slot #(1) '5)))
+ (((struct regexp)) (##sys#slot #(1) '4)))
+
+(irregex-num-submatches (#(procedure #:clean #:enforce) irregex-num-submatches ((struct regexp))
+ fixnum)
+ (((struct regexp)) (##sys#slot #(1) '5)))
+
+(irregex-lengths (#(procedure #:clean #:enforce) irregex-lengths ((struct regexp)) *)
+ (((struct regexp)) (##sys#slot #(1) '6)))
+
+(irregex-names (#(procedure #:clean #:enforce) irregex-names ((struct regexp)) *)
+ (((struct regexp)) (##sys#slot #(1) '7)))
+
+(irregex-extract (#(procedure #:enforce) irregex-extract (* string #!optional fixnum fixnum) list)) ;XXX specialize?
(irregex-fold (#(procedure #:enforce) irregex-fold (* (procedure (fixnum (struct regexp-match) *) *) * string #!optional (procedure (fixnum *) *) fixnum fixnum) *))
(irregex-fold/chunked (#(procedure #:enforce) irregex-fold/chunked (* (procedure (* fixnum (struct regexp-match) *) *) * procedure * #!optional (procedure (* fixnum *) *) fixnum fixnum) *))
-(irregex-lengths (#(procedure #:clean #:enforce) irregex-lengths ((struct regexp)) *)
- (((struct regexp)) (##sys#slot #(1) '7)))
-
(irregex-match (#(procedure #:enforce) irregex-match (* string #!optional fixnum fixnum) *))
;irregex-match?
@@ -1338,18 +1346,8 @@
(irregex-match-substring (#(procedure) irregex-match-substring (* #!optional *) *))
(irregex-match/chunked (#(procedure #:enforce) irregex-match/chunked (* * * #!optional fixnum) *))
-(irregex-names (#(procedure #:clean #:enforce) irregex-names ((struct regexp)) *)
- (((struct regexp)) (##sys#slot #(1) '8)))
-
(irregex-new-matches (procedure irregex-new-matches (*) *))
-(irregex-nfa (#(procedure #:clean #:enforce) irregex-nfa ((struct regexp)) *)
- (((struct regexp)) (##sys#slot #(1) '4)))
-
-(irregex-num-submatches (#(procedure #:clean #:enforce) irregex-num-submatches ((struct regexp))
- fixnum)
- (((struct regexp)) (##sys#slot #(1) '6)))
-
(irregex-opt (#(procedure #:enforce) irregex-opt (list) *))
(irregex-quote (#(procedure #:enforce) irregex-quote (string) string))
(irregex-replace (#(procedure #:enforce) irregex-replace (* string #!rest) string))
--
1.7.9.1