help-smalltalk
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Help-smalltalk] [rfc] regex rewrite


From: Paolo Bonzini
Subject: [Help-smalltalk] [rfc] regex rewrite
Date: Fri, 27 May 2005 13:57:37 +0200
User-agent: Mozilla Thunderbird 0.9 (Macintosh/20041103)

This patch is actually bigger than I expected. :-)

Basically, matching with #searchRegex:from:to:, #searchRegex:startingAt:
and #searchRegex: now returns a full fledged RegexResults object or nil
if the matching did not succeed.  I have not provided yet the 'ifAbsent:'
variants.  If you do not care about the subexpressions, use #indexOfRegex:
and friends, which will return an Interval as before.

Most methods previously did the matching twice, in practice.  First,
they found the starting position of the match; then, they called re_match
to find the length of the match.  This sucks, obviously, so I rewrote
all the methods.  I am using the internal #searchRegexInternal:from:to:
method (which is what Mike called #searchRegexFull:from:to:) from all
the methods.  I could have used #indexOfRegex: as well, but this avoids
the overhead of creating an Interval.

There is a contextual modification to Interval, making it more compatible
with VisualWorks as well.  Interval can return a #first and #last even
if the Interval is empty.  These are the start and stop object that it
was created with.  The private methods #start, #stop and #step have thus
been superseded by #first, #last and #increment.

You'll note that this patch already uses the new pragma-like syntax
for C call-outs.  Unfortunately the arch repository is down currently,
I'll have to write to the guys at sourcecontrol.net.

I was about to make the #copyReplacingRegex:with: methods use
#bindWithArguments: but I decided that this patch was already big enough.
What do you think about this additional change?

Now, all that remains before this stuff can be made ``more'' official,
is writing some SUnit tests...  I'll get round to it, but if somebody
wants to help, patches are welcome as usual.

Paolo

2005-05-27  Paolo Bonzini  <address@hidden>
            Mike Anderson  <address@hidden>

        * kernel/Interval.st: Add #first, #last, #increment.

        * examples/re.c (): Return a struct pre_registers * from reh_search.
        New reh_free_registers function, register it.
        * examples/regex.st: New class CRegexRegisters, modify all the methods
        to use it.  Name the C call-out method #searchRegexInternal:from:to:.
        New class RegexResults, return it from #searchRegex:from:to:,
        #searchRegex:startingAt:, #searchRegex:.


--- orig/examples/re.c
+++ mod/examples/re.c
@@ -71,8 +71,10 @@ static void markRegexAsMRU (int i);
 /* Functions exported to Smalltalk */
 static OOP reh_make_cacheable (OOP patternOOP);
 
-static int reh_search (OOP srcOOP, OOP patternOOP, int from, int to),
-reh_match (OOP srcOOP, OOP patternOOP, int from, int to);
+static struct pre_registers *reh_search (OOP srcOOP, OOP patternOOP,
+                                        int from, int to);
+static int reh_match (OOP srcOOP, OOP patternOOP, int from, int to);
+static void reh_free_registers(struct pre_registers *regs);
 
 static RegexCacheEntry cache[REGEX_CACHE_SIZE];
 
@@ -230,26 +232,35 @@ reh_make_cacheable (OOP patternOOP)
 }
 
 /* Search helper function */
-int
+struct pre_registers *
 reh_search (OOP srcOOP, OOP patternOOP, int from, int to)
 {
   int res = 0;
   const char *src;
   struct pre_pattern_buffer *regex;
+  struct pre_registers *regs;
   RegexCaching caching;
 
   caching = lookupRegex (patternOOP, &regex);
   if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
-    return -100;
+    return NULL;
 
   /* now search */
   src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
-  res = pre_search (regex, src, to, from - 1, to - from + 1, NULL);
+  regs = (struct pre_registers *) calloc (1, sizeof (struct pre_registers));
+  res = pre_search (regex, src, to, from - 1, to - from + 1, regs);
 
   if (caching == REGEX_NOT_CACHED)
     pre_free_pattern (regex);
 
-  return res + 1;
+  return regs;
+}
+
+void 
+reh_free_registers(struct pre_registers *regs)
+{
+       pre_free_registers(regs);
+       free(regs);
 }
 
 /* Match helper function */
@@ -281,6 +292,7 @@ gst_initModule (VMProxy * proxy)
   vmProxy = proxy;
   vmProxy->defineCFunc ("reh_search", reh_search);
   vmProxy->defineCFunc ("reh_match", reh_match);
+  vmProxy->defineCFunc ("reh_free_registers", reh_free_registers);
   vmProxy->defineCFunc ("reh_make_cacheable", reh_make_cacheable);
 
   regexClass = vmProxy->classNameToOOP ("Regex");


--- orig/examples/regex.st
+++ mod/examples/regex.st
@@ -9,7 +9,7 @@
 "======================================================================
 |
 | Copyright 2001, 2003, 2005 Free Software Foundation, Inc.
-| Written by Dragomir Milivojevic.
+| Written by Dragomir Milevojevic, Paolo Bonzini, Mike Anderson.
 |
 | This file is part of the GNU Smalltalk class library.
 |
@@ -50,6 +50,91 @@ them outside the loop, but special care 
 object is used whenever possible (when converting Strings to Regex, the
 cache is sought for an equivalent, already constructed Regex).'.
 
+CStruct subclass: #CRegexRegisters
+        declaration: #( (#allocated #int)
+                       (#numRegs #int)
+                       (#beg (#ptr #int))
+                       (#end (#ptr #int)) )
+        classVariableNames: ''
+        poolDictionaries: ''
+        category: 'Regex'
+!
+
+Object subclass: #RegexResults
+        instanceVariableNames: 'subject from to registers match cache'
+        classVariableNames: ''
+        poolDictionaries: ''
+        category: 'Regex'
+!
+
+!RegexResults methodsFor: 'accessing'!
+
+subject
+    ^subject!
+
+size
+    ^registers size!
+
+from
+    ^from!
+
+fromAt: anIndex
+    | reg |
+    anIndex = 0 ifTrue: [ ^from ].
+    reg := registers at: anIndex.
+    ^reg isNil ifTrue: [ nil ] ifFalse: [ reg first ]!
+
+to
+    ^to!
+
+toAt: anIndex
+    | reg |
+    anIndex = 0 ifTrue: [ ^from ].
+    reg := registers at: anIndex.
+    ^reg isNil ifTrue: [ nil ] ifFalse: [ reg last ]!
+
+match
+    match isNil
+       ifTrue: [ match := self subject copyFrom: from to: to ].
+    ^match!
+
+matchInterval
+    ^from to: to!
+
+at: anIndex
+    | reg text |
+    anIndex = 0 ifTrue: [ ^self match ].
+    (cache at: anIndex) isNil
+       ifTrue: [
+           reg := registers at: anIndex.
+           text := reg isNil
+               ifTrue: [ nil ]
+               ifFalse: [ self subject copyFrom: reg first to: reg last ].
+           cache at: anIndex put: text ].
+    ^cache at: anIndex!
+
+intervalAt: anIndex
+    ^anIndex = 0
+       ifTrue: [ from to: to ]
+       ifFalse: [ registers at: anIndex ]!
+
+size
+    ^registers size! !
+
+!RegexResults methodsFor: 'private'!
+
+initialize: regs subject: aString
+    from := regs matchBeg.
+    to := regs matchEnd.
+    registers := (1 to: regs numRegs value - 1) collect: [ :i |
+       | beg end |
+       beg := (regs begAt: i).
+       end := (regs endAt: i).
+       end < 0 ifTrue: [ nil ] ifFalse: [ beg to: end ] ].
+    cache := Array new: registers size.
+    subject := aString!
+
+    
 " --- external function definitions --- "
 
 !Regex class methodsFor: 'C call-outs'!
@@ -59,14 +144,38 @@ fromString: aString
 
 !String methodsFor: 'C call-outs'!
 
-searchRegex: pattern from: from to: to
-    <cCall: 'reh_search' returning: #int
-       args: #(#selfSmalltalk #smalltalk #int #int)>!
-
 lengthOfRegexMatch: pattern from: from to: to
     <cCall: 'reh_match' returning: #int
+       args: #(#selfSmalltalk #smalltalk #int #int)>!
+
+searchRegexInternal: pattern from: from to: to
+    <cCall: 'reh_search' returning: CRegexRegisters type
        args: #(#selfSmalltalk #smalltalk #int #int)>! !
 
+!CRegexRegisters methodsFor: 'C call-outs'!
+
+begAt: i
+    ^(self beg value + i) value + 1!
+
+matchBeg
+    | begValue |
+    begValue := self beg value.
+    begValue isNil ifTrue: [ ^-1 ].
+    ^begValue value + 1!
+
+endAt: i
+    ^(self end value + i) value!
+
+matchEnd
+    | endValue |
+    endValue := self end value.
+    endValue isNil ifTrue: [ ^-1 ].
+    ^endValue value!
+
+free
+    <cCall: 'reh_free_registers' returning: #void
+       args: #(#self)>! !
+
 "--------------------------------------------------------------------------"
 
 !Regex class methodsFor: 'instance creation'!
@@ -138,7 +247,11 @@ printOn: aStream
 
 =~ regexString
     "Answer whether an occurrence of the regex is present in the receiver"
-    ^(self searchRegex: regexString from: 1 to: self size) > 0
+    | regs gotIt |
+    regs := self searchRegexInternal: regexString from: 1 to: self size.
+    gotIt := regs beg value notNil.
+    regs free.
+    ^gotIt
 !
 
 asRegex
@@ -146,60 +259,92 @@ asRegex
     ^Regex fromString: self
 !
 
+searchRegex: pattern
+    | regs |
+    regs := self searchRegexInternal: pattern from: 1 to: self size.
+    regs matchBeg = -1 ifTrue: [ regs free. ^nil ].
+    ^[ RegexResults new initialize: regs subject: self ]
+       ensure: [ regs free ]!
+
+searchRegex: pattern startingAt: anIndex
+    | regs |
+    regs := self searchRegexInternal: pattern from: anIndex to: self size.
+    regs matchBeg = -1 ifTrue: [ regs free. ^nil ].
+    ^[ RegexResults new initialize: regs subject: self ]
+       ensure: [ regs free ]!
+
+searchRegex: pattern from: from to: to
+    | regs |
+    regs := self searchRegexInternal: pattern from: from to: to.
+    regs matchBeg = -1 ifTrue: [ regs free. ^nil ].
+    ^[ RegexResults new initialize: regs subject: self ]
+       ensure: [ regs free ]!
+
 indexOfRegex: regexString ifAbsent: excBlock
     "Answer whether an occurrence of the regex is present in the receiver"
-    | start len |
-    start := self searchRegex: regexString from: 1 to: self size.
-    start > 0 ifFalse: [ ^excBlock value ].
-
-    len := self lengthOfRegexMatch: regexString from: start to: self size.
-    ^start to: start + len - 1
+    | regs beg end |
+    regs := self searchRegexInternal: regexString from: 1 to: self size.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+    ^beg >= 1
+       ifTrue: [ beg to: end ]
+       ifFalse: [ excBlock value ]
 !
 
 indexOfRegex: regexString startingAt: index ifAbsent: excBlock
-    | start len |
-    start := self searchRegex: regexString from: index to: self size.
-    start > 0 ifFalse: [ ^excBlock value ].
-
-    len := self lengthOfRegexMatch: regexString from: start to: self size.
-    ^start to: start + len - 1
-!
-
-indexOfRegex: regexString from: start to: end ifAbsent: excBlock
-    | idx len |
-    idx := self searchRegex: regexString from: idx to: end.
-    idx > 0 ifFalse: [ ^excBlock value ].
-    
-    len := self lengthOfRegexMatch: regexString from: idx to: self size.
-    ^start to: start + len - 1
+    | regs beg end |
+    regs := self searchRegexInternal: regexString from: index to: self size.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+    ^beg >= 1
+       ifTrue: [ beg to: end ]
+       ifFalse: [ excBlock value ]
+!
+
+indexOfRegex: regexString from: from to: to ifAbsent: excBlock
+    | regs beg end |
+    regs := self searchRegexInternal: regexString from: from to: to.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+    ^beg >= 1
+       ifTrue: [ beg to: end ]
+       ifFalse: [ excBlock value ]
 !
 
 indexOfRegex: regexString
-    "Answer whether an occurrence of the regex is present in the receiver"
-    | start len |
-    start := self searchRegex: regexString from: 1 to: self size.
-    start > 0 ifFalse: [ ^nil ].
-
-    len := self lengthOfRegexMatch: regexString from: start to: self size.
-    ^start to: start + len - 1
+    | regs beg end |
+    regs := self searchRegexInternal: regexString from: 1 to: self size.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+    ^beg >= 1
+       ifTrue: [ beg to: end ]
+       ifFalse: [ nil ]
 !
 
 indexOfRegex: regexString startingAt: index
-    | start len |
-    start := self searchRegex: regexString from: index to: self size.
-    start > 0 ifFalse: [ ^nil ].
-
-    len := self lengthOfRegexMatch: regexString from: start to: self size.
-    ^start to: start + len - 1
-!
-
-indexOfRegex: regexString from: start to: end
-    | idx len |
-    idx := self searchRegex: regexString from: idx to: end.
-    idx > 0 ifFalse: [ ^nil ].
-    
-    len := self lengthOfRegexMatch: regexString from: start to: self size.
-    ^start to: start + len - 1
+    | regs beg end |
+    regs := self searchRegexInternal: regexString from: index to: self size.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+    ^beg >= 1
+       ifTrue: [ beg to: end ]
+       ifFalse: [ nil ]
+!
+
+indexOfRegex: regexString from: from to: to
+    | regs beg end |
+    regs := self searchRegexInternal: regexString from: from to: to.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+    ^beg >= 1
+       ifTrue: [ beg to: end ]
+       ifFalse: [ nil ]
 !
 
 matchRegex: pattern
@@ -217,20 +362,18 @@ matchRegex: pattern from: from to: to
 occurrencesOfRegex: pattern from: from to: to
     "Returns count of how many times pattern repeats in string"
 
-    | res idx len regex |
+    | res idx regex beg end regs |
     regex := pattern asRegex.
     res := 0.
     idx := from.
     [
-       idx <= to and: [
-           idx := self searchRegex: regex from: idx to: to.
-           idx > 0 ]
+        regs := self searchRegexInternal: regex from: idx to: to.
+        beg := regs matchBeg.
+        end := regs matchEnd.
+        regs free.
+        beg >= 1
     ] whileTrue: [
-       
-       len := self lengthOfRegexMatch: regex from: idx to: to.
-       len = 0 ifTrue: [ len := 1 ].
-       
-       idx := idx + len.
+       idx := end max: beg + 1.
        res := res + 1.
     ].
 
@@ -252,23 +395,21 @@ occurrencesOfRegex: pattern
 copyFrom: from to: to replacingRegex: pattern with: str
     "Replaces first occurance of pattern with provided string"
 
-    | res idx len |
+    | res regs beg end |
+    regs := self searchRegexInternal: pattern from: from to: to.
+    beg := regs matchBeg.
+    end := regs matchEnd.
+    regs free.
+
+    beg >= 1
+       ifTrue: [
+           res := self species new: (to - from) - (end - beg) + str size.
+           res replaceFrom: 1 to: beg - from with: self startingAt: from.
+           res replaceFrom: beg - from + 1 to: beg - from + str size with: str.
+           res replaceFrom: beg - from + str size + 1 to: res size with: self 
startingAt: end - from + 2 ]
+       ifFalse: [ res := self copyFrom: from to: to ].
 
-    idx := self searchRegex: pattern from: from to: to.
-
-    idx > 0 ifTrue: [
-       res := self copyFrom: from to: idx - 1.
-       res := res, str.
-
-       idx := idx + (self lengthOfRegexMatch: pattern from: idx to: to).
-
-       idx <= to ifTrue: 
-           [ res := res, (self copyFrom: idx to: to) ].
-
-       ^ res
-    ].
-       
-    ^self copyFrom: from to: to
+    ^res
 !
 
 copyReplacingRegex: pattern with: str
@@ -280,29 +421,24 @@ copyReplacingRegex: pattern with: str
 copyFrom: from to: to replacingAllRegex: pattern with: str
     "Replaces all occurances of pattern between boundaries with specified 
string"
 
-    | res oldIdx idx len regex |
-    idx := from.
-    res := WriteStream on: self copyEmpty.
+    | res idx regex beg end regs |
     regex := pattern asRegex.
-
+    res := WriteStream on: (String new: to - from + 1).
+    idx := from.
     [
-       oldIdx := idx.
-       idx <= to and: [
-           idx := self searchRegex: regex from: idx to: to.
-           idx > 0 ]
+        regs := self searchRegexInternal: regex from: idx to: to.
+        beg := regs matchBeg.
+        end := regs matchEnd.
+        regs free.
+        beg >= 1
     ] whileTrue: [
-       oldIdx to: idx - 1 do: [ :each |
-           res nextPut: (self at: each) ].
-           
-       len := self lengthOfRegexMatch: regex from: idx to: to.
-       len = 0 ifTrue: [ len := 1 ].
-       
+       res next: beg - idx putAll: self startingAt: idx.
        res nextPutAll: str.
-       idx := idx + len.
+       idx := end + 1.
+       beg > end ifTrue: [ res nextPut: (self at: idx). idx := idx + 1 ].
+       idx > self size ifTrue: [ ^res contents ].
     ].
-
-    oldIdx to: to do: [ :each |
-       res nextPut: (self at: each) ].
+    res next: to - idx + 1 putAll: self startingAt: idx.
 
     ^res contents
 !
@@ -316,20 +452,18 @@ copyReplacingAllRegex: pattern with: str
 onOccurrencesOfRegex: pattern from: from to: to do: body
 "Searches for pattern and executed passed instruction-body (as a trigger)"
 
-    | idx len res regex |
-    idx := from.
+    | idx regex beg end regs |
     regex := pattern asRegex.
-    
+    idx := from.
     [
-       idx <= to and: [
-           idx := self searchRegex: regex from: idx to: to.
-           idx > 0 ]
+        regs := self searchRegexInternal: regex from: idx to: to.
+        beg := regs matchBeg.
+        end := regs matchEnd.
+        regs free.
+        beg >= 1
     ] whileTrue: [
-       len := self lengthOfRegexMatch: regex from: idx to: to.
-       
-       body value: idx value: len.
-       len = 0 ifTrue: [ len := 1 ].
-       idx := idx + len.
+       body value: beg value: end - beg + 1.
+       idx := end + 1 max: beg + 1.
     ].
 !
 
@@ -340,28 +474,24 @@ onOccurrencesOfRegex: pattern do: body
 !
 
 tokenize: pattern from: from to: to
-
-    | res oldIdx idx len regex |
-    idx := from.
-    res := WriteStream on: (Array new: 10).
+    | res idx regex beg end regs tokStart |
     regex := pattern asRegex.
-    
+    res := WriteStream on: (Array new: 10).
+    idx := from.
+    tokStart := 1.
     [
-       oldIdx := idx.
-       idx <= to and: [
-           idx := self searchRegex: regex from: idx to: to.
-           idx > 0 ]
+        regs := self searchRegexInternal: regex from: idx to: to.
+        beg := regs matchBeg.
+        end := regs matchEnd.
+        regs free.
+        beg >= 1
     ] whileTrue: [
-       len := self lengthOfRegexMatch: regex from: idx to: to.
-       res nextPut: (self copyFrom: oldIdx to: idx - 1).
-
-       len = 0 ifTrue: [ len := 1 ].
-       idx := idx + len.
+       res nextPut: (self copyFrom: tokStart to: beg - 1).
+       tokStart := end + 1.
+       idx := beg + 1 max: end + 1.
     ].
 
-    oldIdx <= to ifTrue: [
-       res nextPut: (self copyFrom: oldIdx to: to)
-    ].
+    res nextPut: (self copyFrom: tokStart to: to).
     ^res contents
 !
 
@@ -374,7 +504,6 @@ tokenize: pattern
 !
 
 tokenize
-
     ^self tokenize: '[\n\t ]+' from: 1 to: self size
 ! !
 


--- orig/kernel/Interval.st
+++ mod/kernel/Interval.st
@@ -8,7 +8,7 @@
 
 "======================================================================
 |
-| Copyright 1988,92,94,95,99,2000,2001,2002
+| Copyright 1988,92,94,95,99,2000,2001,2002,2005
 | Free Software Foundation, Inc.
 | Written by Steve Byrne.
 |
@@ -219,6 +219,18 @@ printOn: aStream
            print: stop
     ].
     aStream nextPut: $)
+!
+
+first
+    ^first
+!
+
+last
+    ^stop - ((stop - start) \\ step)
+!
+
+increment
+    ^step
 ! !
 
 
@@ -252,17 +264,5 @@ initializeFrom: startInteger to: stopInt
     start := startInteger.
     stop := stopInteger.
     step := stepInteger
-!
-
-start
-    ^start
-!
-
-stop
-    ^stop
-!
-
-step
-    ^step
 ! !
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]