SystemOrganization addCategory: #'PetitRegex-Core'! SystemOrganization addCategory: #'PetitRegex-Tests'! PPCompositeParser subclass: #PPRegexParser instanceVariableNames: 'regex atom anyAtom beginOfLineAtom endOfLineAtom characterAtom rangePieceSpec number groupAtom escapeAtom characterGroup quantifier' classVariableNames: 'EscapeMap' poolDictionaries: '' category: 'PetitRegex-Core'! !PPRegexParser class methodsFor: 'initialization' stamp: 'lr 2/8/2010 08:40'! condition: aBlock ^ [ :stream | (aBlock value: stream) ifFalse: [ PPFailure new ] ] asParser! ! !PPRegexParser class methodsFor: 'initialization' stamp: 'lr 2/8/2010 10:13'! initialize EscapeMap := Dictionary new. EscapeMap "only at beginning of string" at: $A put: (self condition: [ :stream | stream position = 0 ]); "only at end of string" at: $Z put: (self condition: [ :stream | stream atEnd ]); "newline" at: $n put: Character lf asParser; "carriage return" at: $r put: Character cr asParser; "tab" at: $t put: Character tab asParser; "digit" at: $d put: #digit asParser; "not digit" at: $D put: #digit asParser negate; "word" at: $w put: #word asParser; "not word" at: $W put: #word asParser negate; "whitespace" at: $s put: #space asParser; "not whitespace" at: $S put: #space asParser negate! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/7/2010 23:28'! anyAtom ^ $. asParser map: [ :char | #any asParser ]! ! !PPRegexParser methodsFor: 'grammar' stamp: 'lr 2/8/2010 00:18'! atom ^ escapeAtom / anyAtom / beginOfLineAtom / endOfLineAtom / characterGroup / groupAtom / characterAtom! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/8/2010 00:40'! beginOfLineAtom ^ $^ asParser map: [ :char | [ :stream | (stream position = 0 or: [ (stream skip: -1; next) = Character cr ]) ifFalse: [ PPFailure at: stream position ] ] asParser ]! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/8/2010 00:16'! characterAtom ^ $) asParser negate map: [ :char | char asParser ]! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/8/2010 00:29'! characterGroup ^ $[ asParser , $^ asParser optional , [ :stream | | set | set := OrderedCollection new. [ stream atEnd or: [ stream peek = $] ] ] whileFalse: [ (stream peek = $- and: [ set notEmpty ]) ifTrue: [ set addAll: (set removeLast to: (stream next; next)) ] ifFalse: [ set add: stream next ] ]. set ] asParser , $] asParser map: [ :open :negate :set :close | negate isNil ifTrue: [ (PPPredicateParser anyOf: set asSet) ] ifFalse: [ (PPPredicateParser anyOf: set asSet) negate ] ]! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/8/2010 00:40'! endOfLineAtom ^ $$ asParser map: [ :char | [ :stream | (stream atEnd or: [ stream peek = Character cr ]) ifFalse: [ PPFailure at: stream position ] ] asParser ]! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/8/2010 08:43'! escapeAtom ^ $\ asParser , #any asParser map: [ :escape :char | EscapeMap at: char ifAbsent: [ char asParser ] ]! ! !PPRegexParser methodsFor: 'grammar-atoms' stamp: 'lr 2/8/2010 00:11'! groupAtom ^ $( asParser , regex , $) asParser ==> #second! ! !PPRegexParser methodsFor: 'other' stamp: 'lr 2/8/2010 19:25'! number ^ #digit asParser plus flatten ==> [ :token | token asInteger ]! ! !PPRegexParser methodsFor: 'grammar-operators' stamp: 'lr 3/1/2010 16:33'! oneOrMorePiece "+ 1 or more times" ^ $+ asParser , quantifier map: [ :op :qu | qu on: nil min: 1 max: SmallInteger maxVal ]! ! !PPRegexParser methodsFor: 'grammar-operators' stamp: 'lr 3/1/2010 16:24'! quantifier ^ ($+ asParser ==> [ :node | PPRepeatingParser ]) / ($? asParser ==> [ :node | PPLazyQuantifierStub ]) / (nil asParser ==> [ :node | PPGreedyQuantifierStub ])! ! !PPRegexParser methodsFor: 'grammar-operators' stamp: 'lr 3/1/2010 16:33'! rangePiece ^ ${ asParser , rangePieceSpec , $} asParser , quantifier map: [ :op :ra :cl :qu | qu on: nil min: ra first max: ra last ]! ! !PPRegexParser methodsFor: 'grammar-operators' stamp: 'lr 3/1/2010 16:31'! rangePieceSpec "{n,m} at least n but not more than m times {n,} at least n times {n} exactly n times" ^ number optional , $, asParser optional , number optional map: [ :n :op :m | (n ifNil: [ 0 ]) to: (m ifNil: [ SmallInteger maxVal ]) ]! ! !PPRegexParser methodsFor: 'grammar' stamp: 'lr 3/1/2010 16:32'! regex ^ PPExpressionParser new term: atom; group: [ :g |. g postfix: self zeroOrOnePiece do: [ :at :op | op setParser: at ]. g postfix: self zeroOrMorePiece do: [ :at :op | op setParser: at ]. g postfix: self oneOrMorePiece do: [ :at :op | op setParser: at ]. g postfix: self rangePiece do: [ :at :op | op setParser: at ] ]; group: [ :g | g left: nil do: [ :left :op :right | left , right ] ]; group: [ :g | g left: $| do: [ :left :op :right | left / right ] ]; yourself! ! !PPRegexParser methodsFor: 'private' stamp: 'lr 3/1/2010 16:40'! resolveStubs: aParser "In this post-processing step we need to fix the quantifiers." aParser followSets keysAndValuesDo: [ :each :set | (each class = PPGreedyQuantifierStub or: [ each class = PPLazyQuantifierStub ]) ifTrue: [ each def: (each resolve: (set size = 1 ifTrue: [ set anyOne ] ifFalse: [ PPChoiceParser withAll: set ])) ] ]. ^ aParser! ! !PPRegexParser methodsFor: 'accessing' stamp: 'lr 3/1/2010 16:17'! start ^ regex end ==> [ :node | self resolveStubs: node ]! ! !PPRegexParser methodsFor: 'grammar-operators' stamp: 'lr 3/1/2010 16:33'! zeroOrMorePiece "* 0 or more times" ^ $* asParser , quantifier map: [ :op :qu | qu on: nil min: 0 max: SmallInteger maxVal ]! ! !PPRegexParser methodsFor: 'grammar-operators' stamp: 'lr 3/1/2010 16:33'! zeroOrOnePiece "? 0 or 1 times" ^ $? asParser , quantifier map: [ :op :qu | qu on: nil min: 0 max: 1 ]! ! PPRepeatingParser subclass: #PPGreedyQuantifierStub instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'PetitRegex-Core'! !PPGreedyQuantifierStub methodsFor: 'protected' stamp: 'lr 3/1/2010 21:26'! resolve: aParser ^ (min = 0 and: [ max = SmallInteger maxVal ]) ifTrue: [ parser starGreedy: aParser ] ifFalse: [ (min = 1 and: [ max = SmallInteger maxVal ]) ifTrue: [ parser plusGreedy: aParser ] ifFalse: [ self error: 'Invalid quantifier range' ] ]! ! PPRepeatingParser subclass: #PPLazyQuantifierStub instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'PetitRegex-Core'! !PPLazyQuantifierStub methodsFor: 'protected' stamp: 'lr 3/1/2010 21:26'! resolve: aParser ^ (min = 0 and: [ max = SmallInteger maxVal ]) ifTrue: [ parser starLazy: aParser ] ifFalse: [ (min = 1 and: [ max = SmallInteger maxVal ]) ifTrue: [ parser plusLazy: aParser ] ifFalse: [ self error: 'Invalid quantifier range' ] ]! ! TestCase subclass: #PPRegexTest instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'PetitRegex-Tests'! !PPRegexTest class methodsFor: 'accessing' stamp: 'lr 2/8/2010 10:14'! packageNamesUnderTest ^ #('PetitRegex')! ! !PPRegexTest methodsFor: 'utilties' stamp: 'lr 2/8/2010 00:49'! compileRegex: aString "Compile the regex and answer the matcher, or answer nil if compilation fails." | syntaxTree | syntaxTree := self parserClass parse: aString. ^ syntaxTree isPetitFailure ifFalse: [ syntaxTree ]! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! henryReadme self error: 'The tests in this category are based on the ones in Henry Spencer''s regexp.c package.'! ! !PPRegexTest methodsFor: 'accessing' stamp: 'lr 2/8/2010 00:46'! parserClass ^ PPRegexParser! ! !PPRegexTest methodsFor: 'utilties' stamp: 'lr 3/1/2010 21:33'! runMatcher: aParser with: aString expect: aBoolean withSubexpressions: anArray | copy got | "copy := aParser copy: aString translatingMatchesUsing: [ :each | each ]. self assert: copy = aString description: 'Copying: expected ' , aString printString , ', but got ' , copy printString." got := aParser matchesIn: aString. self assert: got notEmpty = aBoolean description: 'Searching: expected ' , aBoolean printString , ', but got ' , got printString. "(anArray isNil or: [ aParser supportsSubexpressions not ]) ifTrue: [ ^ self ]. 1 to: anArray size by: 2 do: [ :index | | sub subExpect subGot | sub := anArray at: index. subExpect := anArray at: index + 1. subGot := aParser subexpression: sub. self assert: subExpect = subGot description: 'Subexpression ' , sub printString , ': expected ' , subExpect printString , ', but got ' , subGot printString ]"! ! !PPRegexTest methodsFor: 'utilties' stamp: 'lr 2/8/2010 00:52'! runRegex: anArray "Run a clause anArray against a set of tests. Each clause is an array with a regex source string followed by sequence of 3-tuples. Each three-element group is one test to try against the regex, and includes: 1) test string; 2) expected result; 3) expected subexpression as an array of (index, substring), or nil." | source matcher | source := anArray first. matcher := self compileRegex: source. matcher isNil ifTrue: [ (anArray at: 2) isNil ifFalse: [ self signalFailure: 'Compilation failed, should have succeeded: ' , source printString ] ] ifFalse: [ (anArray at: 2) isNil ifTrue: [ self signalFailure: 'Compilation succeeded, should have failed: ' , source printString ] ifFalse: [ 2 to: anArray size by: 3 do: [ :index | self runMatcher: matcher with: (anArray at: index) expect: (anArray at: index + 1) withSubexpressions: (anArray at: index + 2) ] ] ]! ! !PPRegexTest methodsFor: 'testing-protocol' stamp: 'lr 2/8/2010 00:45'! testCaseInsensitive | matcher | matcher := self matcherClass forString: 'the quick brown fox' ignoreCase: true. self assert: (matcher search: 'the quick brown fox'). self assert: (matcher search: 'The quick brown FOX'). self assert: (matcher search: 'What do you know about the quick brown fox?'). self assert: (matcher search: 'What do you know about THE QUICK BROWN FOX?')! ! !PPRegexTest methodsFor: 'testing-protocol' stamp: 'lr 2/8/2010 00:45'! testCaseSensitive | matcher | matcher := self matcherClass forString: 'the quick brown fox' ignoreCase: false. self assert: (matcher search: 'the quick brown fox'). self deny: (matcher search: 'The quick brown FOX'). self assert: (matcher search: 'What do you know about the quick brown fox?'). self deny: (matcher search: 'What do you know about THE QUICK BROWN FOX?')! ! !PPRegexTest methodsFor: 'testing-protocol' stamp: 'lr 2/8/2010 00:45'! testCopyReplacingMatches "See that the match context is preserved while copying stuff between matches:" | matcher | matcher := self matcherClass forString: '\<\d\D+'. self assert: (matcher copy: '9aaa1bbb 8ccc' replacingMatchesWith: 'foo') = 'foo1bbb foo'! ! !PPRegexTest methodsFor: 'testing-protocol' stamp: 'lr 2/8/2010 00:45'! testCopyTranslatingMatches | matcher | matcher := self matcherClass forString: '\w+'. self assert: (matcher copy: 'now is the time ' translatingMatchesUsing: [ :each | each reverse ]) = 'won si eht emit '! ! !PPRegexTest methodsFor: 'testing-empty' stamp: 'lr 2/8/2010 00:45'! testEmptyStringAtBeginningOfLine | matcher | matcher := self matcherClass forString: '^'. self assert: (matcher copy: 'foo1 bar1' , String cr , 'foo2 bar2' replacingMatchesWith: '*') = ('*foo1 bar1' , String cr , '*foo2 bar2') description: 'An empty string at the beginning of a line'! ! !PPRegexTest methodsFor: 'testing-empty' stamp: 'lr 2/8/2010 00:45'! testEmptyStringAtBeginningOfWord | matcher | matcher := self matcherClass forString: '\<'. self assert: (matcher copy: 'foo bar' replacingMatchesWith: '*') = '*foo *bar' description: 'An empty string at the beginning of a word'! ! !PPRegexTest methodsFor: 'testing-empty' stamp: 'lr 2/8/2010 00:45'! testEmptyStringAtEndOfLine | matcher | matcher := self matcherClass forString: '$'. self assert: (matcher copy: 'foo1 bar1' , String cr , 'foo2 bar2' replacingMatchesWith: '*') = ('foo1 bar1*', String cr , 'foo2 bar2*') description: 'An empty string at the end of a line'! ! !PPRegexTest methodsFor: 'testing-empty' stamp: 'lr 2/8/2010 00:45'! testEmptyStringAtEndOfWord | matcher | matcher := self matcherClass forString: '\>'. self assert: (matcher copy: 'foo bar' replacingMatchesWith: '*') = 'foo* bar*' description: 'An empty string at the end of a word'! ! !PPRegexTest methodsFor: 'testing-empty' stamp: 'lr 2/8/2010 00:45'! testEmptyStringAtWordBoundary | matcher | matcher := self matcherClass forString: '\b'. self assert: (matcher copy: 'foo bar' replacingMatchesWith: '*') = '*foo* *bar*' description: 'An empty string at a word boundary'! ! !PPRegexTest methodsFor: 'testing-empty' stamp: 'lr 2/8/2010 00:45'! testEmptyStringNotAtWordBoundary | matcher | matcher := self matcherClass forString: '\B'. self assert: (matcher copy: 'foo bar' replacingMatchesWith: '*') = 'f*o*o b*a*r' description: 'An empty string not at a word boundary'! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry001 self runRegex: #('abc' 'abc' true (1 'abc') 'xbc' false nil 'axc' false nil 'abx' false nil 'xabcy' true (1 'abc') 'ababc' true (1 'abc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry002 self runRegex: #('ab*c' 'abc' true (1 'abc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry003 self runRegex: #('ab*bc' 'abc' true (1 'abc') 'abbc' true (1 'abbc') 'abbbbc' true (1 'abbbbc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry004 self runRegex: #('ab+bc' 'abbc' true (1 'abbc') 'abc' false nil 'abq' false nil 'abbbbc' true (1 'abbbbc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry005 self runRegex: #('ab?bc' 'abbc' true (1 'abbc') 'abc' true (1 'abc') 'abbbbc' false nil 'abc' true (1 'abc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry006 self runRegex: #('^abc$' 'abc' true (1 'abc') 'abcc' false nil 'aabc' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry007 self runRegex: #('^abc' 'abcc' true (1 'abc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry008 self runRegex: #('abc$' 'aabc' true (1 'abc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry009 self runRegex: #('^' 'abc' true nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry010 self runRegex: #('$' 'abc' true nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry011 self runRegex: #('a.c' 'abc' true (1 'abc') 'axc' true (1 'axc'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry012 "Need to get creative to include the null character..." self runRegex: #('a.*c' 'axyzc' true (1 'axyzc') 'axy zc' true (1 'axy zc') "testing that a dot matches a space" ), (Array with: 'axy', (String with: 0 asCharacter), 'zc'), #(false nil "testing that a dot does not match a null" 'axyzd' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry013 self runRegex: #('.a.*' '1234abc' true (1 '4abc') 'abcd' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry014 self runRegex: #('a\w+c' ' abbbbc ' true (1 'abbbbc') 'abb bc' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry015 self runRegex: #('\w+' ' foobar quux' true (1 'foobar') ' ~!!@#$%^&*()-+=\|/?.>,<' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry016 self runRegex: #('a\W+c' 'a c' true (1 'a c') 'a bc' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry017 self runRegex: #('\W+' 'foo!!@#$bar' true (1 '!!@#$') 'foobar' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry018 self runRegex: #('a\s*c' 'a c' true (1 'a c') 'a bc' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry019 self runRegex: #('\s+' 'abc3457 sd' true (1 ' ') '1234$^*^&asdfb' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry020 self runRegex: #('a\S*c' 'aqwertyc' true (1 'aqwertyc') 'ab c' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry021 self runRegex: #('\S+' ' asdf ' true (1 'asdf') ' ' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry022 self runRegex: #('a\d+c' 'a0123456789c' true (1 'a0123456789c') 'a12b34c' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry023 self runRegex: #('\d+' 'foo@#$%123ASD #$$%^&' true (1 '123') 'foo!!@#$asdfl;' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry024 self runRegex: #('a\D+c' 'aqwertyc' true (1 'aqwertyc') 'aqw6ertc' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry025 self runRegex: #('\D+' '1234 abc 456' true (1 ' abc ') '1234567890' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry026 self runRegex: #('(f|o)+\b' 'foo' true (1 'foo') ' foo ' true (1 'foo'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry027 self runRegex: #('\ba\w+' "a word beginning with an A" 'land ancient' true (1 'ancient') 'antique vase' true (1 'antique') 'goofy foobar' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry028 self runRegex: #('(f|o)+\B' 'quuxfoobar' true (1 'foo') 'quuxfoo ' true (1 'fo'))! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry029 self runRegex: #('\Ba\w+' "a word with an A in the middle, match at A and further" 'land ancient' true (1 'and') 'antique vase' true (1 'ase') 'smalltalk shall overcome' true (1 'alltalk') 'foonix is better' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry030 self runRegex: #('fooa\>.*' 'fooa ' true nil 'fooa123' false nil 'fooa bar' true nil 'fooa' true nil 'fooargh' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry031 self runRegex: #('\>.+abc' ' abcde fg' false nil 'foo abcde' true (1 ' abc') 'abcde' false nil)! ! !PPRegexTest methodsFor: 'testing-henry' stamp: 'lr 2/8/2010 00:45'! testHenry032 self runRegex: #('\