Stanislav N. aka pztrn
48d43ca097
Pagination now works. Temporary hardcoded 10 pastes per page, will be put in configuration later. Maybe. From now user will receive readable error message if error occured. Started to work on syntax highlighting, tried to make lexers detection work but apparently to no avail.
897 lines
19 KiB
Go
897 lines
19 KiB
Go
package syntax
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"strconv"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
type Prefix struct {
|
|
PrefixStr []rune
|
|
PrefixSet CharSet
|
|
CaseInsensitive bool
|
|
}
|
|
|
|
// It takes a RegexTree and computes the set of chars that can start it.
|
|
func getFirstCharsPrefix(tree *RegexTree) *Prefix {
|
|
s := regexFcd{
|
|
fcStack: make([]regexFc, 32),
|
|
intStack: make([]int, 32),
|
|
}
|
|
fc := s.regexFCFromRegexTree(tree)
|
|
|
|
if fc == nil || fc.nullable || fc.cc.IsEmpty() {
|
|
return nil
|
|
}
|
|
fcSet := fc.getFirstChars()
|
|
return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
|
|
}
|
|
|
|
type regexFcd struct {
|
|
intStack []int
|
|
intDepth int
|
|
fcStack []regexFc
|
|
fcDepth int
|
|
skipAllChildren bool // don't process any more children at the current level
|
|
skipchild bool // don't process the current child.
|
|
failed bool
|
|
}
|
|
|
|
/*
|
|
* The main FC computation. It does a shortcutted depth-first walk
|
|
* through the tree and calls CalculateFC to emits code before
|
|
* and after each child of an interior node, and at each leaf.
|
|
*/
|
|
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
|
|
curNode := tree.root
|
|
curChild := 0
|
|
|
|
for {
|
|
if len(curNode.children) == 0 {
|
|
// This is a leaf node
|
|
s.calculateFC(curNode.t, curNode, 0)
|
|
} else if curChild < len(curNode.children) && !s.skipAllChildren {
|
|
// This is an interior node, and we have more children to analyze
|
|
s.calculateFC(curNode.t|beforeChild, curNode, curChild)
|
|
|
|
if !s.skipchild {
|
|
curNode = curNode.children[curChild]
|
|
// this stack is how we get a depth first walk of the tree.
|
|
s.pushInt(curChild)
|
|
curChild = 0
|
|
} else {
|
|
curChild++
|
|
s.skipchild = false
|
|
}
|
|
continue
|
|
}
|
|
|
|
// This is an interior node where we've finished analyzing all the children, or
|
|
// the end of a leaf node.
|
|
s.skipAllChildren = false
|
|
|
|
if s.intIsEmpty() {
|
|
break
|
|
}
|
|
|
|
curChild = s.popInt()
|
|
curNode = curNode.next
|
|
|
|
s.calculateFC(curNode.t|afterChild, curNode, curChild)
|
|
if s.failed {
|
|
return nil
|
|
}
|
|
|
|
curChild++
|
|
}
|
|
|
|
if s.fcIsEmpty() {
|
|
return nil
|
|
}
|
|
|
|
return s.popFC()
|
|
}
|
|
|
|
// To avoid recursion, we use a simple integer stack.
|
|
// This is the push.
|
|
func (s *regexFcd) pushInt(I int) {
|
|
if s.intDepth >= len(s.intStack) {
|
|
expanded := make([]int, s.intDepth*2)
|
|
copy(expanded, s.intStack)
|
|
s.intStack = expanded
|
|
}
|
|
|
|
s.intStack[s.intDepth] = I
|
|
s.intDepth++
|
|
}
|
|
|
|
// True if the stack is empty.
|
|
func (s *regexFcd) intIsEmpty() bool {
|
|
return s.intDepth == 0
|
|
}
|
|
|
|
// This is the pop.
|
|
func (s *regexFcd) popInt() int {
|
|
s.intDepth--
|
|
return s.intStack[s.intDepth]
|
|
}
|
|
|
|
// We also use a stack of RegexFC objects.
|
|
// This is the push.
|
|
func (s *regexFcd) pushFC(fc regexFc) {
|
|
if s.fcDepth >= len(s.fcStack) {
|
|
expanded := make([]regexFc, s.fcDepth*2)
|
|
copy(expanded, s.fcStack)
|
|
s.fcStack = expanded
|
|
}
|
|
|
|
s.fcStack[s.fcDepth] = fc
|
|
s.fcDepth++
|
|
}
|
|
|
|
// True if the stack is empty.
|
|
func (s *regexFcd) fcIsEmpty() bool {
|
|
return s.fcDepth == 0
|
|
}
|
|
|
|
// This is the pop.
|
|
func (s *regexFcd) popFC() *regexFc {
|
|
s.fcDepth--
|
|
return &s.fcStack[s.fcDepth]
|
|
}
|
|
|
|
// This is the top.
|
|
func (s *regexFcd) topFC() *regexFc {
|
|
return &s.fcStack[s.fcDepth-1]
|
|
}
|
|
|
|
// Called in Beforechild to prevent further processing of the current child
|
|
func (s *regexFcd) skipChild() {
|
|
s.skipchild = true
|
|
}
|
|
|
|
// FC computation and shortcut cases for each node type
|
|
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
|
|
//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
|
|
ci := false
|
|
rtl := false
|
|
|
|
if nt <= ntRef {
|
|
if (node.options & IgnoreCase) != 0 {
|
|
ci = true
|
|
}
|
|
if (node.options & RightToLeft) != 0 {
|
|
rtl = true
|
|
}
|
|
}
|
|
|
|
switch nt {
|
|
case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
|
|
break
|
|
|
|
case ntTestgroup | beforeChild:
|
|
if CurIndex == 0 {
|
|
s.skipChild()
|
|
}
|
|
break
|
|
|
|
case ntEmpty:
|
|
s.pushFC(regexFc{nullable: true})
|
|
break
|
|
|
|
case ntConcatenate | afterChild:
|
|
if CurIndex != 0 {
|
|
child := s.popFC()
|
|
cumul := s.topFC()
|
|
|
|
s.failed = !cumul.addFC(*child, true)
|
|
}
|
|
|
|
fc := s.topFC()
|
|
if !fc.nullable {
|
|
s.skipAllChildren = true
|
|
}
|
|
break
|
|
|
|
case ntTestgroup | afterChild:
|
|
if CurIndex > 1 {
|
|
child := s.popFC()
|
|
cumul := s.topFC()
|
|
|
|
s.failed = !cumul.addFC(*child, false)
|
|
}
|
|
break
|
|
|
|
case ntAlternate | afterChild, ntTestref | afterChild:
|
|
if CurIndex != 0 {
|
|
child := s.popFC()
|
|
cumul := s.topFC()
|
|
|
|
s.failed = !cumul.addFC(*child, false)
|
|
}
|
|
break
|
|
|
|
case ntLoop | afterChild, ntLazyloop | afterChild:
|
|
if node.m == 0 {
|
|
fc := s.topFC()
|
|
fc.nullable = true
|
|
}
|
|
break
|
|
|
|
case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
|
|
break
|
|
|
|
case ntRequire | beforeChild, ntPrevent | beforeChild:
|
|
s.skipChild()
|
|
s.pushFC(regexFc{nullable: true})
|
|
break
|
|
|
|
case ntRequire | afterChild, ntPrevent | afterChild:
|
|
break
|
|
|
|
case ntOne, ntNotone:
|
|
s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
|
|
break
|
|
|
|
case ntOneloop, ntOnelazy:
|
|
s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
|
|
break
|
|
|
|
case ntNotoneloop, ntNotonelazy:
|
|
s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
|
|
break
|
|
|
|
case ntMulti:
|
|
if len(node.str) == 0 {
|
|
s.pushFC(regexFc{nullable: true})
|
|
} else if !rtl {
|
|
s.pushFC(newRegexFc(node.str[0], false, false, ci))
|
|
} else {
|
|
s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
|
|
}
|
|
break
|
|
|
|
case ntSet:
|
|
s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
|
|
break
|
|
|
|
case ntSetloop, ntSetlazy:
|
|
s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
|
|
break
|
|
|
|
case ntRef:
|
|
s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
|
|
break
|
|
|
|
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
|
s.pushFC(regexFc{nullable: true})
|
|
break
|
|
|
|
default:
|
|
panic(fmt.Sprintf("unexpected op code: %v", nt))
|
|
}
|
|
}
|
|
|
|
type regexFc struct {
|
|
cc CharSet
|
|
nullable bool
|
|
caseInsensitive bool
|
|
}
|
|
|
|
func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
|
|
r := regexFc{
|
|
caseInsensitive: caseInsensitive,
|
|
nullable: nullable,
|
|
}
|
|
if not {
|
|
if ch > 0 {
|
|
r.cc.addRange('\x00', ch-1)
|
|
}
|
|
if ch < 0xFFFF {
|
|
r.cc.addRange(ch+1, utf8.MaxRune)
|
|
}
|
|
} else {
|
|
r.cc.addRange(ch, ch)
|
|
}
|
|
return r
|
|
}
|
|
|
|
func (r *regexFc) getFirstChars() CharSet {
|
|
if r.caseInsensitive {
|
|
r.cc.addLowercase()
|
|
}
|
|
|
|
return r.cc
|
|
}
|
|
|
|
func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
|
|
if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
|
|
return false
|
|
}
|
|
|
|
if concatenate {
|
|
if !r.nullable {
|
|
return true
|
|
}
|
|
|
|
if !fc.nullable {
|
|
r.nullable = false
|
|
}
|
|
} else {
|
|
if fc.nullable {
|
|
r.nullable = true
|
|
}
|
|
}
|
|
|
|
r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
|
|
r.cc.addSet(fc.cc)
|
|
|
|
return true
|
|
}
|
|
|
|
// This is a related computation: it takes a RegexTree and computes the
|
|
// leading substring if it sees one. It's quite trivial and gives up easily.
|
|
func getPrefix(tree *RegexTree) *Prefix {
|
|
var concatNode *regexNode
|
|
nextChild := 0
|
|
|
|
curNode := tree.root
|
|
|
|
for {
|
|
switch curNode.t {
|
|
case ntConcatenate:
|
|
if len(curNode.children) > 0 {
|
|
concatNode = curNode
|
|
nextChild = 0
|
|
}
|
|
|
|
case ntGreedy, ntCapture:
|
|
curNode = curNode.children[0]
|
|
concatNode = nil
|
|
continue
|
|
|
|
case ntOneloop, ntOnelazy:
|
|
if curNode.m > 0 {
|
|
return &Prefix{
|
|
PrefixStr: repeat(curNode.ch, curNode.m),
|
|
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
|
}
|
|
}
|
|
return nil
|
|
|
|
case ntOne:
|
|
return &Prefix{
|
|
PrefixStr: []rune{curNode.ch},
|
|
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
|
}
|
|
|
|
case ntMulti:
|
|
return &Prefix{
|
|
PrefixStr: curNode.str,
|
|
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
|
}
|
|
|
|
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
|
|
ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
|
|
|
|
default:
|
|
return nil
|
|
}
|
|
|
|
if concatNode == nil || nextChild >= len(concatNode.children) {
|
|
return nil
|
|
}
|
|
|
|
curNode = concatNode.children[nextChild]
|
|
nextChild++
|
|
}
|
|
}
|
|
|
|
// repeat the rune r, c times... up to the max of MaxPrefixSize
|
|
func repeat(r rune, c int) []rune {
|
|
if c > MaxPrefixSize {
|
|
c = MaxPrefixSize
|
|
}
|
|
|
|
ret := make([]rune, c)
|
|
|
|
// binary growth using copy for speed
|
|
ret[0] = r
|
|
bp := 1
|
|
for bp < len(ret) {
|
|
copy(ret[bp:], ret[:bp])
|
|
bp *= 2
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
// BmPrefix precomputes the Boyer-Moore
|
|
// tables for fast string scanning. These tables allow
|
|
// you to scan for the first occurrence of a string within
|
|
// a large body of text without examining every character.
|
|
// The performance of the heuristic depends on the actual
|
|
// string and the text being searched, but usually, the longer
|
|
// the string that is being searched for, the fewer characters
|
|
// need to be examined.
|
|
type BmPrefix struct {
|
|
positive []int
|
|
negativeASCII []int
|
|
negativeUnicode [][]int
|
|
pattern []rune
|
|
lowASCII rune
|
|
highASCII rune
|
|
rightToLeft bool
|
|
caseInsensitive bool
|
|
}
|
|
|
|
func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
|
|
|
|
b := &BmPrefix{
|
|
rightToLeft: rightToLeft,
|
|
caseInsensitive: caseInsensitive,
|
|
pattern: pattern,
|
|
}
|
|
|
|
if caseInsensitive {
|
|
for i := 0; i < len(b.pattern); i++ {
|
|
// We do the ToLower character by character for consistency. With surrogate chars, doing
|
|
// a ToLower on the entire string could actually change the surrogate pair. This is more correct
|
|
// linguistically, but since Regex doesn't support surrogates, it's more important to be
|
|
// consistent.
|
|
|
|
b.pattern[i] = unicode.ToLower(b.pattern[i])
|
|
}
|
|
}
|
|
|
|
var beforefirst, last, bump int
|
|
var scan, match int
|
|
|
|
if !rightToLeft {
|
|
beforefirst = -1
|
|
last = len(b.pattern) - 1
|
|
bump = 1
|
|
} else {
|
|
beforefirst = len(b.pattern)
|
|
last = 0
|
|
bump = -1
|
|
}
|
|
|
|
// PART I - the good-suffix shift table
|
|
//
|
|
// compute the positive requirement:
|
|
// if char "i" is the first one from the right that doesn't match,
|
|
// then we know the matcher can advance by _positive[i].
|
|
//
|
|
// This algorithm is a simplified variant of the standard
|
|
// Boyer-Moore good suffix calculation.
|
|
|
|
b.positive = make([]int, len(b.pattern))
|
|
|
|
examine := last
|
|
ch := b.pattern[examine]
|
|
b.positive[examine] = bump
|
|
examine -= bump
|
|
|
|
Outerloop:
|
|
for {
|
|
// find an internal char (examine) that matches the tail
|
|
|
|
for {
|
|
if examine == beforefirst {
|
|
break Outerloop
|
|
}
|
|
if b.pattern[examine] == ch {
|
|
break
|
|
}
|
|
examine -= bump
|
|
}
|
|
|
|
match = last
|
|
scan = examine
|
|
|
|
// find the length of the match
|
|
for {
|
|
if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
|
|
// at the end of the match, note the difference in _positive
|
|
// this is not the length of the match, but the distance from the internal match
|
|
// to the tail suffix.
|
|
if b.positive[match] == 0 {
|
|
b.positive[match] = match - scan
|
|
}
|
|
|
|
// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
|
|
|
|
break
|
|
}
|
|
|
|
scan -= bump
|
|
match -= bump
|
|
}
|
|
|
|
examine -= bump
|
|
}
|
|
|
|
match = last - bump
|
|
|
|
// scan for the chars for which there are no shifts that yield a different candidate
|
|
|
|
// The inside of the if statement used to say
|
|
// "_positive[match] = last - beforefirst;"
|
|
// This is slightly less aggressive in how much we skip, but at worst it
|
|
// should mean a little more work rather than skipping a potential match.
|
|
for match != beforefirst {
|
|
if b.positive[match] == 0 {
|
|
b.positive[match] = bump
|
|
}
|
|
|
|
match -= bump
|
|
}
|
|
|
|
// PART II - the bad-character shift table
|
|
//
|
|
// compute the negative requirement:
|
|
// if char "ch" is the reject character when testing position "i",
|
|
// we can slide up by _negative[ch];
|
|
// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
|
|
//
|
|
// the lookup table is divided into ASCII and Unicode portions;
|
|
// only those parts of the Unicode 16-bit code set that actually
|
|
// appear in the string are in the table. (Maximum size with
|
|
// Unicode is 65K; ASCII only case is 512 bytes.)
|
|
|
|
b.negativeASCII = make([]int, 128)
|
|
|
|
for i := 0; i < len(b.negativeASCII); i++ {
|
|
b.negativeASCII[i] = last - beforefirst
|
|
}
|
|
|
|
b.lowASCII = 127
|
|
b.highASCII = 0
|
|
|
|
for examine = last; examine != beforefirst; examine -= bump {
|
|
ch = b.pattern[examine]
|
|
|
|
switch {
|
|
case ch < 128:
|
|
if b.lowASCII > ch {
|
|
b.lowASCII = ch
|
|
}
|
|
|
|
if b.highASCII < ch {
|
|
b.highASCII = ch
|
|
}
|
|
|
|
if b.negativeASCII[ch] == last-beforefirst {
|
|
b.negativeASCII[ch] = last - examine
|
|
}
|
|
case ch <= 0xffff:
|
|
i, j := ch>>8, ch&0xFF
|
|
|
|
if b.negativeUnicode == nil {
|
|
b.negativeUnicode = make([][]int, 256)
|
|
}
|
|
|
|
if b.negativeUnicode[i] == nil {
|
|
newarray := make([]int, 256)
|
|
|
|
for k := 0; k < len(newarray); k++ {
|
|
newarray[k] = last - beforefirst
|
|
}
|
|
|
|
if i == 0 {
|
|
copy(newarray, b.negativeASCII)
|
|
//TODO: this line needed?
|
|
b.negativeASCII = newarray
|
|
}
|
|
|
|
b.negativeUnicode[i] = newarray
|
|
}
|
|
|
|
if b.negativeUnicode[i][j] == last-beforefirst {
|
|
b.negativeUnicode[i][j] = last - examine
|
|
}
|
|
default:
|
|
// we can't do the filter because this algo doesn't support
|
|
// unicode chars >0xffff
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return b
|
|
}
|
|
|
|
func (b *BmPrefix) String() string {
|
|
return string(b.pattern)
|
|
}
|
|
|
|
// Dump returns the contents of the filter as a human readable string
|
|
func (b *BmPrefix) Dump(indent string) string {
|
|
buf := &bytes.Buffer{}
|
|
|
|
fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
|
|
for i := 0; i < len(b.positive); i++ {
|
|
buf.WriteString(strconv.Itoa(b.positive[i]))
|
|
buf.WriteRune(' ')
|
|
}
|
|
buf.WriteRune('\n')
|
|
|
|
if b.negativeASCII != nil {
|
|
buf.WriteString(indent)
|
|
buf.WriteString("Negative table\n")
|
|
for i := 0; i < len(b.negativeASCII); i++ {
|
|
if b.negativeASCII[i] != len(b.pattern) {
|
|
fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
|
|
}
|
|
}
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// Scan uses the Boyer-Moore algorithm to find the first occurrence
|
|
// of the specified string within text, beginning at index, and
|
|
// constrained within beglimit and endlimit.
|
|
//
|
|
// The direction and case-sensitivity of the match is determined
|
|
// by the arguments to the RegexBoyerMoore constructor.
|
|
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
|
|
var (
|
|
defadv, test, test2 int
|
|
match, startmatch, endmatch int
|
|
bump, advance int
|
|
chTest rune
|
|
unicodeLookup []int
|
|
)
|
|
|
|
if !b.rightToLeft {
|
|
defadv = len(b.pattern)
|
|
startmatch = len(b.pattern) - 1
|
|
endmatch = 0
|
|
test = index + defadv - 1
|
|
bump = 1
|
|
} else {
|
|
defadv = -len(b.pattern)
|
|
startmatch = 0
|
|
endmatch = -defadv - 1
|
|
test = index + defadv
|
|
bump = -1
|
|
}
|
|
|
|
chMatch := b.pattern[startmatch]
|
|
|
|
for {
|
|
if test >= endlimit || test < beglimit {
|
|
return -1
|
|
}
|
|
|
|
chTest = text[test]
|
|
|
|
if b.caseInsensitive {
|
|
chTest = unicode.ToLower(chTest)
|
|
}
|
|
|
|
if chTest != chMatch {
|
|
if chTest < 128 {
|
|
advance = b.negativeASCII[chTest]
|
|
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
|
unicodeLookup = b.negativeUnicode[chTest>>8]
|
|
if len(unicodeLookup) > 0 {
|
|
advance = unicodeLookup[chTest&0xFF]
|
|
} else {
|
|
advance = defadv
|
|
}
|
|
} else {
|
|
advance = defadv
|
|
}
|
|
|
|
test += advance
|
|
} else { // if (chTest == chMatch)
|
|
test2 = test
|
|
match = startmatch
|
|
|
|
for {
|
|
if match == endmatch {
|
|
if b.rightToLeft {
|
|
return test2 + 1
|
|
} else {
|
|
return test2
|
|
}
|
|
}
|
|
|
|
match -= bump
|
|
test2 -= bump
|
|
|
|
chTest = text[test2]
|
|
|
|
if b.caseInsensitive {
|
|
chTest = unicode.ToLower(chTest)
|
|
}
|
|
|
|
if chTest != b.pattern[match] {
|
|
advance = b.positive[match]
|
|
if (chTest & 0xFF80) == 0 {
|
|
test2 = (match - startmatch) + b.negativeASCII[chTest]
|
|
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
|
unicodeLookup = b.negativeUnicode[chTest>>8]
|
|
if len(unicodeLookup) > 0 {
|
|
test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
|
|
} else {
|
|
test += advance
|
|
break
|
|
}
|
|
} else {
|
|
test += advance
|
|
break
|
|
}
|
|
|
|
if b.rightToLeft {
|
|
if test2 < advance {
|
|
advance = test2
|
|
}
|
|
} else if test2 > advance {
|
|
advance = test2
|
|
}
|
|
|
|
test += advance
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
|
|
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
|
|
if !b.rightToLeft {
|
|
if index < beglimit || endlimit-index < len(b.pattern) {
|
|
return false
|
|
}
|
|
|
|
return b.matchPattern(text, index)
|
|
} else {
|
|
if index > endlimit || index-beglimit < len(b.pattern) {
|
|
return false
|
|
}
|
|
|
|
return b.matchPattern(text, index-len(b.pattern))
|
|
}
|
|
}
|
|
|
|
func (b *BmPrefix) matchPattern(text []rune, index int) bool {
|
|
if len(text)-index < len(b.pattern) {
|
|
return false
|
|
}
|
|
|
|
if b.caseInsensitive {
|
|
for i := 0; i < len(b.pattern); i++ {
|
|
//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
|
|
if unicode.ToLower(text[index+i]) != b.pattern[i] {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
} else {
|
|
for i := 0; i < len(b.pattern); i++ {
|
|
if text[index+i] != b.pattern[i] {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
}
|
|
|
|
type AnchorLoc int16
|
|
|
|
// where the regex can be pegged
|
|
const (
|
|
AnchorBeginning AnchorLoc = 0x0001
|
|
AnchorBol = 0x0002
|
|
AnchorStart = 0x0004
|
|
AnchorEol = 0x0008
|
|
AnchorEndZ = 0x0010
|
|
AnchorEnd = 0x0020
|
|
AnchorBoundary = 0x0040
|
|
AnchorECMABoundary = 0x0080
|
|
)
|
|
|
|
func getAnchors(tree *RegexTree) AnchorLoc {
|
|
|
|
var concatNode *regexNode
|
|
nextChild, result := 0, AnchorLoc(0)
|
|
|
|
curNode := tree.root
|
|
|
|
for {
|
|
switch curNode.t {
|
|
case ntConcatenate:
|
|
if len(curNode.children) > 0 {
|
|
concatNode = curNode
|
|
nextChild = 0
|
|
}
|
|
|
|
case ntGreedy, ntCapture:
|
|
curNode = curNode.children[0]
|
|
concatNode = nil
|
|
continue
|
|
|
|
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
|
|
ntStart, ntEndZ, ntEnd:
|
|
return result | anchorFromType(curNode.t)
|
|
|
|
case ntEmpty, ntRequire, ntPrevent:
|
|
|
|
default:
|
|
return result
|
|
}
|
|
|
|
if concatNode == nil || nextChild >= len(concatNode.children) {
|
|
return result
|
|
}
|
|
|
|
curNode = concatNode.children[nextChild]
|
|
nextChild++
|
|
}
|
|
}
|
|
|
|
func anchorFromType(t nodeType) AnchorLoc {
|
|
switch t {
|
|
case ntBol:
|
|
return AnchorBol
|
|
case ntEol:
|
|
return AnchorEol
|
|
case ntBoundary:
|
|
return AnchorBoundary
|
|
case ntECMABoundary:
|
|
return AnchorECMABoundary
|
|
case ntBeginning:
|
|
return AnchorBeginning
|
|
case ntStart:
|
|
return AnchorStart
|
|
case ntEndZ:
|
|
return AnchorEndZ
|
|
case ntEnd:
|
|
return AnchorEnd
|
|
default:
|
|
return 0
|
|
}
|
|
}
|
|
|
|
// anchorDescription returns a human-readable description of the anchors
|
|
func (anchors AnchorLoc) String() string {
|
|
buf := &bytes.Buffer{}
|
|
|
|
if 0 != (anchors & AnchorBeginning) {
|
|
buf.WriteString(", Beginning")
|
|
}
|
|
if 0 != (anchors & AnchorStart) {
|
|
buf.WriteString(", Start")
|
|
}
|
|
if 0 != (anchors & AnchorBol) {
|
|
buf.WriteString(", Bol")
|
|
}
|
|
if 0 != (anchors & AnchorBoundary) {
|
|
buf.WriteString(", Boundary")
|
|
}
|
|
if 0 != (anchors & AnchorECMABoundary) {
|
|
buf.WriteString(", ECMABoundary")
|
|
}
|
|
if 0 != (anchors & AnchorEol) {
|
|
buf.WriteString(", Eol")
|
|
}
|
|
if 0 != (anchors & AnchorEnd) {
|
|
buf.WriteString(", End")
|
|
}
|
|
if 0 != (anchors & AnchorEndZ) {
|
|
buf.WriteString(", EndZ")
|
|
}
|
|
|
|
// trim off comma
|
|
if buf.Len() >= 2 {
|
|
return buf.String()[2:]
|
|
}
|
|
return "None"
|
|
}
|