fastpastebin/vendor/github.com/dlclark/regexp2/syntax/prefix.go

package syntax

import (
"bytes"
"fmt"
"strconv"
"unicode"
"unicode/utf8"
)
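// Prefix describes what is known about the start of a match: either a literal
// leading string (PrefixStr) or the set of characters a match can begin with
// (PrefixSet), together with whether the comparison ignores case.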
type Prefix struct {
PrefixStr []rune
PrefixSet CharSet
CaseInsensitive bool
}
// getFirstCharsPrefix takes a RegexTree and computes the set of characters that
// a match can start with.
func getFirstCharsPrefix(tree *RegexTree) *Prefix {
s := regexFcd{
fcStack: make([]regexFc, 32),
intStack: make([]int, 32),
}
fc := s.regexFCFromRegexTree(tree)
if fc == nil || fc.nullable || fc.cc.IsEmpty() {
return nil
}
fcSet := fc.getFirstChars()
return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
}
type regexFcd struct {
intStack []int
intDepth int
fcStack []regexFc
fcDepth int
skipAllChildren bool // don't process any more children at the current level
skipchild bool // don't process the current child.
failed bool
}
/*
 * regexFCFromRegexTree is the main FC (first chars) computation. It does a
 * short-circuited depth-first walk through the tree and calls calculateFC to
 * emit an FC entry before and after each child of an interior node, and at
 * each leaf.
 */
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
curNode := tree.root
curChild := 0
for {
if len(curNode.children) == 0 {
// This is a leaf node
s.calculateFC(curNode.t, curNode, 0)
} else if curChild < len(curNode.children) && !s.skipAllChildren {
// This is an interior node, and we have more children to analyze
s.calculateFC(curNode.t|beforeChild, curNode, curChild)
if !s.skipchild {
curNode = curNode.children[curChild]
// this stack is how we get a depth first walk of the tree.
s.pushInt(curChild)
curChild = 0
} else {
curChild++
s.skipchild = false
}
continue
}
// This is an interior node where we've finished analyzing all the children, or
// the end of a leaf node.
s.skipAllChildren = false
if s.intIsEmpty() {
break
}
curChild = s.popInt()
curNode = curNode.next
s.calculateFC(curNode.t|afterChild, curNode, curChild)
if s.failed {
return nil
}
curChild++
}
if s.fcIsEmpty() {
return nil
}
return s.popFC()
}
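// For illustration (a rough sketch of the walk above): a pattern such as
// `(foo|bar)baz` pushes an FC entry for each leaf and merges them, so
// getFirstCharsPrefix would report a PrefixSet containing 'f' and 'b'. For `a*b`
// the optional leading a* is nullable, so 'b' is merged in as well, giving
// {'a', 'b'}. A pattern that can match the empty string outright, such as `a*`,
// produces a nullable FC and getFirstCharsPrefix returns nil.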
// To avoid recursion, we use a simple integer stack.
// This is the push.
func (s *regexFcd) pushInt(I int) {
if s.intDepth >= len(s.intStack) {
expanded := make([]int, s.intDepth*2)
copy(expanded, s.intStack)
s.intStack = expanded
}
s.intStack[s.intDepth] = I
s.intDepth++
}
// True if the stack is empty.
func (s *regexFcd) intIsEmpty() bool {
return s.intDepth == 0
}
// This is the pop.
func (s *regexFcd) popInt() int {
s.intDepth--
return s.intStack[s.intDepth]
}
// We also use a stack of RegexFC objects.
// This is the push.
func (s *regexFcd) pushFC(fc regexFc) {
if s.fcDepth >= len(s.fcStack) {
expanded := make([]regexFc, s.fcDepth*2)
copy(expanded, s.fcStack)
s.fcStack = expanded
}
s.fcStack[s.fcDepth] = fc
s.fcDepth++
}
// True if the stack is empty.
func (s *regexFcd) fcIsEmpty() bool {
return s.fcDepth == 0
}
// This is the pop.
func (s *regexFcd) popFC() *regexFc {
s.fcDepth--
return &s.fcStack[s.fcDepth]
}
// This is the top.
func (s *regexFcd) topFC() *regexFc {
return &s.fcStack[s.fcDepth-1]
}
// skipChild is called in the beforeChild pass to prevent further processing of the current child.
func (s *regexFcd) skipChild() {
s.skipchild = true
}
// FC computation and shortcut cases for each node type
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
ci := false
rtl := false
if nt <= ntRef {
if (node.options & IgnoreCase) != 0 {
ci = true
}
if (node.options & RightToLeft) != 0 {
rtl = true
}
}
switch nt {
case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
break
case ntTestgroup | beforeChild:
if CurIndex == 0 {
s.skipChild()
}
break
case ntEmpty:
s.pushFC(regexFc{nullable: true})
break
case ntConcatenate | afterChild:
if CurIndex != 0 {
child := s.popFC()
cumul := s.topFC()
s.failed = !cumul.addFC(*child, true)
}
fc := s.topFC()
if !fc.nullable {
s.skipAllChildren = true
}
break
case ntTestgroup | afterChild:
if CurIndex > 1 {
child := s.popFC()
cumul := s.topFC()
s.failed = !cumul.addFC(*child, false)
}
break
case ntAlternate | afterChild, ntTestref | afterChild:
if CurIndex != 0 {
child := s.popFC()
cumul := s.topFC()
s.failed = !cumul.addFC(*child, false)
}
break
case ntLoop | afterChild, ntLazyloop | afterChild:
if node.m == 0 {
fc := s.topFC()
fc.nullable = true
}
break
case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
break
case ntRequire | beforeChild, ntPrevent | beforeChild:
s.skipChild()
s.pushFC(regexFc{nullable: true})
break
case ntRequire | afterChild, ntPrevent | afterChild:
break
case ntOne, ntNotone:
s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
break
case ntOneloop, ntOnelazy:
s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
break
case ntNotoneloop, ntNotonelazy:
s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
break
case ntMulti:
if len(node.str) == 0 {
s.pushFC(regexFc{nullable: true})
} else if !rtl {
s.pushFC(newRegexFc(node.str[0], false, false, ci))
} else {
s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
}
break
case ntSet:
s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
break
case ntSetloop, ntSetlazy:
s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
break
case ntRef:
s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
break
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
s.pushFC(regexFc{nullable: true})
break
default:
panic(fmt.Sprintf("unexpected op code: %v", nt))
}
}
type regexFc struct {
cc CharSet
nullable bool
caseInsensitive bool
}
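// newRegexFc builds a first-char class for a single character: when not is set
// (a notone-style node) it adds the complement ranges '\x00'..ch-1 and
// ch+1..utf8.MaxRune; otherwise it adds just the range ch..ch.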
func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
r := regexFc{
caseInsensitive: caseInsensitive,
nullable: nullable,
}
if not {
if ch > 0 {
r.cc.addRange('\x00', ch-1)
}
if ch < 0xFFFF {
r.cc.addRange(ch+1, utf8.MaxRune)
}
} else {
r.cc.addRange(ch, ch)
}
return r
}
func (r *regexFc) getFirstChars() CharSet {
if r.caseInsensitive {
r.cc.addLowercase()
}
return r.cc
}
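// addFC merges the first-char information of fc into r and reports whether the
// merge was possible (it fails if either char class is not mergeable). For
// concatenation the child only matters while the prefix accumulated so far can
// still match empty (r.nullable); for alternation every branch is merged and the
// result stays nullable if any branch is. As a rough example, merging the two
// branches of a pattern like `foo|ba?r` would yield {'f', 'b'} with nullable false.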
func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
return false
}
if concatenate {
if !r.nullable {
return true
}
if !fc.nullable {
r.nullable = false
}
} else {
if fc.nullable {
r.nullable = true
}
}
r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
r.cc.addSet(fc.cc)
return true
}
// This is a related computation: it takes a RegexTree and computes the
// leading substring if it sees one. It's quite trivial and gives up easily.
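// For example, a pattern such as `https?://` would yield PrefixStr "http", while
// one that starts with an alternation, e.g. `foo|bar`, gives up and returns nil.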
func getPrefix(tree *RegexTree) *Prefix {
var concatNode *regexNode
nextChild := 0
curNode := tree.root
for {
switch curNode.t {
case ntConcatenate:
if len(curNode.children) > 0 {
concatNode = curNode
nextChild = 0
}
case ntGreedy, ntCapture:
curNode = curNode.children[0]
concatNode = nil
continue
case ntOneloop, ntOnelazy:
if curNode.m > 0 {
return &Prefix{
PrefixStr: repeat(curNode.ch, curNode.m),
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
}
}
return nil
case ntOne:
return &Prefix{
PrefixStr: []rune{curNode.ch},
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
}
case ntMulti:
return &Prefix{
PrefixStr: curNode.str,
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
}
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
default:
return nil
}
if concatNode == nil || nextChild >= len(concatNode.children) {
return nil
}
curNode = concatNode.children[nextChild]
nextChild++
}
}
// repeat returns the rune r repeated c times, capped at MaxPrefixSize.
func repeat(r rune, c int) []rune {
if c > MaxPrefixSize {
c = MaxPrefixSize
}
ret := make([]rune, c)
// binary growth using copy for speed
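// (each pass doubles the filled region: 1 -> 2 -> 4 -> ..., so only O(log c)
// copy calls are needed instead of writing every rune individually)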
ret[0] = r
bp := 1
for bp < len(ret) {
copy(ret[bp:], ret[:bp])
bp *= 2
}
return ret
}
// BmPrefix precomputes the Boyer-Moore
// tables for fast string scanning. These tables allow
// you to scan for the first occurrence of a string within
// a large body of text without examining every character.
// The performance of the heuristic depends on the actual
// string and the text being searched, but usually, the longer
// the string that is being searched for, the fewer characters
// need to be examined.
type BmPrefix struct {
positive []int
negativeASCII []int
negativeUnicode [][]int
pattern []rune
lowASCII rune
highASCII rune
rightToLeft bool
caseInsensitive bool
}
func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
b := &BmPrefix{
rightToLeft: rightToLeft,
caseInsensitive: caseInsensitive,
pattern: pattern,
}
if caseInsensitive {
for i := 0; i < len(b.pattern); i++ {
// We do the ToLower character by character for consistency. With surrogate chars, doing
// a ToLower on the entire string could actually change the surrogate pair. This is more correct
// linguistically, but since Regex doesn't support surrogates, it's more important to be
// consistent.
b.pattern[i] = unicode.ToLower(b.pattern[i])
}
}
var beforefirst, last, bump int
var scan, match int
if !rightToLeft {
beforefirst = -1
last = len(b.pattern) - 1
bump = 1
} else {
beforefirst = len(b.pattern)
last = 0
bump = -1
}
// PART I - the good-suffix shift table
//
// compute the positive requirement:
// if char "i" is the first one from the right that doesn't match,
// then we know the matcher can advance by _positive[i].
//
// This algorithm is a simplified variant of the standard
// Boyer-Moore good suffix calculation.
b.positive = make([]int, len(b.pattern))
examine := last
ch := b.pattern[examine]
b.positive[examine] = bump
examine -= bump
Outerloop:
for {
// find an internal char (examine) that matches the tail
for {
if examine == beforefirst {
break Outerloop
}
if b.pattern[examine] == ch {
break
}
examine -= bump
}
match = last
scan = examine
// find the length of the match
for {
if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
// at the end of the match, note the difference in _positive
// this is not the length of the match, but the distance from the internal match
// to the tail suffix.
if b.positive[match] == 0 {
b.positive[match] = match - scan
}
// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
break
}
scan -= bump
match -= bump
}
examine -= bump
}
match = last - bump
// scan for the chars for which there are no shifts that yield a different candidate
// The inside of the if statement used to say
// "_positive[match] = last - beforefirst;"
// This is slightly less aggressive in how much we skip, but at worst it
// should mean a little more work rather than skipping a potential match.
for match != beforefirst {
if b.positive[match] == 0 {
b.positive[match] = bump
}
match -= bump
}
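// At this point positive[i] holds how far the scan may advance when the first
// mismatch (counting from the tail of the pattern) occurs at pattern position i;
// positions never touched by the good-suffix pass fall back to the minimal
// shift of one position (bump).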
// PART II - the bad-character shift table
//
// compute the negative requirement:
// if char "ch" is the reject character when testing position "i",
// we can slide up by _negative[ch];
// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
//
// the lookup table is divided into ASCII and Unicode portions;
// only those parts of the Unicode 16-bit code set that actually
// appear in the string are in the table. (Maximum size with
// Unicode is 65K; ASCII only case is 512 bytes.)
b.negativeASCII = make([]int, 128)
for i := 0; i < len(b.negativeASCII); i++ {
b.negativeASCII[i] = last - beforefirst
}
b.lowASCII = 127
b.highASCII = 0
for examine = last; examine != beforefirst; examine -= bump {
ch = b.pattern[examine]
switch {
case ch < 128:
if b.lowASCII > ch {
b.lowASCII = ch
}
if b.highASCII < ch {
b.highASCII = ch
}
if b.negativeASCII[ch] == last-beforefirst {
b.negativeASCII[ch] = last - examine
}
case ch <= 0xffff:
i, j := ch>>8, ch&0xFF
if b.negativeUnicode == nil {
b.negativeUnicode = make([][]int, 256)
}
if b.negativeUnicode[i] == nil {
newarray := make([]int, 256)
for k := 0; k < len(newarray); k++ {
newarray[k] = last - beforefirst
}
if i == 0 {
copy(newarray, b.negativeASCII)
//TODO: this line needed?
b.negativeASCII = newarray
}
b.negativeUnicode[i] = newarray
}
if b.negativeUnicode[i][j] == last-beforefirst {
b.negativeUnicode[i][j] = last - examine
}
default:
// we can't do the filter because this algo doesn't support
// unicode chars >0xffff
return nil
}
}
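// As a rough worked example (illustrative only): for a left-to-right,
// case-sensitive pattern "abcab" the table built above ends up with
// negativeASCII['b'] = 0, negativeASCII['a'] = 1, negativeASCII['c'] = 2, and
// every other character keeps the default shift of len(pattern) = 5, matching
// the formula str.Length - 1 - str.LastIndexOf(ch) quoted above.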
return b
}
func (b *BmPrefix) String() string {
return string(b.pattern)
}
// Dump returns the contents of the filter as a human-readable string
func (b *BmPrefix) Dump(indent string) string {
buf := &bytes.Buffer{}
fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
for i := 0; i < len(b.positive); i++ {
buf.WriteString(strconv.Itoa(b.positive[i]))
buf.WriteRune(' ')
}
buf.WriteRune('\n')
if b.negativeASCII != nil {
buf.WriteString(indent)
buf.WriteString("Negative table\n")
for i := 0; i < len(b.negativeASCII); i++ {
if b.negativeASCII[i] != len(b.pattern) {
fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
}
}
}
return buf.String()
}
// Scan uses the Boyer-Moore algorithm to find the first occurrence
// of the specified string within text, beginning at index, and
// constrained within beglimit and endlimit.
//
// The direction and case-sensitivity of the match are determined
// by the arguments passed to newBmPrefix.
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
var (
defadv, test, test2 int
match, startmatch, endmatch int
bump, advance int
chTest rune
unicodeLookup []int
)
if !b.rightToLeft {
defadv = len(b.pattern)
startmatch = len(b.pattern) - 1
endmatch = 0
test = index + defadv - 1
bump = 1
} else {
defadv = -len(b.pattern)
startmatch = 0
endmatch = -defadv - 1
test = index + defadv
bump = -1
}
chMatch := b.pattern[startmatch]
for {
if test >= endlimit || test < beglimit {
return -1
}
chTest = text[test]
if b.caseInsensitive {
chTest = unicode.ToLower(chTest)
}
if chTest != chMatch {
if chTest < 128 {
advance = b.negativeASCII[chTest]
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
unicodeLookup = b.negativeUnicode[chTest>>8]
if len(unicodeLookup) > 0 {
advance = unicodeLookup[chTest&0xFF]
} else {
advance = defadv
}
} else {
advance = defadv
}
test += advance
} else { // if (chTest == chMatch)
test2 = test
match = startmatch
for {
if match == endmatch {
if b.rightToLeft {
return test2 + 1
} else {
return test2
}
}
match -= bump
test2 -= bump
chTest = text[test2]
if b.caseInsensitive {
chTest = unicode.ToLower(chTest)
}
if chTest != b.pattern[match] {
advance = b.positive[match]
if (chTest & 0xFF80) == 0 {
test2 = (match - startmatch) + b.negativeASCII[chTest]
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
unicodeLookup = b.negativeUnicode[chTest>>8]
if len(unicodeLookup) > 0 {
test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
} else {
test += advance
break
}
} else {
test += advance
break
}
if b.rightToLeft {
if test2 < advance {
advance = test2
}
} else if test2 > advance {
advance = test2
}
test += advance
break
}
}
}
}
}
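// A minimal usage sketch (illustrative, assuming a left-to-right, case-sensitive
// prefix):
//
//	bm := newBmPrefix([]rune("needle"), false, false)
//	text := []rune("a haystack with a needle in it")
//	if pos := bm.Scan(text, 0, 0, len(text)); pos >= 0 {
//		// pos is the index where "needle" first occurs in text
//	}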
// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
if !b.rightToLeft {
if index < beglimit || endlimit-index < len(b.pattern) {
return false
}
return b.matchPattern(text, index)
} else {
if index > endlimit || index-beglimit < len(b.pattern) {
return false
}
return b.matchPattern(text, index-len(b.pattern))
}
}
func (b *BmPrefix) matchPattern(text []rune, index int) bool {
if len(text)-index < len(b.pattern) {
return false
}
if b.caseInsensitive {
for i := 0; i < len(b.pattern); i++ {
//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
if unicode.ToLower(text[index+i]) != b.pattern[i] {
return false
}
}
return true
} else {
for i := 0; i < len(b.pattern); i++ {
if text[index+i] != b.pattern[i] {
return false
}
}
return true
}
}
type AnchorLoc int16
// where the regex can be pegged
const (
AnchorBeginning AnchorLoc = 0x0001
AnchorBol = 0x0002
AnchorStart = 0x0004
AnchorEol = 0x0008
AnchorEndZ = 0x0010
AnchorEnd = 0x0020
AnchorBoundary = 0x0040
AnchorECMABoundary = 0x0080
)
func getAnchors(tree *RegexTree) AnchorLoc {
var concatNode *regexNode
nextChild, result := 0, AnchorLoc(0)
curNode := tree.root
for {
switch curNode.t {
case ntConcatenate:
if len(curNode.children) > 0 {
concatNode = curNode
nextChild = 0
}
case ntGreedy, ntCapture:
curNode = curNode.children[0]
concatNode = nil
continue
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
ntStart, ntEndZ, ntEnd:
return result | anchorFromType(curNode.t)
case ntEmpty, ntRequire, ntPrevent:
default:
return result
}
if concatNode == nil || nextChild >= len(concatNode.children) {
return result
}
curNode = concatNode.children[nextChild]
nextChild++
}
}
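// For illustration: getAnchors only inspects the leading nodes of the tree and
// stops at the first anchor node it reaches, so a pattern whose tree starts with
// a Beginning node (e.g. one written with a leading \A) would report
// AnchorBeginning, one starting with a Boundary node (leading \b) would report
// AnchorBoundary, and a pattern that begins with ordinary characters reports
// no anchors at all.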
func anchorFromType(t nodeType) AnchorLoc {
switch t {
case ntBol:
return AnchorBol
case ntEol:
return AnchorEol
case ntBoundary:
return AnchorBoundary
case ntECMABoundary:
return AnchorECMABoundary
case ntBeginning:
return AnchorBeginning
case ntStart:
return AnchorStart
case ntEndZ:
return AnchorEndZ
case ntEnd:
return AnchorEnd
default:
return 0
}
}
// String returns a human-readable description of the anchors
func (anchors AnchorLoc) String() string {
buf := &bytes.Buffer{}
if 0 != (anchors & AnchorBeginning) {
buf.WriteString(", Beginning")
}
if 0 != (anchors & AnchorStart) {
buf.WriteString(", Start")
}
if 0 != (anchors & AnchorBol) {
buf.WriteString(", Bol")
}
if 0 != (anchors & AnchorBoundary) {
buf.WriteString(", Boundary")
}
if 0 != (anchors & AnchorECMABoundary) {
buf.WriteString(", ECMABoundary")
}
if 0 != (anchors & AnchorEol) {
buf.WriteString(", Eol")
}
if 0 != (anchors & AnchorEnd) {
buf.WriteString(", End")
}
if 0 != (anchors & AnchorEndZ) {
buf.WriteString(", EndZ")
}
// trim off comma
if buf.Len() >= 2 {
return buf.String()[2:]
}
return "None"
}