Stanislav N. aka pztrn
48d43ca097
Pagination now works. Temporary hardcoded 10 pastes per page, will be put in configuration later. Maybe. From now user will receive readable error message if error occured. Started to work on syntax highlighting, tried to make lexers detection work but apparently to no avail.
655 lines
16 KiB
Go
655 lines
16 KiB
Go
package syntax
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"math"
|
|
"strconv"
|
|
)
|
|
|
|
type RegexTree struct {
|
|
root *regexNode
|
|
caps map[int]int
|
|
capnumlist []int
|
|
captop int
|
|
Capnames map[string]int
|
|
Caplist []string
|
|
options RegexOptions
|
|
}
|
|
|
|
// It is built into a parsed tree for a regular expression.
|
|
|
|
// Implementation notes:
|
|
//
|
|
// Since the node tree is a temporary data structure only used
|
|
// during compilation of the regexp to integer codes, it's
|
|
// designed for clarity and convenience rather than
|
|
// space efficiency.
|
|
//
|
|
// RegexNodes are built into a tree, linked by the n.children list.
|
|
// Each node also has a n.parent and n.ichild member indicating
|
|
// its parent and which child # it is in its parent's list.
|
|
//
|
|
// RegexNodes come in as many types as there are constructs in
|
|
// a regular expression, for example, "concatenate", "alternate",
|
|
// "one", "rept", "group". There are also node types for basic
|
|
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
|
|
//
|
|
// Because perl 5 allows "lookback" groups that scan backwards,
|
|
// each node also gets a "direction". Normally the value of
|
|
// boolean n.backward = false.
|
|
//
|
|
// During parsing, top-level nodes are also stacked onto a parse
|
|
// stack (a stack of trees). For this purpose we have a n.next
|
|
// pointer. [Note that to save a few bytes, we could overload the
|
|
// n.parent pointer instead.]
|
|
//
|
|
// On the parse stack, each tree has a "role" - basically, the
|
|
// nonterminal in the grammar that the parser has currently
|
|
// assigned to the tree. That code is stored in n.role.
|
|
//
|
|
// Finally, some of the different kinds of nodes have data.
|
|
// Two integers (for the looping constructs) are stored in
|
|
// n.operands, an an object (either a string or a set)
|
|
// is stored in n.data
|
|
type regexNode struct {
|
|
t nodeType
|
|
children []*regexNode
|
|
str []rune
|
|
set *CharSet
|
|
ch rune
|
|
m int
|
|
n int
|
|
options RegexOptions
|
|
next *regexNode
|
|
}
|
|
|
|
type nodeType int32
|
|
|
|
const (
|
|
// The following are leaves, and correspond to primitive operations
|
|
|
|
ntOnerep nodeType = 0 // lef,back char,min,max a {n}
|
|
ntNotonerep = 1 // lef,back char,min,max .{n}
|
|
ntSetrep = 2 // lef,back set,min,max [\d]{n}
|
|
ntOneloop = 3 // lef,back char,min,max a {,n}
|
|
ntNotoneloop = 4 // lef,back char,min,max .{,n}
|
|
ntSetloop = 5 // lef,back set,min,max [\d]{,n}
|
|
ntOnelazy = 6 // lef,back char,min,max a {,n}?
|
|
ntNotonelazy = 7 // lef,back char,min,max .{,n}?
|
|
ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
|
|
ntOne = 9 // lef char a
|
|
ntNotone = 10 // lef char [^a]
|
|
ntSet = 11 // lef set [a-z\s] \w \s \d
|
|
ntMulti = 12 // lef string abcd
|
|
ntRef = 13 // lef group \#
|
|
ntBol = 14 // ^
|
|
ntEol = 15 // $
|
|
ntBoundary = 16 // \b
|
|
ntNonboundary = 17 // \B
|
|
ntBeginning = 18 // \A
|
|
ntStart = 19 // \G
|
|
ntEndZ = 20 // \Z
|
|
ntEnd = 21 // \Z
|
|
|
|
// Interior nodes do not correspond to primitive operations, but
|
|
// control structures compositing other operations
|
|
|
|
// Concat and alternate take n children, and can run forward or backwards
|
|
|
|
ntNothing = 22 // []
|
|
ntEmpty = 23 // ()
|
|
ntAlternate = 24 // a|b
|
|
ntConcatenate = 25 // ab
|
|
ntLoop = 26 // m,x * + ? {,}
|
|
ntLazyloop = 27 // m,x *? +? ?? {,}?
|
|
ntCapture = 28 // n ()
|
|
ntGroup = 29 // (?:)
|
|
ntRequire = 30 // (?=) (?<=)
|
|
ntPrevent = 31 // (?!) (?<!)
|
|
ntGreedy = 32 // (?>) (?<)
|
|
ntTestref = 33 // (?(n) | )
|
|
ntTestgroup = 34 // (?(...) | )
|
|
|
|
ntECMABoundary = 41 // \b
|
|
ntNonECMABoundary = 42 // \B
|
|
)
|
|
|
|
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
ch: ch,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
str: str,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
set: set,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
m: m,
|
|
}
|
|
}
|
|
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
m: m,
|
|
n: n,
|
|
}
|
|
}
|
|
|
|
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
|
|
for i := 0; i < len(n.str); i++ {
|
|
buf.WriteRune(n.str[i])
|
|
}
|
|
}
|
|
|
|
func (n *regexNode) addChild(child *regexNode) {
|
|
reduced := child.reduce()
|
|
n.children = append(n.children, reduced)
|
|
reduced.next = n
|
|
}
|
|
|
|
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
|
|
newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
|
|
n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
|
|
}
|
|
|
|
// removes children including the start but not the end index
|
|
func (n *regexNode) removeChildren(startIndex, endIndex int) {
|
|
n.children = append(n.children[:startIndex], n.children[endIndex:]...)
|
|
}
|
|
|
|
// Pass type as OneLazy or OneLoop
|
|
func (n *regexNode) makeRep(t nodeType, min, max int) {
|
|
n.t += (t - ntOne)
|
|
n.m = min
|
|
n.n = max
|
|
}
|
|
|
|
func (n *regexNode) reduce() *regexNode {
|
|
switch n.t {
|
|
case ntAlternate:
|
|
return n.reduceAlternation()
|
|
|
|
case ntConcatenate:
|
|
return n.reduceConcatenation()
|
|
|
|
case ntLoop, ntLazyloop:
|
|
return n.reduceRep()
|
|
|
|
case ntGroup:
|
|
return n.reduceGroup()
|
|
|
|
case ntSet, ntSetloop:
|
|
return n.reduceSet()
|
|
|
|
default:
|
|
return n
|
|
}
|
|
}
|
|
|
|
// Basic optimization. Single-letter alternations can be replaced
|
|
// by faster set specifications, and nested alternations with no
|
|
// intervening operators can be flattened:
|
|
//
|
|
// a|b|c|def|g|h -> [a-c]|def|[gh]
|
|
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
|
|
func (n *regexNode) reduceAlternation() *regexNode {
|
|
if len(n.children) == 0 {
|
|
return newRegexNode(ntNothing, n.options)
|
|
}
|
|
|
|
wasLastSet := false
|
|
lastNodeCannotMerge := false
|
|
var optionsLast RegexOptions
|
|
var i, j int
|
|
|
|
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
|
|
at := n.children[i]
|
|
|
|
if j < i {
|
|
n.children[j] = at
|
|
}
|
|
|
|
for {
|
|
if at.t == ntAlternate {
|
|
for k := 0; k < len(at.children); k++ {
|
|
at.children[k].next = n
|
|
}
|
|
n.insertChildren(i+1, at.children)
|
|
|
|
j--
|
|
} else if at.t == ntSet || at.t == ntOne {
|
|
// Cannot merge sets if L or I options differ, or if either are negated.
|
|
optionsAt := at.options & (RightToLeft | IgnoreCase)
|
|
|
|
if at.t == ntSet {
|
|
if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
|
|
wasLastSet = true
|
|
lastNodeCannotMerge = !at.set.IsMergeable()
|
|
optionsLast = optionsAt
|
|
break
|
|
}
|
|
} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
|
|
wasLastSet = true
|
|
lastNodeCannotMerge = false
|
|
optionsLast = optionsAt
|
|
break
|
|
}
|
|
|
|
// The last node was a Set or a One, we're a Set or One and our options are the same.
|
|
// Merge the two nodes.
|
|
j--
|
|
prev := n.children[j]
|
|
|
|
var prevCharClass *CharSet
|
|
if prev.t == ntOne {
|
|
prevCharClass = &CharSet{}
|
|
prevCharClass.addChar(prev.ch)
|
|
} else {
|
|
prevCharClass = prev.set
|
|
}
|
|
|
|
if at.t == ntOne {
|
|
prevCharClass.addChar(at.ch)
|
|
} else {
|
|
prevCharClass.addSet(*at.set)
|
|
}
|
|
|
|
prev.t = ntSet
|
|
prev.set = prevCharClass
|
|
} else if at.t == ntNothing {
|
|
j--
|
|
} else {
|
|
wasLastSet = false
|
|
lastNodeCannotMerge = false
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
if j < i {
|
|
n.removeChildren(j, i)
|
|
}
|
|
|
|
return n.stripEnation(ntNothing)
|
|
}
|
|
|
|
// Basic optimization. Adjacent strings can be concatenated.
|
|
//
|
|
// (?:abc)(?:def) -> abcdef
|
|
func (n *regexNode) reduceConcatenation() *regexNode {
|
|
// Eliminate empties and concat adjacent strings/chars
|
|
|
|
var optionsLast RegexOptions
|
|
var optionsAt RegexOptions
|
|
var i, j int
|
|
|
|
if len(n.children) == 0 {
|
|
return newRegexNode(ntEmpty, n.options)
|
|
}
|
|
|
|
wasLastString := false
|
|
|
|
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
|
|
var at, prev *regexNode
|
|
|
|
at = n.children[i]
|
|
|
|
if j < i {
|
|
n.children[j] = at
|
|
}
|
|
|
|
if at.t == ntConcatenate &&
|
|
((at.options & RightToLeft) == (n.options & RightToLeft)) {
|
|
for k := 0; k < len(at.children); k++ {
|
|
at.children[k].next = n
|
|
}
|
|
|
|
//insert at.children at i+1 index in n.children
|
|
n.insertChildren(i+1, at.children)
|
|
|
|
j--
|
|
} else if at.t == ntMulti || at.t == ntOne {
|
|
// Cannot merge strings if L or I options differ
|
|
optionsAt = at.options & (RightToLeft | IgnoreCase)
|
|
|
|
if !wasLastString || optionsLast != optionsAt {
|
|
wasLastString = true
|
|
optionsLast = optionsAt
|
|
continue
|
|
}
|
|
|
|
j--
|
|
prev = n.children[j]
|
|
|
|
if prev.t == ntOne {
|
|
prev.t = ntMulti
|
|
prev.str = []rune{prev.ch}
|
|
}
|
|
|
|
if (optionsAt & RightToLeft) == 0 {
|
|
if at.t == ntOne {
|
|
prev.str = append(prev.str, at.ch)
|
|
} else {
|
|
prev.str = append(prev.str, at.str...)
|
|
}
|
|
} else {
|
|
if at.t == ntOne {
|
|
// insert at the front by expanding our slice, copying the data over, and then setting the value
|
|
prev.str = append(prev.str, 0)
|
|
copy(prev.str[1:], prev.str)
|
|
prev.str[0] = at.ch
|
|
} else {
|
|
//insert at the front...this one we'll make a new slice and copy both into it
|
|
merge := make([]rune, len(prev.str)+len(at.str))
|
|
copy(merge, at.str)
|
|
copy(merge[len(at.str):], prev.str)
|
|
prev.str = merge
|
|
}
|
|
}
|
|
} else if at.t == ntEmpty {
|
|
j--
|
|
} else {
|
|
wasLastString = false
|
|
}
|
|
}
|
|
|
|
if j < i {
|
|
// remove indices j through i from the children
|
|
n.removeChildren(j, i)
|
|
}
|
|
|
|
return n.stripEnation(ntEmpty)
|
|
}
|
|
|
|
// Nested repeaters just get multiplied with each other if they're not
|
|
// too lumpy
|
|
func (n *regexNode) reduceRep() *regexNode {
|
|
|
|
u := n
|
|
t := n.t
|
|
min := n.m
|
|
max := n.n
|
|
|
|
for {
|
|
if len(u.children) == 0 {
|
|
break
|
|
}
|
|
|
|
child := u.children[0]
|
|
|
|
// multiply reps of the same type only
|
|
if child.t != t {
|
|
childType := child.t
|
|
|
|
if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
|
|
childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
|
|
break
|
|
}
|
|
}
|
|
|
|
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
|
|
// [but things like (a {2,})+ are not too lumpy...]
|
|
if u.m == 0 && child.m > 1 || child.n < child.m*2 {
|
|
break
|
|
}
|
|
|
|
u = child
|
|
if u.m > 0 {
|
|
if (math.MaxInt32-1)/u.m < min {
|
|
u.m = math.MaxInt32
|
|
} else {
|
|
u.m = u.m * min
|
|
}
|
|
}
|
|
if u.n > 0 {
|
|
if (math.MaxInt32-1)/u.n < max {
|
|
u.n = math.MaxInt32
|
|
} else {
|
|
u.n = u.n * max
|
|
}
|
|
}
|
|
}
|
|
|
|
if math.MaxInt32 == min {
|
|
return newRegexNode(ntNothing, n.options)
|
|
}
|
|
return u
|
|
|
|
}
|
|
|
|
// Simple optimization. If a concatenation or alternation has only
|
|
// one child strip out the intermediate node. If it has zero children,
|
|
// turn it into an empty.
|
|
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
|
|
switch len(n.children) {
|
|
case 0:
|
|
return newRegexNode(emptyType, n.options)
|
|
case 1:
|
|
return n.children[0]
|
|
default:
|
|
return n
|
|
}
|
|
}
|
|
|
|
func (n *regexNode) reduceGroup() *regexNode {
|
|
u := n
|
|
|
|
for u.t == ntGroup {
|
|
u = u.children[0]
|
|
}
|
|
|
|
return u
|
|
}
|
|
|
|
// Simple optimization. If a set is a singleton, an inverse singleton,
|
|
// or empty, it's transformed accordingly.
|
|
func (n *regexNode) reduceSet() *regexNode {
|
|
// Extract empty-set, one and not-one case as special
|
|
|
|
if n.set == nil {
|
|
n.t = ntNothing
|
|
} else if n.set.IsSingleton() {
|
|
n.ch = n.set.SingletonChar()
|
|
n.set = nil
|
|
n.t += (ntOne - ntSet)
|
|
} else if n.set.IsSingletonInverse() {
|
|
n.ch = n.set.SingletonChar()
|
|
n.set = nil
|
|
n.t += (ntNotone - ntSet)
|
|
}
|
|
|
|
return n
|
|
}
|
|
|
|
func (n *regexNode) reverseLeft() *regexNode {
|
|
if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
|
|
//reverse children order
|
|
for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
|
|
n.children[left], n.children[right] = n.children[right], n.children[left]
|
|
}
|
|
}
|
|
|
|
return n
|
|
}
|
|
|
|
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
|
|
if min == 0 && max == 0 {
|
|
return newRegexNode(ntEmpty, n.options)
|
|
}
|
|
|
|
if min == 1 && max == 1 {
|
|
return n
|
|
}
|
|
|
|
switch n.t {
|
|
case ntOne, ntNotone, ntSet:
|
|
if lazy {
|
|
n.makeRep(Onelazy, min, max)
|
|
} else {
|
|
n.makeRep(Oneloop, min, max)
|
|
}
|
|
return n
|
|
|
|
default:
|
|
var t nodeType
|
|
if lazy {
|
|
t = ntLazyloop
|
|
} else {
|
|
t = ntLoop
|
|
}
|
|
result := newRegexNodeMN(t, n.options, min, max)
|
|
result.addChild(n)
|
|
return result
|
|
}
|
|
}
|
|
|
|
// debug functions
|
|
|
|
var typeStr = []string{
|
|
"Onerep", "Notonerep", "Setrep",
|
|
"Oneloop", "Notoneloop", "Setloop",
|
|
"Onelazy", "Notonelazy", "Setlazy",
|
|
"One", "Notone", "Set",
|
|
"Multi", "Ref",
|
|
"Bol", "Eol", "Boundary", "Nonboundary",
|
|
"Beginning", "Start", "EndZ", "End",
|
|
"Nothing", "Empty",
|
|
"Alternate", "Concatenate",
|
|
"Loop", "Lazyloop",
|
|
"Capture", "Group", "Require", "Prevent", "Greedy",
|
|
"Testref", "Testgroup",
|
|
"Unknown", "Unknown", "Unknown",
|
|
"Unknown", "Unknown", "Unknown",
|
|
"ECMABoundary", "NonECMABoundary",
|
|
}
|
|
|
|
func (n *regexNode) description() string {
|
|
buf := &bytes.Buffer{}
|
|
|
|
buf.WriteString(typeStr[n.t])
|
|
|
|
if (n.options & ExplicitCapture) != 0 {
|
|
buf.WriteString("-C")
|
|
}
|
|
if (n.options & IgnoreCase) != 0 {
|
|
buf.WriteString("-I")
|
|
}
|
|
if (n.options & RightToLeft) != 0 {
|
|
buf.WriteString("-L")
|
|
}
|
|
if (n.options & Multiline) != 0 {
|
|
buf.WriteString("-M")
|
|
}
|
|
if (n.options & Singleline) != 0 {
|
|
buf.WriteString("-S")
|
|
}
|
|
if (n.options & IgnorePatternWhitespace) != 0 {
|
|
buf.WriteString("-X")
|
|
}
|
|
if (n.options & ECMAScript) != 0 {
|
|
buf.WriteString("-E")
|
|
}
|
|
|
|
switch n.t {
|
|
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
|
|
buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
|
|
break
|
|
case ntCapture:
|
|
buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
|
|
break
|
|
case ntRef, ntTestref:
|
|
buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
|
|
break
|
|
case ntMulti:
|
|
fmt.Fprintf(buf, "(String = %s)", string(n.str))
|
|
break
|
|
case ntSet, ntSetloop, ntSetlazy:
|
|
buf.WriteString("(Set = " + n.set.String() + ")")
|
|
break
|
|
}
|
|
|
|
switch n.t {
|
|
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
|
|
buf.WriteString("(Min = ")
|
|
buf.WriteString(strconv.Itoa(n.m))
|
|
buf.WriteString(", Max = ")
|
|
if n.n == math.MaxInt32 {
|
|
buf.WriteString("inf")
|
|
} else {
|
|
buf.WriteString(strconv.Itoa(n.n))
|
|
}
|
|
buf.WriteString(")")
|
|
|
|
break
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
var padSpace = []byte(" ")
|
|
|
|
func (t *RegexTree) Dump() string {
|
|
return t.root.dump()
|
|
}
|
|
|
|
func (n *regexNode) dump() string {
|
|
var stack []int
|
|
CurNode := n
|
|
CurChild := 0
|
|
|
|
buf := bytes.NewBufferString(CurNode.description())
|
|
buf.WriteRune('\n')
|
|
|
|
for {
|
|
if CurNode.children != nil && CurChild < len(CurNode.children) {
|
|
stack = append(stack, CurChild+1)
|
|
CurNode = CurNode.children[CurChild]
|
|
CurChild = 0
|
|
|
|
Depth := len(stack)
|
|
if Depth > 32 {
|
|
Depth = 32
|
|
}
|
|
buf.Write(padSpace[:Depth])
|
|
buf.WriteString(CurNode.description())
|
|
buf.WriteRune('\n')
|
|
} else {
|
|
if len(stack) == 0 {
|
|
break
|
|
}
|
|
|
|
CurChild = stack[len(stack)-1]
|
|
stack = stack[:len(stack)-1]
|
|
CurNode = CurNode.next
|
|
}
|
|
}
|
|
return buf.String()
|
|
}
|