Stanislav N. aka pztrn
48d43ca097
Pagination now works. Temporary hardcoded 10 pastes per page, will be put in configuration later. Maybe. From now user will receive readable error message if error occured. Started to work on syntax highlighting, tried to make lexers detection work but apparently to no avail.
501 lines
11 KiB
Go
501 lines
11 KiB
Go
package syntax
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
)
|
|
|
|
func Write(tree *RegexTree) (*Code, error) {
|
|
w := writer{
|
|
intStack: make([]int, 0, 32),
|
|
emitted: make([]int, 2),
|
|
stringhash: make(map[string]int),
|
|
sethash: make(map[string]int),
|
|
}
|
|
|
|
code, err := w.codeFromTree(tree)
|
|
|
|
if tree.options&Debug > 0 && code != nil {
|
|
os.Stdout.WriteString(code.Dump())
|
|
os.Stdout.WriteString("\n")
|
|
}
|
|
|
|
return code, err
|
|
}
|
|
|
|
type writer struct {
|
|
emitted []int
|
|
|
|
intStack []int
|
|
curpos int
|
|
stringhash map[string]int
|
|
stringtable [][]rune
|
|
sethash map[string]int
|
|
settable []*CharSet
|
|
counting bool
|
|
count int
|
|
trackcount int
|
|
caps map[int]int
|
|
}
|
|
|
|
const (
|
|
beforeChild nodeType = 64
|
|
afterChild = 128
|
|
//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
|
|
MaxPrefixSize = 50
|
|
)
|
|
|
|
// The top level RegexCode generator. It does a depth-first walk
|
|
// through the tree and calls EmitFragment to emits code before
|
|
// and after each child of an interior node, and at each leaf.
|
|
//
|
|
// It runs two passes, first to count the size of the generated
|
|
// code, and second to generate the code.
|
|
//
|
|
// We should time it against the alternative, which is
|
|
// to just generate the code and grow the array as we go.
|
|
func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
|
|
var (
|
|
curNode *regexNode
|
|
curChild int
|
|
capsize int
|
|
)
|
|
// construct sparse capnum mapping if some numbers are unused
|
|
|
|
if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
|
|
capsize = tree.captop
|
|
w.caps = nil
|
|
} else {
|
|
capsize = len(tree.capnumlist)
|
|
w.caps = tree.caps
|
|
for i := 0; i < len(tree.capnumlist); i++ {
|
|
w.caps[tree.capnumlist[i]] = i
|
|
}
|
|
}
|
|
|
|
w.counting = true
|
|
|
|
for {
|
|
if !w.counting {
|
|
w.emitted = make([]int, w.count)
|
|
}
|
|
|
|
curNode = tree.root
|
|
curChild = 0
|
|
|
|
w.emit1(Lazybranch, 0)
|
|
|
|
for {
|
|
if len(curNode.children) == 0 {
|
|
w.emitFragment(curNode.t, curNode, 0)
|
|
} else if curChild < len(curNode.children) {
|
|
w.emitFragment(curNode.t|beforeChild, curNode, curChild)
|
|
|
|
curNode = curNode.children[curChild]
|
|
|
|
w.pushInt(curChild)
|
|
curChild = 0
|
|
continue
|
|
}
|
|
|
|
if w.emptyStack() {
|
|
break
|
|
}
|
|
|
|
curChild = w.popInt()
|
|
curNode = curNode.next
|
|
|
|
w.emitFragment(curNode.t|afterChild, curNode, curChild)
|
|
curChild++
|
|
}
|
|
|
|
w.patchJump(0, w.curPos())
|
|
w.emit(Stop)
|
|
|
|
if !w.counting {
|
|
break
|
|
}
|
|
|
|
w.counting = false
|
|
}
|
|
|
|
fcPrefix := getFirstCharsPrefix(tree)
|
|
prefix := getPrefix(tree)
|
|
rtl := (tree.options & RightToLeft) != 0
|
|
|
|
var bmPrefix *BmPrefix
|
|
//TODO: benchmark string prefixes
|
|
if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
|
|
if len(prefix.PrefixStr) > MaxPrefixSize {
|
|
// limit prefix changes to 10k
|
|
prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
|
|
}
|
|
bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
|
|
} else {
|
|
bmPrefix = nil
|
|
}
|
|
|
|
return &Code{
|
|
Codes: w.emitted,
|
|
Strings: w.stringtable,
|
|
Sets: w.settable,
|
|
TrackCount: w.trackcount,
|
|
Caps: w.caps,
|
|
Capsize: capsize,
|
|
FcPrefix: fcPrefix,
|
|
BmPrefix: bmPrefix,
|
|
Anchors: getAnchors(tree),
|
|
RightToLeft: rtl,
|
|
}, nil
|
|
}
|
|
|
|
// The main RegexCode generator. It does a depth-first walk
|
|
// through the tree and calls EmitFragment to emits code before
|
|
// and after each child of an interior node, and at each leaf.
|
|
func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
|
|
bits := InstOp(0)
|
|
|
|
if nodetype <= ntRef {
|
|
if (node.options & RightToLeft) != 0 {
|
|
bits |= Rtl
|
|
}
|
|
if (node.options & IgnoreCase) != 0 {
|
|
bits |= Ci
|
|
}
|
|
}
|
|
ntBits := nodeType(bits)
|
|
|
|
switch nodetype {
|
|
case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
|
|
break
|
|
|
|
case ntAlternate | beforeChild:
|
|
if curIndex < len(node.children)-1 {
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
}
|
|
|
|
case ntAlternate | afterChild:
|
|
if curIndex < len(node.children)-1 {
|
|
lbPos := w.popInt()
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
w.patchJump(lbPos, w.curPos())
|
|
} else {
|
|
for i := 0; i < curIndex; i++ {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
}
|
|
break
|
|
|
|
case ntTestref | beforeChild:
|
|
if curIndex == 0 {
|
|
w.emit(Setjump)
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
w.emit1(Testref, w.mapCapnum(node.m))
|
|
w.emit(Forejump)
|
|
}
|
|
|
|
case ntTestref | afterChild:
|
|
if curIndex == 0 {
|
|
branchpos := w.popInt()
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
w.patchJump(branchpos, w.curPos())
|
|
w.emit(Forejump)
|
|
if len(node.children) <= 1 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
} else if curIndex == 1 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
|
|
case ntTestgroup | beforeChild:
|
|
if curIndex == 0 {
|
|
w.emit(Setjump)
|
|
w.emit(Setmark)
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
}
|
|
|
|
case ntTestgroup | afterChild:
|
|
if curIndex == 0 {
|
|
w.emit(Getmark)
|
|
w.emit(Forejump)
|
|
} else if curIndex == 1 {
|
|
Branchpos := w.popInt()
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
w.patchJump(Branchpos, w.curPos())
|
|
w.emit(Getmark)
|
|
w.emit(Forejump)
|
|
if len(node.children) <= 2 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
} else if curIndex == 2 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
|
|
case ntLoop | beforeChild, ntLazyloop | beforeChild:
|
|
|
|
if node.n < math.MaxInt32 || node.m > 1 {
|
|
if node.m == 0 {
|
|
w.emit1(Nullcount, 0)
|
|
} else {
|
|
w.emit1(Setcount, 1-node.m)
|
|
}
|
|
} else if node.m == 0 {
|
|
w.emit(Nullmark)
|
|
} else {
|
|
w.emit(Setmark)
|
|
}
|
|
|
|
if node.m == 0 {
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
}
|
|
w.pushInt(w.curPos())
|
|
|
|
case ntLoop | afterChild, ntLazyloop | afterChild:
|
|
|
|
startJumpPos := w.curPos()
|
|
lazy := (nodetype - (ntLoop | afterChild))
|
|
|
|
if node.n < math.MaxInt32 || node.m > 1 {
|
|
if node.n == math.MaxInt32 {
|
|
w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
|
|
} else {
|
|
w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
|
|
}
|
|
} else {
|
|
w.emit1(InstOp(Branchmark+lazy), w.popInt())
|
|
}
|
|
|
|
if node.m == 0 {
|
|
w.patchJump(w.popInt(), startJumpPos)
|
|
}
|
|
|
|
case ntGroup | beforeChild, ntGroup | afterChild:
|
|
|
|
case ntCapture | beforeChild:
|
|
w.emit(Setmark)
|
|
|
|
case ntCapture | afterChild:
|
|
w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
|
|
|
|
case ntRequire | beforeChild:
|
|
// NOTE: the following line causes lookahead/lookbehind to be
|
|
// NON-BACKTRACKING. It can be commented out with (*)
|
|
w.emit(Setjump)
|
|
|
|
w.emit(Setmark)
|
|
|
|
case ntRequire | afterChild:
|
|
w.emit(Getmark)
|
|
|
|
// NOTE: the following line causes lookahead/lookbehind to be
|
|
// NON-BACKTRACKING. It can be commented out with (*)
|
|
w.emit(Forejump)
|
|
|
|
case ntPrevent | beforeChild:
|
|
w.emit(Setjump)
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
|
|
case ntPrevent | afterChild:
|
|
w.emit(Backjump)
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
w.emit(Forejump)
|
|
|
|
case ntGreedy | beforeChild:
|
|
w.emit(Setjump)
|
|
|
|
case ntGreedy | afterChild:
|
|
w.emit(Forejump)
|
|
|
|
case ntOne, ntNotone:
|
|
w.emit1(InstOp(node.t|ntBits), int(node.ch))
|
|
|
|
case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
|
|
if node.m > 0 {
|
|
if node.t == ntOneloop || node.t == ntOnelazy {
|
|
w.emit2(Onerep|bits, int(node.ch), node.m)
|
|
} else {
|
|
w.emit2(Notonerep|bits, int(node.ch), node.m)
|
|
}
|
|
}
|
|
if node.n > node.m {
|
|
if node.n == math.MaxInt32 {
|
|
w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
|
|
} else {
|
|
w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
|
|
}
|
|
}
|
|
|
|
case ntSetloop, ntSetlazy:
|
|
if node.m > 0 {
|
|
w.emit2(Setrep|bits, w.setCode(node.set), node.m)
|
|
}
|
|
if node.n > node.m {
|
|
if node.n == math.MaxInt32 {
|
|
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
|
|
} else {
|
|
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
|
|
}
|
|
}
|
|
|
|
case ntMulti:
|
|
w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
|
|
|
|
case ntSet:
|
|
w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
|
|
|
|
case ntRef:
|
|
w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
|
|
|
|
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
|
w.emit(InstOp(node.t))
|
|
|
|
default:
|
|
return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// To avoid recursion, we use a simple integer stack.
|
|
// This is the push.
|
|
func (w *writer) pushInt(i int) {
|
|
w.intStack = append(w.intStack, i)
|
|
}
|
|
|
|
// Returns true if the stack is empty.
|
|
func (w *writer) emptyStack() bool {
|
|
return len(w.intStack) == 0
|
|
}
|
|
|
|
// This is the pop.
|
|
func (w *writer) popInt() int {
|
|
//get our item
|
|
idx := len(w.intStack) - 1
|
|
i := w.intStack[idx]
|
|
//trim our slice
|
|
w.intStack = w.intStack[:idx]
|
|
return i
|
|
}
|
|
|
|
// Returns the current position in the emitted code.
|
|
func (w *writer) curPos() int {
|
|
return w.curpos
|
|
}
|
|
|
|
// Fixes up a jump instruction at the specified offset
|
|
// so that it jumps to the specified jumpDest.
|
|
func (w *writer) patchJump(offset, jumpDest int) {
|
|
w.emitted[offset+1] = jumpDest
|
|
}
|
|
|
|
// Returns an index in the set table for a charset
|
|
// uses a map to eliminate duplicates.
|
|
func (w *writer) setCode(set *CharSet) int {
|
|
if w.counting {
|
|
return 0
|
|
}
|
|
|
|
buf := &bytes.Buffer{}
|
|
|
|
set.mapHashFill(buf)
|
|
hash := buf.String()
|
|
i, ok := w.sethash[hash]
|
|
if !ok {
|
|
i = len(w.sethash)
|
|
w.sethash[hash] = i
|
|
w.settable = append(w.settable, set)
|
|
}
|
|
return i
|
|
}
|
|
|
|
// Returns an index in the string table for a string.
|
|
// uses a map to eliminate duplicates.
|
|
func (w *writer) stringCode(str []rune) int {
|
|
if w.counting {
|
|
return 0
|
|
}
|
|
|
|
hash := string(str)
|
|
i, ok := w.stringhash[hash]
|
|
if !ok {
|
|
i = len(w.stringhash)
|
|
w.stringhash[hash] = i
|
|
w.stringtable = append(w.stringtable, str)
|
|
}
|
|
|
|
return i
|
|
}
|
|
|
|
// When generating code on a regex that uses a sparse set
|
|
// of capture slots, we hash them to a dense set of indices
|
|
// for an array of capture slots. Instead of doing the hash
|
|
// at match time, it's done at compile time, here.
|
|
func (w *writer) mapCapnum(capnum int) int {
|
|
if capnum == -1 {
|
|
return -1
|
|
}
|
|
|
|
if w.caps != nil {
|
|
return w.caps[capnum]
|
|
}
|
|
|
|
return capnum
|
|
}
|
|
|
|
// Emits a zero-argument operation. Note that the emit
|
|
// functions all run in two modes: they can emit code, or
|
|
// they can just count the size of the code.
|
|
func (w *writer) emit(op InstOp) {
|
|
if w.counting {
|
|
w.count++
|
|
if opcodeBacktracks(op) {
|
|
w.trackcount++
|
|
}
|
|
return
|
|
}
|
|
w.emitted[w.curpos] = int(op)
|
|
w.curpos++
|
|
}
|
|
|
|
// Emits a one-argument operation.
|
|
func (w *writer) emit1(op InstOp, opd1 int) {
|
|
if w.counting {
|
|
w.count += 2
|
|
if opcodeBacktracks(op) {
|
|
w.trackcount++
|
|
}
|
|
return
|
|
}
|
|
w.emitted[w.curpos] = int(op)
|
|
w.curpos++
|
|
w.emitted[w.curpos] = opd1
|
|
w.curpos++
|
|
}
|
|
|
|
// Emits a two-argument operation.
|
|
func (w *writer) emit2(op InstOp, opd1, opd2 int) {
|
|
if w.counting {
|
|
w.count += 3
|
|
if opcodeBacktracks(op) {
|
|
w.trackcount++
|
|
}
|
|
return
|
|
}
|
|
w.emitted[w.curpos] = int(op)
|
|
w.curpos++
|
|
w.emitted[w.curpos] = opd1
|
|
w.curpos++
|
|
w.emitted[w.curpos] = opd2
|
|
w.curpos++
|
|
}
|