fastpastebin/vendor/github.com/dlclark/regexp2/syntax/writer.go
Stanislav N. aka pztrn 48d43ca097 Pagination, readable error messages to user, syntax highlighting started.
Pagination now works. Temporary hardcoded 10 pastes per page, will be put
in configuration later. Maybe.

From now user will receive readable error message if error occured.

Started to work on syntax highlighting, tried to make lexers detection
work but apparently to no avail.
2018-05-01 02:37:51 +05:00

501 lines
11 KiB
Go

package syntax
import (
"bytes"
"fmt"
"math"
"os"
)
func Write(tree *RegexTree) (*Code, error) {
w := writer{
intStack: make([]int, 0, 32),
emitted: make([]int, 2),
stringhash: make(map[string]int),
sethash: make(map[string]int),
}
code, err := w.codeFromTree(tree)
if tree.options&Debug > 0 && code != nil {
os.Stdout.WriteString(code.Dump())
os.Stdout.WriteString("\n")
}
return code, err
}
type writer struct {
emitted []int
intStack []int
curpos int
stringhash map[string]int
stringtable [][]rune
sethash map[string]int
settable []*CharSet
counting bool
count int
trackcount int
caps map[int]int
}
const (
beforeChild nodeType = 64
afterChild = 128
//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
MaxPrefixSize = 50
)
// The top level RegexCode generator. It does a depth-first walk
// through the tree and calls EmitFragment to emits code before
// and after each child of an interior node, and at each leaf.
//
// It runs two passes, first to count the size of the generated
// code, and second to generate the code.
//
// We should time it against the alternative, which is
// to just generate the code and grow the array as we go.
func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
var (
curNode *regexNode
curChild int
capsize int
)
// construct sparse capnum mapping if some numbers are unused
if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
capsize = tree.captop
w.caps = nil
} else {
capsize = len(tree.capnumlist)
w.caps = tree.caps
for i := 0; i < len(tree.capnumlist); i++ {
w.caps[tree.capnumlist[i]] = i
}
}
w.counting = true
for {
if !w.counting {
w.emitted = make([]int, w.count)
}
curNode = tree.root
curChild = 0
w.emit1(Lazybranch, 0)
for {
if len(curNode.children) == 0 {
w.emitFragment(curNode.t, curNode, 0)
} else if curChild < len(curNode.children) {
w.emitFragment(curNode.t|beforeChild, curNode, curChild)
curNode = curNode.children[curChild]
w.pushInt(curChild)
curChild = 0
continue
}
if w.emptyStack() {
break
}
curChild = w.popInt()
curNode = curNode.next
w.emitFragment(curNode.t|afterChild, curNode, curChild)
curChild++
}
w.patchJump(0, w.curPos())
w.emit(Stop)
if !w.counting {
break
}
w.counting = false
}
fcPrefix := getFirstCharsPrefix(tree)
prefix := getPrefix(tree)
rtl := (tree.options & RightToLeft) != 0
var bmPrefix *BmPrefix
//TODO: benchmark string prefixes
if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
if len(prefix.PrefixStr) > MaxPrefixSize {
// limit prefix changes to 10k
prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
}
bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
} else {
bmPrefix = nil
}
return &Code{
Codes: w.emitted,
Strings: w.stringtable,
Sets: w.settable,
TrackCount: w.trackcount,
Caps: w.caps,
Capsize: capsize,
FcPrefix: fcPrefix,
BmPrefix: bmPrefix,
Anchors: getAnchors(tree),
RightToLeft: rtl,
}, nil
}
// The main RegexCode generator. It does a depth-first walk
// through the tree and calls EmitFragment to emits code before
// and after each child of an interior node, and at each leaf.
func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
bits := InstOp(0)
if nodetype <= ntRef {
if (node.options & RightToLeft) != 0 {
bits |= Rtl
}
if (node.options & IgnoreCase) != 0 {
bits |= Ci
}
}
ntBits := nodeType(bits)
switch nodetype {
case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
break
case ntAlternate | beforeChild:
if curIndex < len(node.children)-1 {
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
}
case ntAlternate | afterChild:
if curIndex < len(node.children)-1 {
lbPos := w.popInt()
w.pushInt(w.curPos())
w.emit1(Goto, 0)
w.patchJump(lbPos, w.curPos())
} else {
for i := 0; i < curIndex; i++ {
w.patchJump(w.popInt(), w.curPos())
}
}
break
case ntTestref | beforeChild:
if curIndex == 0 {
w.emit(Setjump)
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
w.emit1(Testref, w.mapCapnum(node.m))
w.emit(Forejump)
}
case ntTestref | afterChild:
if curIndex == 0 {
branchpos := w.popInt()
w.pushInt(w.curPos())
w.emit1(Goto, 0)
w.patchJump(branchpos, w.curPos())
w.emit(Forejump)
if len(node.children) <= 1 {
w.patchJump(w.popInt(), w.curPos())
}
} else if curIndex == 1 {
w.patchJump(w.popInt(), w.curPos())
}
case ntTestgroup | beforeChild:
if curIndex == 0 {
w.emit(Setjump)
w.emit(Setmark)
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
}
case ntTestgroup | afterChild:
if curIndex == 0 {
w.emit(Getmark)
w.emit(Forejump)
} else if curIndex == 1 {
Branchpos := w.popInt()
w.pushInt(w.curPos())
w.emit1(Goto, 0)
w.patchJump(Branchpos, w.curPos())
w.emit(Getmark)
w.emit(Forejump)
if len(node.children) <= 2 {
w.patchJump(w.popInt(), w.curPos())
}
} else if curIndex == 2 {
w.patchJump(w.popInt(), w.curPos())
}
case ntLoop | beforeChild, ntLazyloop | beforeChild:
if node.n < math.MaxInt32 || node.m > 1 {
if node.m == 0 {
w.emit1(Nullcount, 0)
} else {
w.emit1(Setcount, 1-node.m)
}
} else if node.m == 0 {
w.emit(Nullmark)
} else {
w.emit(Setmark)
}
if node.m == 0 {
w.pushInt(w.curPos())
w.emit1(Goto, 0)
}
w.pushInt(w.curPos())
case ntLoop | afterChild, ntLazyloop | afterChild:
startJumpPos := w.curPos()
lazy := (nodetype - (ntLoop | afterChild))
if node.n < math.MaxInt32 || node.m > 1 {
if node.n == math.MaxInt32 {
w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
} else {
w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
}
} else {
w.emit1(InstOp(Branchmark+lazy), w.popInt())
}
if node.m == 0 {
w.patchJump(w.popInt(), startJumpPos)
}
case ntGroup | beforeChild, ntGroup | afterChild:
case ntCapture | beforeChild:
w.emit(Setmark)
case ntCapture | afterChild:
w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
case ntRequire | beforeChild:
// NOTE: the following line causes lookahead/lookbehind to be
// NON-BACKTRACKING. It can be commented out with (*)
w.emit(Setjump)
w.emit(Setmark)
case ntRequire | afterChild:
w.emit(Getmark)
// NOTE: the following line causes lookahead/lookbehind to be
// NON-BACKTRACKING. It can be commented out with (*)
w.emit(Forejump)
case ntPrevent | beforeChild:
w.emit(Setjump)
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
case ntPrevent | afterChild:
w.emit(Backjump)
w.patchJump(w.popInt(), w.curPos())
w.emit(Forejump)
case ntGreedy | beforeChild:
w.emit(Setjump)
case ntGreedy | afterChild:
w.emit(Forejump)
case ntOne, ntNotone:
w.emit1(InstOp(node.t|ntBits), int(node.ch))
case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
if node.m > 0 {
if node.t == ntOneloop || node.t == ntOnelazy {
w.emit2(Onerep|bits, int(node.ch), node.m)
} else {
w.emit2(Notonerep|bits, int(node.ch), node.m)
}
}
if node.n > node.m {
if node.n == math.MaxInt32 {
w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
} else {
w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
}
}
case ntSetloop, ntSetlazy:
if node.m > 0 {
w.emit2(Setrep|bits, w.setCode(node.set), node.m)
}
if node.n > node.m {
if node.n == math.MaxInt32 {
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
} else {
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
}
}
case ntMulti:
w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
case ntSet:
w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
case ntRef:
w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
w.emit(InstOp(node.t))
default:
return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
}
return nil
}
// To avoid recursion, we use a simple integer stack.
// This is the push.
func (w *writer) pushInt(i int) {
w.intStack = append(w.intStack, i)
}
// Returns true if the stack is empty.
func (w *writer) emptyStack() bool {
return len(w.intStack) == 0
}
// This is the pop.
func (w *writer) popInt() int {
//get our item
idx := len(w.intStack) - 1
i := w.intStack[idx]
//trim our slice
w.intStack = w.intStack[:idx]
return i
}
// Returns the current position in the emitted code.
func (w *writer) curPos() int {
return w.curpos
}
// Fixes up a jump instruction at the specified offset
// so that it jumps to the specified jumpDest.
func (w *writer) patchJump(offset, jumpDest int) {
w.emitted[offset+1] = jumpDest
}
// Returns an index in the set table for a charset
// uses a map to eliminate duplicates.
func (w *writer) setCode(set *CharSet) int {
if w.counting {
return 0
}
buf := &bytes.Buffer{}
set.mapHashFill(buf)
hash := buf.String()
i, ok := w.sethash[hash]
if !ok {
i = len(w.sethash)
w.sethash[hash] = i
w.settable = append(w.settable, set)
}
return i
}
// Returns an index in the string table for a string.
// uses a map to eliminate duplicates.
func (w *writer) stringCode(str []rune) int {
if w.counting {
return 0
}
hash := string(str)
i, ok := w.stringhash[hash]
if !ok {
i = len(w.stringhash)
w.stringhash[hash] = i
w.stringtable = append(w.stringtable, str)
}
return i
}
// When generating code on a regex that uses a sparse set
// of capture slots, we hash them to a dense set of indices
// for an array of capture slots. Instead of doing the hash
// at match time, it's done at compile time, here.
func (w *writer) mapCapnum(capnum int) int {
if capnum == -1 {
return -1
}
if w.caps != nil {
return w.caps[capnum]
}
return capnum
}
// Emits a zero-argument operation. Note that the emit
// functions all run in two modes: they can emit code, or
// they can just count the size of the code.
func (w *writer) emit(op InstOp) {
if w.counting {
w.count++
if opcodeBacktracks(op) {
w.trackcount++
}
return
}
w.emitted[w.curpos] = int(op)
w.curpos++
}
// Emits a one-argument operation.
func (w *writer) emit1(op InstOp, opd1 int) {
if w.counting {
w.count += 2
if opcodeBacktracks(op) {
w.trackcount++
}
return
}
w.emitted[w.curpos] = int(op)
w.curpos++
w.emitted[w.curpos] = opd1
w.curpos++
}
// Emits a two-argument operation.
func (w *writer) emit2(op InstOp, opd1, opd2 int) {
if w.counting {
w.count += 3
if opcodeBacktracks(op) {
w.trackcount++
}
return
}
w.emitted[w.curpos] = int(op)
w.curpos++
w.emitted[w.curpos] = opd1
w.curpos++
w.emitted[w.curpos] = opd2
w.curpos++
}