Stanislav N. aka pztrn
48d43ca097
Pagination now works. Temporary hardcoded 10 pastes per page, will be put in configuration later. Maybe. From now user will receive readable error message if error occured. Started to work on syntax highlighting, tried to make lexers detection work but apparently to no avail.
2126 lines
47 KiB
Go
2126 lines
47 KiB
Go
package syntax
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"sort"
|
|
"strconv"
|
|
"unicode"
|
|
)
|
|
|
|
type RegexOptions int32
|
|
|
|
const (
|
|
IgnoreCase RegexOptions = 0x0001 // "i"
|
|
Multiline = 0x0002 // "m"
|
|
ExplicitCapture = 0x0004 // "n"
|
|
Compiled = 0x0008 // "c"
|
|
Singleline = 0x0010 // "s"
|
|
IgnorePatternWhitespace = 0x0020 // "x"
|
|
RightToLeft = 0x0040 // "r"
|
|
Debug = 0x0080 // "d"
|
|
ECMAScript = 0x0100 // "e"
|
|
)
|
|
|
|
func optionFromCode(ch rune) RegexOptions {
|
|
// case-insensitive
|
|
switch ch {
|
|
case 'i', 'I':
|
|
return IgnoreCase
|
|
case 'r', 'R':
|
|
return RightToLeft
|
|
case 'm', 'M':
|
|
return Multiline
|
|
case 'n', 'N':
|
|
return ExplicitCapture
|
|
case 's', 'S':
|
|
return Singleline
|
|
case 'x', 'X':
|
|
return IgnorePatternWhitespace
|
|
case 'd', 'D':
|
|
return Debug
|
|
case 'e', 'E':
|
|
return ECMAScript
|
|
default:
|
|
return 0
|
|
}
|
|
}
|
|
|
|
// An Error describes a failure to parse a regular expression
|
|
// and gives the offending expression.
|
|
type Error struct {
|
|
Code ErrorCode
|
|
Expr string
|
|
Args []interface{}
|
|
}
|
|
|
|
func (e *Error) Error() string {
|
|
if len(e.Args) == 0 {
|
|
return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
|
|
}
|
|
return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
|
|
}
|
|
|
|
// An ErrorCode describes a failure to parse a regular expression.
|
|
type ErrorCode string
|
|
|
|
const (
|
|
// internal issue
|
|
ErrInternalError ErrorCode = "regexp/syntax: internal error"
|
|
// Parser errors
|
|
ErrUnterminatedComment = "unterminated comment"
|
|
ErrInvalidCharRange = "invalid character class range"
|
|
ErrInvalidRepeatSize = "invalid repeat count"
|
|
ErrInvalidUTF8 = "invalid UTF-8"
|
|
ErrCaptureGroupOutOfRange = "capture group number out of range"
|
|
ErrUnexpectedParen = "unexpected )"
|
|
ErrMissingParen = "missing closing )"
|
|
ErrMissingBrace = "missing closing }"
|
|
ErrInvalidRepeatOp = "invalid nested repetition operator"
|
|
ErrMissingRepeatArgument = "missing argument to repetition operator"
|
|
ErrConditionalExpression = "illegal conditional (?(...)) expression"
|
|
ErrTooManyAlternates = "too many | in (?()|)"
|
|
ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
|
|
ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
|
|
ErrCapNumNotZero = "capture number cannot be zero"
|
|
ErrUndefinedBackRef = "reference to undefined group number %v"
|
|
ErrUndefinedNameRef = "reference to undefined group name %v"
|
|
ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
|
|
ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
|
|
ErrMalformedReference = "(?(%v) ) malformed"
|
|
ErrUndefinedReference = "(?(%v) ) reference to undefined group"
|
|
ErrIllegalEndEscape = "illegal \\ at end of pattern"
|
|
ErrMalformedSlashP = "malformed \\p{X} character escape"
|
|
ErrIncompleteSlashP = "incomplete \\p{X} character escape"
|
|
ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
|
|
ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
|
|
ErrMissingControl = "missing control character"
|
|
ErrUnrecognizedControl = "unrecognized control character"
|
|
ErrTooFewHex = "insufficient hexadecimal digits"
|
|
ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
|
|
ErrMalformedNameRef = "malformed \\k<...> named back reference"
|
|
ErrBadClassInCharRange = "cannot include class \\%v in character range"
|
|
ErrUnterminatedBracket = "unterminated [] set"
|
|
ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
|
|
ErrReversedCharRange = "[x-y] range in reverse order"
|
|
)
|
|
|
|
func (e ErrorCode) String() string {
|
|
return string(e)
|
|
}
|
|
|
|
type parser struct {
|
|
stack *regexNode
|
|
group *regexNode
|
|
alternation *regexNode
|
|
concatenation *regexNode
|
|
unit *regexNode
|
|
|
|
patternRaw string
|
|
pattern []rune
|
|
|
|
currentPos int
|
|
specialCase *unicode.SpecialCase
|
|
|
|
autocap int
|
|
capcount int
|
|
captop int
|
|
capsize int
|
|
|
|
caps map[int]int
|
|
capnames map[string]int
|
|
|
|
capnumlist []int
|
|
capnamelist []string
|
|
|
|
options RegexOptions
|
|
optionsStack []RegexOptions
|
|
ignoreNextParen bool
|
|
}
|
|
|
|
const (
|
|
maxValueDiv10 int = math.MaxInt32 / 10
|
|
maxValueMod10 = math.MaxInt32 % 10
|
|
)
|
|
|
|
// Parse converts a regex string into a parse tree
|
|
func Parse(re string, op RegexOptions) (*RegexTree, error) {
|
|
p := parser{
|
|
options: op,
|
|
caps: make(map[int]int),
|
|
}
|
|
p.setPattern(re)
|
|
|
|
if err := p.countCaptures(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
p.reset(op)
|
|
root, err := p.scanRegex()
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
tree := &RegexTree{
|
|
root: root,
|
|
caps: p.caps,
|
|
capnumlist: p.capnumlist,
|
|
captop: p.captop,
|
|
Capnames: p.capnames,
|
|
Caplist: p.capnamelist,
|
|
options: op,
|
|
}
|
|
|
|
if tree.options&Debug > 0 {
|
|
os.Stdout.WriteString(tree.Dump())
|
|
}
|
|
|
|
return tree, nil
|
|
}
|
|
|
|
func (p *parser) setPattern(pattern string) {
|
|
p.patternRaw = pattern
|
|
p.pattern = make([]rune, 0, len(pattern))
|
|
|
|
//populate our rune array to handle utf8 encoding
|
|
for _, r := range pattern {
|
|
p.pattern = append(p.pattern, r)
|
|
}
|
|
}
|
|
func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
|
|
return &Error{Code: code, Expr: p.patternRaw, Args: args}
|
|
}
|
|
|
|
func (p *parser) noteCaptureSlot(i, pos int) {
|
|
if _, ok := p.caps[i]; !ok {
|
|
// the rhs of the hashtable isn't used in the parser
|
|
p.caps[i] = pos
|
|
p.capcount++
|
|
|
|
if p.captop <= i {
|
|
if i == math.MaxInt32 {
|
|
p.captop = i
|
|
} else {
|
|
p.captop = i + 1
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (p *parser) noteCaptureName(name string, pos int) {
|
|
if p.capnames == nil {
|
|
p.capnames = make(map[string]int)
|
|
}
|
|
|
|
if _, ok := p.capnames[name]; !ok {
|
|
p.capnames[name] = pos
|
|
p.capnamelist = append(p.capnamelist, name)
|
|
}
|
|
}
|
|
|
|
func (p *parser) assignNameSlots() {
|
|
if p.capnames != nil {
|
|
for _, name := range p.capnamelist {
|
|
for p.isCaptureSlot(p.autocap) {
|
|
p.autocap++
|
|
}
|
|
pos := p.capnames[name]
|
|
p.capnames[name] = p.autocap
|
|
p.noteCaptureSlot(p.autocap, pos)
|
|
|
|
p.autocap++
|
|
}
|
|
}
|
|
|
|
// if the caps array has at least one gap, construct the list of used slots
|
|
if p.capcount < p.captop {
|
|
p.capnumlist = make([]int, p.capcount)
|
|
i := 0
|
|
|
|
for k := range p.caps {
|
|
p.capnumlist[i] = k
|
|
i++
|
|
}
|
|
|
|
sort.Ints(p.capnumlist)
|
|
}
|
|
|
|
// merge capsnumlist into capnamelist
|
|
if p.capnames != nil || p.capnumlist != nil {
|
|
var oldcapnamelist []string
|
|
var next int
|
|
var k int
|
|
|
|
if p.capnames == nil {
|
|
oldcapnamelist = nil
|
|
p.capnames = make(map[string]int)
|
|
p.capnamelist = []string{}
|
|
next = -1
|
|
} else {
|
|
oldcapnamelist = p.capnamelist
|
|
p.capnamelist = []string{}
|
|
next = p.capnames[oldcapnamelist[0]]
|
|
}
|
|
|
|
for i := 0; i < p.capcount; i++ {
|
|
j := i
|
|
if p.capnumlist != nil {
|
|
j = p.capnumlist[i]
|
|
}
|
|
|
|
if next == j {
|
|
p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
|
|
k++
|
|
|
|
if k == len(oldcapnamelist) {
|
|
next = -1
|
|
} else {
|
|
next = p.capnames[oldcapnamelist[k]]
|
|
}
|
|
|
|
} else {
|
|
//feature: culture?
|
|
str := strconv.Itoa(j)
|
|
p.capnamelist = append(p.capnamelist, str)
|
|
p.capnames[str] = j
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (p *parser) consumeAutocap() int {
|
|
r := p.autocap
|
|
p.autocap++
|
|
return r
|
|
}
|
|
|
|
// CountCaptures is a prescanner for deducing the slots used for
|
|
// captures by doing a partial tokenization of the pattern.
|
|
func (p *parser) countCaptures() error {
|
|
var ch rune
|
|
|
|
p.noteCaptureSlot(0, 0)
|
|
|
|
p.autocap = 1
|
|
|
|
for p.charsRight() > 0 {
|
|
pos := p.textpos()
|
|
ch = p.moveRightGetChar()
|
|
switch ch {
|
|
case '\\':
|
|
if p.charsRight() > 0 {
|
|
p.moveRight(1)
|
|
}
|
|
|
|
case '#':
|
|
if p.useOptionX() {
|
|
p.moveLeft()
|
|
p.scanBlank()
|
|
}
|
|
|
|
case '[':
|
|
p.scanCharSet(false, true)
|
|
|
|
case ')':
|
|
if !p.emptyOptionsStack() {
|
|
p.popOptions()
|
|
}
|
|
|
|
case '(':
|
|
if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
|
|
p.moveLeft()
|
|
p.scanBlank()
|
|
} else {
|
|
p.pushOptions()
|
|
if p.charsRight() > 0 && p.rightChar(0) == '?' {
|
|
// we have (?...
|
|
p.moveRight(1)
|
|
|
|
if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
|
|
// named group: (?<... or (?'...
|
|
|
|
p.moveRight(1)
|
|
ch = p.rightChar(0)
|
|
|
|
if ch != '0' && IsWordChar(ch) {
|
|
if ch >= '1' && ch <= '9' {
|
|
dec, err := p.scanDecimal()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
p.noteCaptureSlot(dec, pos)
|
|
} else {
|
|
p.noteCaptureName(p.scanCapname(), pos)
|
|
}
|
|
}
|
|
} else {
|
|
// (?...
|
|
|
|
// get the options if it's an option construct (?cimsx-cimsx...)
|
|
p.scanOptions()
|
|
|
|
if p.charsRight() > 0 {
|
|
if p.rightChar(0) == ')' {
|
|
// (?cimsx-cimsx)
|
|
p.moveRight(1)
|
|
p.popKeepOptions()
|
|
} else if p.rightChar(0) == '(' {
|
|
// alternation construct: (?(foo)yes|no)
|
|
// ignore the next paren so we don't capture the condition
|
|
p.ignoreNextParen = true
|
|
|
|
// break from here so we don't reset ignoreNextParen
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
if !p.useOptionN() && !p.ignoreNextParen {
|
|
p.noteCaptureSlot(p.consumeAutocap(), pos)
|
|
}
|
|
}
|
|
}
|
|
|
|
p.ignoreNextParen = false
|
|
|
|
}
|
|
}
|
|
|
|
p.assignNameSlots()
|
|
return nil
|
|
}
|
|
|
|
func (p *parser) reset(topopts RegexOptions) {
|
|
p.currentPos = 0
|
|
p.autocap = 1
|
|
p.ignoreNextParen = false
|
|
|
|
if len(p.optionsStack) > 0 {
|
|
p.optionsStack = p.optionsStack[:0]
|
|
}
|
|
|
|
p.options = topopts
|
|
p.stack = nil
|
|
}
|
|
|
|
func (p *parser) scanRegex() (*regexNode, error) {
|
|
ch := '@' // nonspecial ch, means at beginning
|
|
isQuant := false
|
|
|
|
p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
|
|
|
|
for p.charsRight() > 0 {
|
|
wasPrevQuantifier := isQuant
|
|
isQuant = false
|
|
|
|
if err := p.scanBlank(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
startpos := p.textpos()
|
|
|
|
// move past all of the normal characters. We'll stop when we hit some kind of control character,
|
|
// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
|
|
if p.useOptionX() {
|
|
for p.charsRight() > 0 {
|
|
ch = p.rightChar(0)
|
|
//UGLY: clean up, this is ugly
|
|
if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
|
|
break
|
|
}
|
|
p.moveRight(1)
|
|
}
|
|
} else {
|
|
for p.charsRight() > 0 {
|
|
ch = p.rightChar(0)
|
|
if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
|
|
break
|
|
}
|
|
p.moveRight(1)
|
|
}
|
|
}
|
|
|
|
endpos := p.textpos()
|
|
|
|
p.scanBlank()
|
|
|
|
if p.charsRight() == 0 {
|
|
ch = '!' // nonspecial, means at end
|
|
} else if ch = p.rightChar(0); isSpecial(ch) {
|
|
isQuant = isQuantifier(ch)
|
|
p.moveRight(1)
|
|
} else {
|
|
ch = ' ' // nonspecial, means at ordinary char
|
|
}
|
|
|
|
if startpos < endpos {
|
|
cchUnquantified := endpos - startpos
|
|
if isQuant {
|
|
cchUnquantified--
|
|
}
|
|
wasPrevQuantifier = false
|
|
|
|
if cchUnquantified > 0 {
|
|
p.addToConcatenate(startpos, cchUnquantified, false)
|
|
}
|
|
|
|
if isQuant {
|
|
p.addUnitOne(p.charAt(endpos - 1))
|
|
}
|
|
}
|
|
|
|
switch ch {
|
|
case '!':
|
|
goto BreakOuterScan
|
|
|
|
case ' ':
|
|
goto ContinueOuterScan
|
|
|
|
case '[':
|
|
cc, err := p.scanCharSet(p.useOptionI(), false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
p.addUnitSet(cc)
|
|
|
|
case '(':
|
|
p.pushOptions()
|
|
|
|
if grouper, err := p.scanGroupOpen(); err != nil {
|
|
return nil, err
|
|
} else if grouper == nil {
|
|
p.popKeepOptions()
|
|
} else {
|
|
p.pushGroup()
|
|
p.startGroup(grouper)
|
|
}
|
|
|
|
continue
|
|
|
|
case '|':
|
|
p.addAlternate()
|
|
goto ContinueOuterScan
|
|
|
|
case ')':
|
|
if p.emptyStack() {
|
|
return nil, p.getErr(ErrUnexpectedParen)
|
|
}
|
|
|
|
if err := p.addGroup(); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := p.popGroup(); err != nil {
|
|
return nil, err
|
|
}
|
|
p.popOptions()
|
|
|
|
if p.unit == nil {
|
|
goto ContinueOuterScan
|
|
}
|
|
|
|
case '\\':
|
|
n, err := p.scanBackslash()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
p.addUnitNode(n)
|
|
|
|
case '^':
|
|
if p.useOptionM() {
|
|
p.addUnitType(ntBol)
|
|
} else {
|
|
p.addUnitType(ntBeginning)
|
|
}
|
|
|
|
case '$':
|
|
if p.useOptionM() {
|
|
p.addUnitType(ntEol)
|
|
} else {
|
|
p.addUnitType(ntEndZ)
|
|
}
|
|
|
|
case '.':
|
|
if p.useOptionE() {
|
|
p.addUnitSet(ECMAAnyClass())
|
|
} else if p.useOptionS() {
|
|
p.addUnitSet(AnyClass())
|
|
} else {
|
|
p.addUnitNotone('\n')
|
|
}
|
|
|
|
case '{', '*', '+', '?':
|
|
if p.unit == nil {
|
|
if wasPrevQuantifier {
|
|
return nil, p.getErr(ErrInvalidRepeatOp)
|
|
} else {
|
|
return nil, p.getErr(ErrMissingRepeatArgument)
|
|
}
|
|
}
|
|
p.moveLeft()
|
|
|
|
default:
|
|
return nil, p.getErr(ErrInternalError)
|
|
}
|
|
|
|
if err := p.scanBlank(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if p.charsRight() > 0 {
|
|
isQuant = p.isTrueQuantifier()
|
|
}
|
|
if p.charsRight() == 0 || !isQuant {
|
|
//maintain odd C# assignment order -- not sure if required, could clean up?
|
|
p.addConcatenate()
|
|
goto ContinueOuterScan
|
|
}
|
|
|
|
ch = p.moveRightGetChar()
|
|
|
|
// Handle quantifiers
|
|
for p.unit != nil {
|
|
var min, max int
|
|
var lazy bool
|
|
|
|
switch ch {
|
|
case '*':
|
|
min = 0
|
|
max = math.MaxInt32
|
|
|
|
case '?':
|
|
min = 0
|
|
max = 1
|
|
|
|
case '+':
|
|
min = 1
|
|
max = math.MaxInt32
|
|
|
|
case '{':
|
|
{
|
|
var err error
|
|
startpos = p.textpos()
|
|
if min, err = p.scanDecimal(); err != nil {
|
|
return nil, err
|
|
}
|
|
max = min
|
|
if startpos < p.textpos() {
|
|
if p.charsRight() > 0 && p.rightChar(0) == ',' {
|
|
p.moveRight(1)
|
|
if p.charsRight() == 0 || p.rightChar(0) == '}' {
|
|
max = math.MaxInt32
|
|
} else {
|
|
if max, err = p.scanDecimal(); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
|
|
p.addConcatenate()
|
|
p.textto(startpos - 1)
|
|
goto ContinueOuterScan
|
|
}
|
|
}
|
|
|
|
default:
|
|
return nil, p.getErr(ErrInternalError)
|
|
}
|
|
|
|
if err := p.scanBlank(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if p.charsRight() == 0 || p.rightChar(0) != '?' {
|
|
lazy = false
|
|
} else {
|
|
p.moveRight(1)
|
|
lazy = true
|
|
}
|
|
|
|
if min > max {
|
|
return nil, p.getErr(ErrInvalidRepeatSize)
|
|
}
|
|
|
|
p.addConcatenate3(lazy, min, max)
|
|
}
|
|
|
|
ContinueOuterScan:
|
|
}
|
|
|
|
BreakOuterScan:
|
|
;
|
|
|
|
if !p.emptyStack() {
|
|
return nil, p.getErr(ErrMissingParen)
|
|
}
|
|
|
|
if err := p.addGroup(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return p.unit, nil
|
|
|
|
}
|
|
|
|
/*
|
|
* Simple parsing for replacement patterns
|
|
*/
|
|
func (p *parser) scanReplacement() (*regexNode, error) {
|
|
var c, startpos int
|
|
|
|
p.concatenation = newRegexNode(ntConcatenate, p.options)
|
|
|
|
for {
|
|
c = p.charsRight()
|
|
if c == 0 {
|
|
break
|
|
}
|
|
|
|
startpos = p.textpos()
|
|
|
|
for c > 0 && p.rightChar(0) != '$' {
|
|
p.moveRight(1)
|
|
c--
|
|
}
|
|
|
|
p.addToConcatenate(startpos, p.textpos()-startpos, true)
|
|
|
|
if c > 0 {
|
|
if p.moveRightGetChar() == '$' {
|
|
n, err := p.scanDollar()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
p.addUnitNode(n)
|
|
}
|
|
p.addConcatenate()
|
|
}
|
|
}
|
|
|
|
return p.concatenation, nil
|
|
}
|
|
|
|
/*
|
|
* Scans $ patterns recognized within replacement patterns
|
|
*/
|
|
func (p *parser) scanDollar() (*regexNode, error) {
|
|
if p.charsRight() == 0 {
|
|
return newRegexNodeCh(ntOne, p.options, '$'), nil
|
|
}
|
|
|
|
ch := p.rightChar(0)
|
|
angled := false
|
|
backpos := p.textpos()
|
|
lastEndPos := backpos
|
|
|
|
// Note angle
|
|
|
|
if ch == '{' && p.charsRight() > 1 {
|
|
angled = true
|
|
p.moveRight(1)
|
|
ch = p.rightChar(0)
|
|
}
|
|
|
|
// Try to parse backreference: \1 or \{1} or \{cap}
|
|
|
|
if ch >= '0' && ch <= '9' {
|
|
if !angled && p.useOptionE() {
|
|
capnum := -1
|
|
newcapnum := int(ch - '0')
|
|
p.moveRight(1)
|
|
if p.isCaptureSlot(newcapnum) {
|
|
capnum = newcapnum
|
|
lastEndPos = p.textpos()
|
|
}
|
|
|
|
for p.charsRight() > 0 {
|
|
ch = p.rightChar(0)
|
|
if ch < '0' || ch > '9' {
|
|
break
|
|
}
|
|
digit := int(ch - '0')
|
|
if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
|
|
return nil, p.getErr(ErrCaptureGroupOutOfRange)
|
|
}
|
|
|
|
newcapnum = newcapnum*10 + digit
|
|
|
|
p.moveRight(1)
|
|
if p.isCaptureSlot(newcapnum) {
|
|
capnum = newcapnum
|
|
lastEndPos = p.textpos()
|
|
}
|
|
}
|
|
p.textto(lastEndPos)
|
|
if capnum >= 0 {
|
|
return newRegexNodeM(ntRef, p.options, capnum), nil
|
|
}
|
|
} else {
|
|
capnum, err := p.scanDecimal()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
|
|
if p.isCaptureSlot(capnum) {
|
|
return newRegexNodeM(ntRef, p.options, capnum), nil
|
|
}
|
|
}
|
|
}
|
|
} else if angled && IsWordChar(ch) {
|
|
capname := p.scanCapname()
|
|
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
|
|
if p.isCaptureName(capname) {
|
|
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
|
|
}
|
|
}
|
|
} else if !angled {
|
|
capnum := 1
|
|
|
|
switch ch {
|
|
case '$':
|
|
p.moveRight(1)
|
|
return newRegexNodeCh(ntOne, p.options, '$'), nil
|
|
case '&':
|
|
capnum = 0
|
|
case '`':
|
|
capnum = replaceLeftPortion
|
|
case '\'':
|
|
capnum = replaceRightPortion
|
|
case '+':
|
|
capnum = replaceLastGroup
|
|
case '_':
|
|
capnum = replaceWholeString
|
|
}
|
|
|
|
if capnum != 1 {
|
|
p.moveRight(1)
|
|
return newRegexNodeM(ntRef, p.options, capnum), nil
|
|
}
|
|
}
|
|
|
|
// unrecognized $: literalize
|
|
|
|
p.textto(backpos)
|
|
return newRegexNodeCh(ntOne, p.options, '$'), nil
|
|
}
|
|
|
|
// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
|
|
// a RegexNode for the type of group scanned, or nil if the group
|
|
// simply changed options (?cimsx-cimsx) or was a comment (#...).
|
|
func (p *parser) scanGroupOpen() (*regexNode, error) {
|
|
var ch rune
|
|
var nt nodeType
|
|
var err error
|
|
close := '>'
|
|
start := p.textpos()
|
|
|
|
// just return a RegexNode if we have:
|
|
// 1. "(" followed by nothing
|
|
// 2. "(x" where x != ?
|
|
// 3. "(?)"
|
|
if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
|
|
if p.useOptionN() || p.ignoreNextParen {
|
|
p.ignoreNextParen = false
|
|
return newRegexNode(ntGroup, p.options), nil
|
|
}
|
|
return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
|
|
}
|
|
|
|
p.moveRight(1)
|
|
|
|
for {
|
|
if p.charsRight() == 0 {
|
|
break
|
|
}
|
|
|
|
switch ch = p.moveRightGetChar(); ch {
|
|
case ':':
|
|
nt = ntGroup
|
|
|
|
case '=':
|
|
p.options &= ^RightToLeft
|
|
nt = ntRequire
|
|
|
|
case '!':
|
|
p.options &= ^RightToLeft
|
|
nt = ntPrevent
|
|
|
|
case '>':
|
|
nt = ntGreedy
|
|
|
|
case '\'':
|
|
close = '\''
|
|
fallthrough
|
|
|
|
case '<':
|
|
if p.charsRight() == 0 {
|
|
goto BreakRecognize
|
|
}
|
|
|
|
switch ch = p.moveRightGetChar(); ch {
|
|
case '=':
|
|
if close == '\'' {
|
|
goto BreakRecognize
|
|
}
|
|
|
|
p.options |= RightToLeft
|
|
nt = ntRequire
|
|
|
|
case '!':
|
|
if close == '\'' {
|
|
goto BreakRecognize
|
|
}
|
|
|
|
p.options |= RightToLeft
|
|
nt = ntPrevent
|
|
|
|
default:
|
|
p.moveLeft()
|
|
capnum := -1
|
|
uncapnum := -1
|
|
proceed := false
|
|
|
|
// grab part before -
|
|
|
|
if ch >= '0' && ch <= '9' {
|
|
if capnum, err = p.scanDecimal(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if !p.isCaptureSlot(capnum) {
|
|
capnum = -1
|
|
}
|
|
|
|
// check if we have bogus characters after the number
|
|
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
if capnum == 0 {
|
|
return nil, p.getErr(ErrCapNumNotZero)
|
|
}
|
|
} else if IsWordChar(ch) {
|
|
capname := p.scanCapname()
|
|
|
|
if p.isCaptureName(capname) {
|
|
capnum = p.captureSlotFromName(capname)
|
|
}
|
|
|
|
// check if we have bogus character after the name
|
|
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
} else if ch == '-' {
|
|
proceed = true
|
|
} else {
|
|
// bad group name - starts with something other than a word character and isn't a number
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
|
|
// grab part after - if any
|
|
|
|
if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
|
|
p.moveRight(1)
|
|
|
|
//no more chars left, no closing char, etc
|
|
if p.charsRight() == 0 {
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
|
|
ch = p.rightChar(0)
|
|
if ch >= '0' && ch <= '9' {
|
|
if uncapnum, err = p.scanDecimal(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if !p.isCaptureSlot(uncapnum) {
|
|
return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
|
|
}
|
|
|
|
// check if we have bogus characters after the number
|
|
if p.charsRight() > 0 && p.rightChar(0) != close {
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
} else if IsWordChar(ch) {
|
|
uncapname := p.scanCapname()
|
|
|
|
if !p.isCaptureName(uncapname) {
|
|
return nil, p.getErr(ErrUndefinedNameRef, uncapname)
|
|
}
|
|
uncapnum = p.captureSlotFromName(uncapname)
|
|
|
|
// check if we have bogus character after the name
|
|
if p.charsRight() > 0 && p.rightChar(0) != close {
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
} else {
|
|
// bad group name - starts with something other than a word character and isn't a number
|
|
return nil, p.getErr(ErrInvalidGroupName)
|
|
}
|
|
}
|
|
|
|
// actually make the node
|
|
|
|
if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
|
|
return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
|
|
}
|
|
goto BreakRecognize
|
|
}
|
|
|
|
case '(':
|
|
// alternation construct (?(...) | )
|
|
|
|
parenPos := p.textpos()
|
|
if p.charsRight() > 0 {
|
|
ch = p.rightChar(0)
|
|
|
|
// check if the alternation condition is a backref
|
|
if ch >= '0' && ch <= '9' {
|
|
var capnum int
|
|
if capnum, err = p.scanDecimal(); err != nil {
|
|
return nil, err
|
|
}
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
|
|
if p.isCaptureSlot(capnum) {
|
|
return newRegexNodeM(ntTestref, p.options, capnum), nil
|
|
}
|
|
return nil, p.getErr(ErrUndefinedReference, capnum)
|
|
}
|
|
|
|
return nil, p.getErr(ErrMalformedReference, capnum)
|
|
|
|
} else if IsWordChar(ch) {
|
|
capname := p.scanCapname()
|
|
|
|
if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
|
|
return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
|
|
}
|
|
}
|
|
}
|
|
// not a backref
|
|
nt = ntTestgroup
|
|
p.textto(parenPos - 1) // jump to the start of the parentheses
|
|
p.ignoreNextParen = true // but make sure we don't try to capture the insides
|
|
|
|
charsRight := p.charsRight()
|
|
if charsRight >= 3 && p.rightChar(1) == '?' {
|
|
rightchar2 := p.rightChar(2)
|
|
// disallow comments in the condition
|
|
if rightchar2 == '#' {
|
|
return nil, p.getErr(ErrAlternationCantHaveComment)
|
|
}
|
|
|
|
// disallow named capture group (?<..>..) in the condition
|
|
if rightchar2 == '\'' {
|
|
return nil, p.getErr(ErrAlternationCantCapture)
|
|
}
|
|
|
|
if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
|
|
return nil, p.getErr(ErrAlternationCantCapture)
|
|
}
|
|
}
|
|
|
|
default:
|
|
p.moveLeft()
|
|
|
|
nt = ntGroup
|
|
// disallow options in the children of a testgroup node
|
|
if p.group.t != ntTestgroup {
|
|
p.scanOptions()
|
|
}
|
|
if p.charsRight() == 0 {
|
|
goto BreakRecognize
|
|
}
|
|
|
|
if ch = p.moveRightGetChar(); ch == ')' {
|
|
return nil, nil
|
|
}
|
|
|
|
if ch != ':' {
|
|
goto BreakRecognize
|
|
}
|
|
|
|
}
|
|
|
|
return newRegexNode(nt, p.options), nil
|
|
}
|
|
|
|
BreakRecognize:
|
|
|
|
// break Recognize comes here
|
|
|
|
return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
|
|
}
|
|
|
|
// scans backslash specials and basics
|
|
func (p *parser) scanBackslash() (*regexNode, error) {
|
|
|
|
if p.charsRight() == 0 {
|
|
return nil, p.getErr(ErrIllegalEndEscape)
|
|
}
|
|
|
|
switch ch := p.rightChar(0); ch {
|
|
case 'b', 'B', 'A', 'G', 'Z', 'z':
|
|
p.moveRight(1)
|
|
return newRegexNode(p.typeFromCode(ch), p.options), nil
|
|
|
|
case 'w':
|
|
p.moveRight(1)
|
|
if p.useOptionE() {
|
|
return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
|
|
}
|
|
return newRegexNodeSet(ntSet, p.options, WordClass()), nil
|
|
|
|
case 'W':
|
|
p.moveRight(1)
|
|
if p.useOptionE() {
|
|
return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
|
|
}
|
|
return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
|
|
|
|
case 's':
|
|
p.moveRight(1)
|
|
if p.useOptionE() {
|
|
return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
|
|
}
|
|
return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
|
|
|
|
case 'S':
|
|
p.moveRight(1)
|
|
if p.useOptionE() {
|
|
return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
|
|
}
|
|
return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
|
|
|
|
case 'd':
|
|
p.moveRight(1)
|
|
if p.useOptionE() {
|
|
return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
|
|
}
|
|
return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
|
|
|
|
case 'D':
|
|
p.moveRight(1)
|
|
if p.useOptionE() {
|
|
return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
|
|
}
|
|
return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
|
|
|
|
case 'p', 'P':
|
|
p.moveRight(1)
|
|
prop, err := p.parseProperty()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cc := &CharSet{}
|
|
cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
|
|
if p.useOptionI() {
|
|
cc.addLowercase()
|
|
}
|
|
|
|
return newRegexNodeSet(ntSet, p.options, cc), nil
|
|
|
|
default:
|
|
return p.scanBasicBackslash()
|
|
}
|
|
}
|
|
|
|
// Scans \-style backreferences and character escapes
|
|
func (p *parser) scanBasicBackslash() (*regexNode, error) {
|
|
if p.charsRight() == 0 {
|
|
return nil, p.getErr(ErrIllegalEndEscape)
|
|
}
|
|
angled := false
|
|
close := '\x00'
|
|
|
|
backpos := p.textpos()
|
|
ch := p.rightChar(0)
|
|
|
|
// allow \k<foo> instead of \<foo>, which is now deprecated
|
|
|
|
if ch == 'k' {
|
|
if p.charsRight() >= 2 {
|
|
p.moveRight(1)
|
|
ch = p.moveRightGetChar()
|
|
|
|
if ch == '<' || ch == '\'' {
|
|
angled = true
|
|
if ch == '\'' {
|
|
close = '\''
|
|
} else {
|
|
close = '>'
|
|
}
|
|
}
|
|
}
|
|
|
|
if !angled || p.charsRight() <= 0 {
|
|
return nil, p.getErr(ErrMalformedNameRef)
|
|
}
|
|
|
|
ch = p.rightChar(0)
|
|
|
|
} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
|
|
angled = true
|
|
if ch == '\'' {
|
|
close = '\''
|
|
} else {
|
|
close = '>'
|
|
}
|
|
|
|
p.moveRight(1)
|
|
ch = p.rightChar(0)
|
|
}
|
|
|
|
// Try to parse backreference: \<1> or \<cap>
|
|
|
|
if angled && ch >= '0' && ch <= '9' {
|
|
capnum, err := p.scanDecimal()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == close {
|
|
if p.isCaptureSlot(capnum) {
|
|
return newRegexNodeM(ntRef, p.options, capnum), nil
|
|
} else {
|
|
return nil, p.getErr(ErrUndefinedBackRef, capnum)
|
|
}
|
|
}
|
|
} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
|
|
capnum, err := p.scanDecimal()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if p.useOptionE() || p.isCaptureSlot(capnum) {
|
|
return newRegexNodeM(ntRef, p.options, capnum), nil
|
|
}
|
|
if capnum <= 9 {
|
|
return nil, p.getErr(ErrUndefinedBackRef, capnum)
|
|
}
|
|
|
|
} else if angled && IsWordChar(ch) {
|
|
capname := p.scanCapname()
|
|
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == close {
|
|
if p.isCaptureName(capname) {
|
|
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
|
|
}
|
|
return nil, p.getErr(ErrUndefinedNameRef, capname)
|
|
}
|
|
}
|
|
|
|
// Not backreference: must be char code
|
|
|
|
p.textto(backpos)
|
|
ch, err := p.scanCharEscape()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if p.useOptionI() {
|
|
ch = unicode.ToLower(ch)
|
|
}
|
|
|
|
return newRegexNodeCh(ntOne, p.options, ch), nil
|
|
}
|
|
|
|
// Scans X for \p{X} or \P{X}
|
|
func (p *parser) parseProperty() (string, error) {
|
|
if p.charsRight() < 3 {
|
|
return "", p.getErr(ErrIncompleteSlashP)
|
|
}
|
|
ch := p.moveRightGetChar()
|
|
if ch != '{' {
|
|
return "", p.getErr(ErrMalformedSlashP)
|
|
}
|
|
|
|
startpos := p.textpos()
|
|
for p.charsRight() > 0 {
|
|
ch = p.moveRightGetChar()
|
|
if !(IsWordChar(ch) || ch == '-') {
|
|
p.moveLeft()
|
|
break
|
|
}
|
|
}
|
|
capname := string(p.pattern[startpos:p.textpos()])
|
|
|
|
if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
|
|
return "", p.getErr(ErrIncompleteSlashP)
|
|
}
|
|
|
|
if !isValidUnicodeCat(capname) {
|
|
return "", p.getErr(ErrUnknownSlashP, capname)
|
|
}
|
|
|
|
return capname, nil
|
|
}
|
|
|
|
// Returns ReNode type for zero-length assertions with a \ code.
|
|
func (p *parser) typeFromCode(ch rune) nodeType {
|
|
switch ch {
|
|
case 'b':
|
|
if p.useOptionE() {
|
|
return ntECMABoundary
|
|
}
|
|
return ntBoundary
|
|
case 'B':
|
|
if p.useOptionE() {
|
|
return ntNonECMABoundary
|
|
}
|
|
return ntNonboundary
|
|
case 'A':
|
|
return ntBeginning
|
|
case 'G':
|
|
return ntStart
|
|
case 'Z':
|
|
return ntEndZ
|
|
case 'z':
|
|
return ntEnd
|
|
default:
|
|
return ntNothing
|
|
}
|
|
}
|
|
|
|
// Scans whitespace or x-mode comments.
|
|
func (p *parser) scanBlank() error {
|
|
if p.useOptionX() {
|
|
for {
|
|
for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
|
|
p.moveRight(1)
|
|
}
|
|
|
|
if p.charsRight() == 0 {
|
|
break
|
|
}
|
|
|
|
if p.rightChar(0) == '#' {
|
|
for p.charsRight() > 0 && p.rightChar(0) != '\n' {
|
|
p.moveRight(1)
|
|
}
|
|
} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
|
|
p.rightChar(1) == '?' && p.rightChar(0) == '(' {
|
|
for p.charsRight() > 0 && p.rightChar(0) != ')' {
|
|
p.moveRight(1)
|
|
}
|
|
if p.charsRight() == 0 {
|
|
return p.getErr(ErrUnterminatedComment)
|
|
}
|
|
p.moveRight(1)
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
} else {
|
|
for {
|
|
if p.charsRight() < 3 || p.rightChar(2) != '#' ||
|
|
p.rightChar(1) != '?' || p.rightChar(0) != '(' {
|
|
return nil
|
|
}
|
|
|
|
for p.charsRight() > 0 && p.rightChar(0) != ')' {
|
|
p.moveRight(1)
|
|
}
|
|
if p.charsRight() == 0 {
|
|
return p.getErr(ErrUnterminatedComment)
|
|
}
|
|
p.moveRight(1)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (p *parser) scanCapname() string {
|
|
startpos := p.textpos()
|
|
|
|
for p.charsRight() > 0 {
|
|
if !IsWordChar(p.moveRightGetChar()) {
|
|
p.moveLeft()
|
|
break
|
|
}
|
|
}
|
|
|
|
return string(p.pattern[startpos:p.textpos()])
|
|
}
|
|
|
|
//Scans contents of [] (not including []'s), and converts to a set.
|
|
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
|
|
ch := '\x00'
|
|
chPrev := '\x00'
|
|
inRange := false
|
|
firstChar := true
|
|
closed := false
|
|
|
|
var cc *CharSet
|
|
if !scanOnly {
|
|
cc = &CharSet{}
|
|
}
|
|
|
|
if p.charsRight() > 0 && p.rightChar(0) == '^' {
|
|
p.moveRight(1)
|
|
if !scanOnly {
|
|
cc.negate = true
|
|
}
|
|
}
|
|
|
|
for ; p.charsRight() > 0; firstChar = false {
|
|
fTranslatedChar := false
|
|
ch = p.moveRightGetChar()
|
|
if ch == ']' {
|
|
if !firstChar {
|
|
closed = true
|
|
break
|
|
} else if p.useOptionE() {
|
|
if !scanOnly {
|
|
cc.addRanges(NoneClass().ranges)
|
|
}
|
|
closed = true
|
|
break
|
|
}
|
|
|
|
} else if ch == '\\' && p.charsRight() > 0 {
|
|
switch ch = p.moveRightGetChar(); ch {
|
|
case 'D', 'd':
|
|
if !scanOnly {
|
|
if inRange {
|
|
return nil, p.getErr(ErrBadClassInCharRange, ch)
|
|
}
|
|
cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
|
|
}
|
|
continue
|
|
|
|
case 'S', 's':
|
|
if !scanOnly {
|
|
if inRange {
|
|
return nil, p.getErr(ErrBadClassInCharRange, ch)
|
|
}
|
|
cc.addSpace(p.useOptionE(), ch == 'S')
|
|
}
|
|
continue
|
|
|
|
case 'W', 'w':
|
|
if !scanOnly {
|
|
if inRange {
|
|
return nil, p.getErr(ErrBadClassInCharRange, ch)
|
|
}
|
|
|
|
cc.addWord(p.useOptionE(), ch == 'W')
|
|
}
|
|
continue
|
|
|
|
case 'p', 'P':
|
|
if !scanOnly {
|
|
if inRange {
|
|
return nil, p.getErr(ErrBadClassInCharRange, ch)
|
|
}
|
|
prop, err := p.parseProperty()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
|
|
} else {
|
|
p.parseProperty()
|
|
}
|
|
|
|
continue
|
|
|
|
case '-':
|
|
if !scanOnly {
|
|
cc.addRange(ch, ch)
|
|
}
|
|
continue
|
|
|
|
default:
|
|
p.moveLeft()
|
|
var err error
|
|
ch, err = p.scanCharEscape() // non-literal character
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
fTranslatedChar = true
|
|
break // this break will only break out of the switch
|
|
}
|
|
} else if ch == '[' {
|
|
// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
|
|
// It currently doesn't do anything other than skip the whole thing!
|
|
if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
|
|
savePos := p.textpos()
|
|
|
|
p.moveRight(1)
|
|
p.scanCapname() // throwaway the name
|
|
if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
|
|
p.textto(savePos)
|
|
}
|
|
// else lookup name (nyi)
|
|
}
|
|
}
|
|
|
|
if inRange {
|
|
inRange = false
|
|
if !scanOnly {
|
|
if ch == '[' && !fTranslatedChar && !firstChar {
|
|
// We thought we were in a range, but we're actually starting a subtraction.
|
|
// In that case, we'll add chPrev to our char class, skip the opening [, and
|
|
// scan the new character class recursively.
|
|
cc.addChar(chPrev)
|
|
sub, err := p.scanCharSet(caseInsensitive, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cc.addSubtraction(sub)
|
|
|
|
if p.charsRight() > 0 && p.rightChar(0) != ']' {
|
|
return nil, p.getErr(ErrSubtractionMustBeLast)
|
|
}
|
|
} else {
|
|
// a regular range, like a-z
|
|
if chPrev > ch {
|
|
return nil, p.getErr(ErrReversedCharRange)
|
|
}
|
|
cc.addRange(chPrev, ch)
|
|
}
|
|
}
|
|
} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
|
|
// this could be the start of a range
|
|
chPrev = ch
|
|
inRange = true
|
|
p.moveRight(1)
|
|
} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
|
|
// we aren't in a range, and now there is a subtraction. Usually this happens
|
|
// only when a subtraction follows a range, like [a-z-[b]]
|
|
if !scanOnly {
|
|
p.moveRight(1)
|
|
sub, err := p.scanCharSet(caseInsensitive, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cc.addSubtraction(sub)
|
|
|
|
if p.charsRight() > 0 && p.rightChar(0) != ']' {
|
|
return nil, p.getErr(ErrSubtractionMustBeLast)
|
|
}
|
|
} else {
|
|
p.moveRight(1)
|
|
p.scanCharSet(caseInsensitive, true)
|
|
}
|
|
} else {
|
|
if !scanOnly {
|
|
cc.addRange(ch, ch)
|
|
}
|
|
}
|
|
}
|
|
|
|
if !closed {
|
|
return nil, p.getErr(ErrUnterminatedBracket)
|
|
}
|
|
|
|
if !scanOnly && caseInsensitive {
|
|
cc.addLowercase()
|
|
}
|
|
|
|
return cc, nil
|
|
}
|
|
|
|
// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
|
|
func (p *parser) scanDecimal() (int, error) {
|
|
i := 0
|
|
var d int
|
|
|
|
for p.charsRight() > 0 {
|
|
d = int(p.rightChar(0) - '0')
|
|
if d < 0 || d > 9 {
|
|
break
|
|
}
|
|
p.moveRight(1)
|
|
|
|
if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
|
|
return 0, p.getErr(ErrCaptureGroupOutOfRange)
|
|
}
|
|
|
|
i *= 10
|
|
i += d
|
|
}
|
|
|
|
return int(i), nil
|
|
}
|
|
|
|
// Returns true for options allowed only at the top level
|
|
func isOnlyTopOption(option RegexOptions) bool {
|
|
return option == RightToLeft || option == ECMAScript
|
|
}
|
|
|
|
// Scans cimsx-cimsx option string, stops at the first unrecognized char.
|
|
func (p *parser) scanOptions() {
|
|
|
|
for off := false; p.charsRight() > 0; p.moveRight(1) {
|
|
ch := p.rightChar(0)
|
|
|
|
if ch == '-' {
|
|
off = true
|
|
} else if ch == '+' {
|
|
off = false
|
|
} else {
|
|
option := optionFromCode(ch)
|
|
if option == 0 || isOnlyTopOption(option) {
|
|
return
|
|
}
|
|
|
|
if off {
|
|
p.options &= ^option
|
|
} else {
|
|
p.options |= option
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Scans \ code for escape codes that map to single unicode chars.
|
|
func (p *parser) scanCharEscape() (rune, error) {
|
|
|
|
ch := p.moveRightGetChar()
|
|
|
|
if ch >= '0' && ch <= '7' {
|
|
p.moveLeft()
|
|
return p.scanOctal(), nil
|
|
}
|
|
|
|
switch ch {
|
|
case 'x':
|
|
// support for \x{HEX} syntax from Perl and PCRE
|
|
if p.charsRight() > 0 && p.rightChar(0) == '{' {
|
|
p.moveRight(1)
|
|
return p.scanHexUntilBrace()
|
|
}
|
|
return p.scanHex(2)
|
|
case 'u':
|
|
return p.scanHex(4)
|
|
case 'a':
|
|
return '\u0007', nil
|
|
case 'b':
|
|
return '\b', nil
|
|
case 'e':
|
|
return '\u001B', nil
|
|
case 'f':
|
|
return '\f', nil
|
|
case 'n':
|
|
return '\n', nil
|
|
case 'r':
|
|
return '\r', nil
|
|
case 't':
|
|
return '\t', nil
|
|
case 'v':
|
|
return '\u000B', nil
|
|
case 'c':
|
|
return p.scanControl()
|
|
default:
|
|
if !p.useOptionE() && IsWordChar(ch) {
|
|
return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
|
|
}
|
|
return ch, nil
|
|
}
|
|
}
|
|
|
|
// Grabs and converts an ascii control character
|
|
func (p *parser) scanControl() (rune, error) {
|
|
if p.charsRight() <= 0 {
|
|
return 0, p.getErr(ErrMissingControl)
|
|
}
|
|
|
|
ch := p.moveRightGetChar()
|
|
|
|
// \ca interpreted as \cA
|
|
|
|
if ch >= 'a' && ch <= 'z' {
|
|
ch = (ch - ('a' - 'A'))
|
|
}
|
|
ch = (ch - '@')
|
|
if ch >= 0 && ch < ' ' {
|
|
return ch, nil
|
|
}
|
|
|
|
return 0, p.getErr(ErrUnrecognizedControl)
|
|
|
|
}
|
|
|
|
// Scan hex digits until we hit a closing brace.
|
|
// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
|
|
func (p *parser) scanHexUntilBrace() (rune, error) {
|
|
// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
|
|
// so we can enforce that
|
|
i := 0
|
|
hasContent := false
|
|
|
|
for p.charsRight() > 0 {
|
|
ch := p.moveRightGetChar()
|
|
if ch == '}' {
|
|
// hit our close brace, we're done here
|
|
// prevent \x{}
|
|
if !hasContent {
|
|
return 0, p.getErr(ErrTooFewHex)
|
|
}
|
|
return rune(i), nil
|
|
}
|
|
hasContent = true
|
|
// no brace needs to be hex digit
|
|
d := hexDigit(ch)
|
|
if d < 0 {
|
|
return 0, p.getErr(ErrMissingBrace)
|
|
}
|
|
|
|
i *= 0x10
|
|
i += d
|
|
|
|
if i > unicode.MaxRune {
|
|
return 0, p.getErr(ErrInvalidHex)
|
|
}
|
|
}
|
|
|
|
// we only make it here if we run out of digits without finding the brace
|
|
return 0, p.getErr(ErrMissingBrace)
|
|
}
|
|
|
|
// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
|
|
func (p *parser) scanHex(c int) (rune, error) {
|
|
|
|
i := 0
|
|
|
|
if p.charsRight() >= c {
|
|
for c > 0 {
|
|
d := hexDigit(p.moveRightGetChar())
|
|
if d < 0 {
|
|
break
|
|
}
|
|
i *= 0x10
|
|
i += d
|
|
c--
|
|
}
|
|
}
|
|
|
|
if c > 0 {
|
|
return 0, p.getErr(ErrTooFewHex)
|
|
}
|
|
|
|
return rune(i), nil
|
|
}
|
|
|
|
// Returns n <= 0xF for a hex digit.
|
|
func hexDigit(ch rune) int {
|
|
|
|
if d := uint(ch - '0'); d <= 9 {
|
|
return int(d)
|
|
}
|
|
|
|
if d := uint(ch - 'a'); d <= 5 {
|
|
return int(d + 0xa)
|
|
}
|
|
|
|
if d := uint(ch - 'A'); d <= 5 {
|
|
return int(d + 0xa)
|
|
}
|
|
|
|
return -1
|
|
}
|
|
|
|
// Scans up to three octal digits (stops before exceeding 0377).
|
|
func (p *parser) scanOctal() rune {
|
|
// Consume octal chars only up to 3 digits and value 0377
|
|
|
|
c := 3
|
|
|
|
if c > p.charsRight() {
|
|
c = p.charsRight()
|
|
}
|
|
|
|
//we know the first char is good because the caller had to check
|
|
i := 0
|
|
d := int(p.rightChar(0) - '0')
|
|
for c > 0 && d <= 7 {
|
|
i *= 8
|
|
i += d
|
|
if p.useOptionE() && i >= 0x20 {
|
|
break
|
|
}
|
|
c--
|
|
|
|
p.moveRight(1)
|
|
if !p.rightMost() {
|
|
d = int(p.rightChar(0) - '0')
|
|
}
|
|
}
|
|
|
|
// Octal codes only go up to 255. Any larger and the behavior that Perl follows
|
|
// is simply to truncate the high bits.
|
|
i &= 0xFF
|
|
|
|
return rune(i)
|
|
}
|
|
|
|
// Returns the current parsing position.
|
|
func (p *parser) textpos() int {
|
|
return p.currentPos
|
|
}
|
|
|
|
// Zaps to a specific parsing position.
|
|
func (p *parser) textto(pos int) {
|
|
p.currentPos = pos
|
|
}
|
|
|
|
// Returns the char at the right of the current parsing position and advances to the right.
|
|
func (p *parser) moveRightGetChar() rune {
|
|
ch := p.pattern[p.currentPos]
|
|
p.currentPos++
|
|
return ch
|
|
}
|
|
|
|
// Moves the current position to the right.
|
|
func (p *parser) moveRight(i int) {
|
|
// default would be 1
|
|
p.currentPos += i
|
|
}
|
|
|
|
// Moves the current parsing position one to the left.
|
|
func (p *parser) moveLeft() {
|
|
p.currentPos--
|
|
}
|
|
|
|
// Returns the char left of the current parsing position.
|
|
func (p *parser) charAt(i int) rune {
|
|
return p.pattern[i]
|
|
}
|
|
|
|
// Returns the char i chars right of the current parsing position.
|
|
func (p *parser) rightChar(i int) rune {
|
|
// default would be 0
|
|
return p.pattern[p.currentPos+i]
|
|
}
|
|
|
|
// Number of characters to the right of the current parsing position.
|
|
func (p *parser) charsRight() int {
|
|
return len(p.pattern) - p.currentPos
|
|
}
|
|
|
|
func (p *parser) rightMost() bool {
|
|
return p.currentPos == len(p.pattern)
|
|
}
|
|
|
|
// Looks up the slot number for a given name
|
|
func (p *parser) captureSlotFromName(capname string) int {
|
|
return p.capnames[capname]
|
|
}
|
|
|
|
// True if the capture slot was noted
|
|
func (p *parser) isCaptureSlot(i int) bool {
|
|
if p.caps != nil {
|
|
_, ok := p.caps[i]
|
|
return ok
|
|
}
|
|
|
|
return (i >= 0 && i < p.capsize)
|
|
}
|
|
|
|
// Looks up the slot number for a given name
|
|
func (p *parser) isCaptureName(capname string) bool {
|
|
if p.capnames == nil {
|
|
return false
|
|
}
|
|
|
|
_, ok := p.capnames[capname]
|
|
return ok
|
|
}
|
|
|
|
// option shortcuts
|
|
|
|
// True if N option disabling '(' autocapture is on.
|
|
func (p *parser) useOptionN() bool {
|
|
return (p.options & ExplicitCapture) != 0
|
|
}
|
|
|
|
// True if I option enabling case-insensitivity is on.
|
|
func (p *parser) useOptionI() bool {
|
|
return (p.options & IgnoreCase) != 0
|
|
}
|
|
|
|
// True if M option altering meaning of $ and ^ is on.
|
|
func (p *parser) useOptionM() bool {
|
|
return (p.options & Multiline) != 0
|
|
}
|
|
|
|
// True if S option altering meaning of . is on.
|
|
func (p *parser) useOptionS() bool {
|
|
return (p.options & Singleline) != 0
|
|
}
|
|
|
|
// True if X option enabling whitespace/comment mode is on.
|
|
func (p *parser) useOptionX() bool {
|
|
return (p.options & IgnorePatternWhitespace) != 0
|
|
}
|
|
|
|
// True if E option enabling ECMAScript behavior on.
|
|
func (p *parser) useOptionE() bool {
|
|
return (p.options & ECMAScript) != 0
|
|
}
|
|
|
|
// True if options stack is empty.
|
|
func (p *parser) emptyOptionsStack() bool {
|
|
return len(p.optionsStack) == 0
|
|
}
|
|
|
|
// Finish the current quantifiable (when a quantifier is not found or is not possible)
|
|
func (p *parser) addConcatenate() {
|
|
// The first (| inside a Testgroup group goes directly to the group
|
|
p.concatenation.addChild(p.unit)
|
|
p.unit = nil
|
|
}
|
|
|
|
// Finish the current quantifiable (when a quantifier is found)
|
|
func (p *parser) addConcatenate3(lazy bool, min, max int) {
|
|
p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
|
|
p.unit = nil
|
|
}
|
|
|
|
// Sets the current unit to a single char node
|
|
func (p *parser) addUnitOne(ch rune) {
|
|
if p.useOptionI() {
|
|
ch = unicode.ToLower(ch)
|
|
}
|
|
|
|
p.unit = newRegexNodeCh(ntOne, p.options, ch)
|
|
}
|
|
|
|
// Sets the current unit to a single inverse-char node
|
|
func (p *parser) addUnitNotone(ch rune) {
|
|
if p.useOptionI() {
|
|
ch = unicode.ToLower(ch)
|
|
}
|
|
|
|
p.unit = newRegexNodeCh(ntNotone, p.options, ch)
|
|
}
|
|
|
|
// Sets the current unit to a single set node
|
|
func (p *parser) addUnitSet(set *CharSet) {
|
|
p.unit = newRegexNodeSet(ntSet, p.options, set)
|
|
}
|
|
|
|
// Sets the current unit to a subtree
|
|
func (p *parser) addUnitNode(node *regexNode) {
|
|
p.unit = node
|
|
}
|
|
|
|
// Sets the current unit to an assertion of the specified type
|
|
func (p *parser) addUnitType(t nodeType) {
|
|
p.unit = newRegexNode(t, p.options)
|
|
}
|
|
|
|
// Finish the current group (in response to a ')' or end)
|
|
func (p *parser) addGroup() error {
|
|
if p.group.t == ntTestgroup || p.group.t == ntTestref {
|
|
p.group.addChild(p.concatenation.reverseLeft())
|
|
if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
|
|
return p.getErr(ErrTooManyAlternates)
|
|
}
|
|
} else {
|
|
p.alternation.addChild(p.concatenation.reverseLeft())
|
|
p.group.addChild(p.alternation)
|
|
}
|
|
|
|
p.unit = p.group
|
|
return nil
|
|
}
|
|
|
|
// Pops the option stack, but keeps the current options unchanged.
|
|
func (p *parser) popKeepOptions() {
|
|
lastIdx := len(p.optionsStack) - 1
|
|
p.optionsStack = p.optionsStack[:lastIdx]
|
|
}
|
|
|
|
// Recalls options from the stack.
|
|
func (p *parser) popOptions() {
|
|
lastIdx := len(p.optionsStack) - 1
|
|
// get the last item on the stack and then remove it by reslicing
|
|
p.options = p.optionsStack[lastIdx]
|
|
p.optionsStack = p.optionsStack[:lastIdx]
|
|
}
|
|
|
|
// Saves options on a stack.
|
|
func (p *parser) pushOptions() {
|
|
p.optionsStack = append(p.optionsStack, p.options)
|
|
}
|
|
|
|
// Add a string to the last concatenate.
|
|
func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
|
|
var node *regexNode
|
|
|
|
if cch == 0 {
|
|
return
|
|
}
|
|
|
|
if cch > 1 {
|
|
str := p.pattern[pos : pos+cch]
|
|
|
|
if p.useOptionI() && !isReplacement {
|
|
// We do the ToLower character by character for consistency. With surrogate chars, doing
|
|
// a ToLower on the entire string could actually change the surrogate pair. This is more correct
|
|
// linguistically, but since Regex doesn't support surrogates, it's more important to be
|
|
// consistent.
|
|
for i := 0; i < len(str); i++ {
|
|
str[i] = unicode.ToLower(str[i])
|
|
}
|
|
}
|
|
|
|
node = newRegexNodeStr(ntMulti, p.options, str)
|
|
} else {
|
|
ch := p.charAt(pos)
|
|
|
|
if p.useOptionI() && !isReplacement {
|
|
ch = unicode.ToLower(ch)
|
|
}
|
|
|
|
node = newRegexNodeCh(ntOne, p.options, ch)
|
|
}
|
|
|
|
p.concatenation.addChild(node)
|
|
}
|
|
|
|
// Push the parser state (in response to an open paren)
|
|
func (p *parser) pushGroup() {
|
|
p.group.next = p.stack
|
|
p.alternation.next = p.group
|
|
p.concatenation.next = p.alternation
|
|
p.stack = p.concatenation
|
|
}
|
|
|
|
// Remember the pushed state (in response to a ')')
|
|
func (p *parser) popGroup() error {
|
|
p.concatenation = p.stack
|
|
p.alternation = p.concatenation.next
|
|
p.group = p.alternation.next
|
|
p.stack = p.group.next
|
|
|
|
// The first () inside a Testgroup group goes directly to the group
|
|
if p.group.t == ntTestgroup && len(p.group.children) == 0 {
|
|
if p.unit == nil {
|
|
return p.getErr(ErrConditionalExpression)
|
|
}
|
|
|
|
p.group.addChild(p.unit)
|
|
p.unit = nil
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// True if the group stack is empty.
|
|
func (p *parser) emptyStack() bool {
|
|
return p.stack == nil
|
|
}
|
|
|
|
// Start a new round for the parser state (in response to an open paren or string start)
|
|
func (p *parser) startGroup(openGroup *regexNode) {
|
|
p.group = openGroup
|
|
p.alternation = newRegexNode(ntAlternate, p.options)
|
|
p.concatenation = newRegexNode(ntConcatenate, p.options)
|
|
}
|
|
|
|
// Finish the current concatenation (in response to a |)
|
|
func (p *parser) addAlternate() {
|
|
// The | parts inside a Testgroup group go directly to the group
|
|
|
|
if p.group.t == ntTestgroup || p.group.t == ntTestref {
|
|
p.group.addChild(p.concatenation.reverseLeft())
|
|
} else {
|
|
p.alternation.addChild(p.concatenation.reverseLeft())
|
|
}
|
|
|
|
p.concatenation = newRegexNode(ntConcatenate, p.options)
|
|
}
|
|
|
|
// For categorizing ascii characters.
|
|
|
|
const (
|
|
Q byte = 5 // quantifier
|
|
S = 4 // ordinary stopper
|
|
Z = 3 // ScanBlank stopper
|
|
X = 2 // whitespace
|
|
E = 1 // should be escaped
|
|
)
|
|
|
|
var _category = []byte{
|
|
//01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
// ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
|
X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
|
|
//@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
|
|
//'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
|
|
}
|
|
|
|
func isSpace(ch rune) bool {
|
|
return (ch <= ' ' && _category[ch] == X)
|
|
}
|
|
|
|
// Returns true for those characters that terminate a string of ordinary chars.
|
|
func isSpecial(ch rune) bool {
|
|
return (ch <= '|' && _category[ch] >= S)
|
|
}
|
|
|
|
// Returns true for those characters that terminate a string of ordinary chars.
|
|
func isStopperX(ch rune) bool {
|
|
return (ch <= '|' && _category[ch] >= X)
|
|
}
|
|
|
|
// Returns true for those characters that begin a quantifier.
|
|
func isQuantifier(ch rune) bool {
|
|
return (ch <= '{' && _category[ch] >= Q)
|
|
}
|
|
|
|
func (p *parser) isTrueQuantifier() bool {
|
|
nChars := p.charsRight()
|
|
if nChars == 0 {
|
|
return false
|
|
}
|
|
|
|
startpos := p.textpos()
|
|
ch := p.charAt(startpos)
|
|
if ch != '{' {
|
|
return ch <= '{' && _category[ch] >= Q
|
|
}
|
|
|
|
//UGLY: this is ugly -- the original code was ugly too
|
|
pos := startpos
|
|
for {
|
|
nChars--
|
|
if nChars <= 0 {
|
|
break
|
|
}
|
|
pos++
|
|
ch = p.charAt(pos)
|
|
if ch < '0' || ch > '9' {
|
|
break
|
|
}
|
|
}
|
|
|
|
if nChars == 0 || pos-startpos == 1 {
|
|
return false
|
|
}
|
|
if ch == '}' {
|
|
return true
|
|
}
|
|
if ch != ',' {
|
|
return false
|
|
}
|
|
for {
|
|
nChars--
|
|
if nChars <= 0 {
|
|
break
|
|
}
|
|
pos++
|
|
ch = p.charAt(pos)
|
|
if ch < '0' || ch > '9' {
|
|
break
|
|
}
|
|
}
|
|
|
|
return nChars > 0 && ch == '}'
|
|
}
|