fastpastebin/vendor/github.com/dlclark/regexp2/syntax/parser.go
Stanislav N. aka pztrn 48d43ca097 Pagination, readable error messages to user, syntax highlighting started.
Pagination now works. Temporary hardcoded 10 pastes per page, will be put
in configuration later. Maybe.

From now on the user will receive a readable error message if an error occurs.

Started to work on syntax highlighting, tried to make lexers detection
work but apparently to no avail.
2018-05-01 02:37:51 +05:00

2126 lines
47 KiB
Go

package syntax
import (
"fmt"
"math"
"os"
"sort"
"strconv"
"unicode"
)
// RegexOptions is a set of bit flags that control how a pattern is parsed.
type RegexOptions int32
// Option flags. The quoted letter after each flag is its single-character
// option code; optionFromCode maps the codes i, m, n, s, x, r, d and e (in
// either case) to these flags and has no case for "c".
// NOTE(review): only IgnoreCase is declared with the RegexOptions type; the
// remaining names are untyped integer constants.
const (
IgnoreCase RegexOptions = 0x0001 // "i"
Multiline = 0x0002 // "m"
ExplicitCapture = 0x0004 // "n"
Compiled = 0x0008 // "c"
Singleline = 0x0010 // "s"
IgnorePatternWhitespace = 0x0020 // "x"
RightToLeft = 0x0040 // "r"
Debug = 0x0080 // "d"
ECMAScript = 0x0100 // "e"
)
// optionFromCode translates a single option letter into its RegexOptions
// flag. ASCII upper- and lower-case letters are treated alike; any rune that
// has no associated flag yields 0.
func optionFromCode(ch rune) RegexOptions {
	// fold ASCII upper case onto lower case so each flag needs only one case
	if ch >= 'A' && ch <= 'Z' {
		ch += 'a' - 'A'
	}

	switch ch {
	case 'i':
		return IgnoreCase
	case 'r':
		return RightToLeft
	case 'm':
		return Multiline
	case 'n':
		return ExplicitCapture
	case 's':
		return Singleline
	case 'x':
		return IgnorePatternWhitespace
	case 'd':
		return Debug
	case 'e':
		return ECMAScript
	}
	return 0
}
// Error is the error value reported when a regular expression fails to
// parse. It records the failure code, the offending expression, and any
// arguments used to fill in the code's message.
type Error struct {
	Code ErrorCode
	Expr string
	Args []interface{}
}

// Error implements the error interface. When Args is non-empty the code's
// text is used as a fmt format string for those arguments; otherwise the
// code's text is used verbatim.
func (e *Error) Error() string {
	detail := e.Code.String()
	if len(e.Args) != 0 {
		detail = fmt.Sprintf(detail, e.Args...)
	}
	return "error parsing regexp: " + detail + " in `" + e.Expr + "`"
}
// An ErrorCode describes a failure to parse a regular expression.
// Its string value is the human-readable message; codes whose text contains
// a %v verb are formatted with Error.Args by (*Error).Error.
type ErrorCode string
// NOTE(review): only ErrInternalError is declared with the ErrorCode type;
// the remaining names are untyped string constants.
const (
// internal issue
ErrInternalError ErrorCode = "regexp/syntax: internal error"
// Parser errors
ErrUnterminatedComment = "unterminated comment"
ErrInvalidCharRange = "invalid character class range"
ErrInvalidRepeatSize = "invalid repeat count"
ErrInvalidUTF8 = "invalid UTF-8"
ErrCaptureGroupOutOfRange = "capture group number out of range"
ErrUnexpectedParen = "unexpected )"
ErrMissingParen = "missing closing )"
ErrMissingBrace = "missing closing }"
ErrInvalidRepeatOp = "invalid nested repetition operator"
ErrMissingRepeatArgument = "missing argument to repetition operator"
ErrConditionalExpression = "illegal conditional (?(...)) expression"
ErrTooManyAlternates = "too many | in (?()|)"
// takes one argument: the text of the unrecognized construct
ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
ErrCapNumNotZero = "capture number cannot be zero"
// take one argument: the referenced group number / name
ErrUndefinedBackRef = "reference to undefined group number %v"
ErrUndefinedNameRef = "reference to undefined group name %v"
ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
// take one argument: the group number used in the condition
ErrMalformedReference = "(?(%v) ) malformed"
ErrUndefinedReference = "(?(%v) ) reference to undefined group"
ErrIllegalEndEscape = "illegal \\ at end of pattern"
ErrMalformedSlashP = "malformed \\p{X} character escape"
ErrIncompleteSlashP = "incomplete \\p{X} character escape"
// takes one argument: the unknown property name
ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
// takes one argument: the character following the backslash
ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
ErrMissingControl = "missing control character"
ErrUnrecognizedControl = "unrecognized control character"
ErrTooFewHex = "insufficient hexadecimal digits"
ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
ErrMalformedNameRef = "malformed \\k<...> named back reference"
// takes one argument: the class letter that appeared inside a range
ErrBadClassInCharRange = "cannot include class \\%v in character range"
ErrUnterminatedBracket = "unterminated [] set"
ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
ErrReversedCharRange = "[x-y] range in reverse order"
)
// String returns the message text of the error code.
func (e ErrorCode) String() string {
return string(e)
}
// parser holds the state used while scanning a pattern: the pattern text and
// read position, the capture-group bookkeeping gathered by countCaptures, and
// the active options.
type parser struct {
// Node-building state. NOTE(review): presumably the saved-group stack and
// the group / alternation / concatenation / unit currently being built; the
// helpers that maintain these fields are outside this view — confirm there.
stack *regexNode
group *regexNode
alternation *regexNode
concatenation *regexNode
unit *regexNode
// patternRaw is the pattern as given; pattern is the same text as runes
// (both are set by setPattern).
patternRaw string
pattern []rune
// currentPos is set to 0 by reset. NOTE(review): presumably the read index
// into pattern used by the cursor helpers — confirm.
currentPos int
specialCase *unicode.SpecialCase
// autocap is the next automatically assigned capture number.
autocap int
// capcount is the number of distinct capture slots noted so far.
capcount int
// captop is one past the highest slot number noted (or math.MaxInt32 itself
// when that slot is used).
captop int
capsize int
// caps maps a capture slot number to the pattern position recorded for it.
caps map[int]int
// capnames maps a group name to a pattern position while prescanning, and
// to its slot number once assignNameSlots has run.
capnames map[string]int
// capnumlist is the sorted list of used slot numbers; it is built only when
// the slot numbering has gaps.
capnumlist []int
// capnamelist lists the group names; assignNameSlots rebuilds it to include
// decimal names for unnamed slots.
capnamelist []string
// options are the currently active options.
options RegexOptions
// optionsStack is emptied by reset. NOTE(review): presumably holds the
// options saved by pushOptions — confirm.
optionsStack []RegexOptions
// ignoreNextParen marks that the next '(' is an alternation condition and
// must not be given a capture slot.
ignoreNextParen bool
}
// Overflow guards for decimal scanning: before appending digit d to value i,
// callers check i against maxValueDiv10 and d against maxValueMod10 so the
// result cannot exceed math.MaxInt32.
const (
maxValueDiv10 int = math.MaxInt32 / 10
maxValueMod10 = math.MaxInt32 % 10
)
// Parse converts a regex string into a parse tree.
//
// It makes two passes over the pattern: countCaptures records the capture
// slots and names, then, after the parser is reset, scanRegex builds the node
// tree. The first error from either pass is returned with a nil tree. When
// the Debug option is set, the tree's dump is written to standard output.
func Parse(re string, op RegexOptions) (*RegexTree, error) {
	p := parser{
		caps:    make(map[int]int),
		options: op,
	}
	p.setPattern(re)

	// pass 1: discover capture slots and names
	err := p.countCaptures()
	if err != nil {
		return nil, err
	}

	// pass 2: rewind and build the tree
	p.reset(op)
	var root *regexNode
	if root, err = p.scanRegex(); err != nil {
		return nil, err
	}

	tree := new(RegexTree)
	tree.root = root
	tree.caps = p.caps
	tree.capnumlist = p.capnumlist
	tree.captop = p.captop
	tree.Capnames = p.capnames
	tree.Caplist = p.capnamelist
	tree.options = op

	if tree.options&Debug != 0 {
		os.Stdout.WriteString(tree.Dump())
	}
	return tree, nil
}
// setPattern stores the pattern both as the original string and as a slice
// of runes, so later scanning works per code point rather than per byte.
func (p *parser) setPattern(pattern string) {
	p.patternRaw = pattern
	// the conversion decodes UTF-8 exactly as ranging over the string would
	p.pattern = []rune(pattern)
}
// getErr builds an *Error for the given code and format arguments, attaching
// the raw pattern text as the offending expression.
func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
	parseErr := new(Error)
	parseErr.Code = code
	parseErr.Expr = p.patternRaw
	parseErr.Args = args
	return parseErr
}
// noteCaptureSlot records capture slot i with pattern position pos, unless
// the slot is already known. A new slot bumps capcount and, when needed,
// raises captop to one past i (or to i itself when i is math.MaxInt32, which
// cannot be incremented).
func (p *parser) noteCaptureSlot(i, pos int) {
	if _, seen := p.caps[i]; seen {
		return
	}

	p.caps[i] = pos
	p.capcount++

	if i < p.captop {
		return
	}
	if i != math.MaxInt32 {
		p.captop = i + 1
	} else {
		p.captop = i
	}
}
// noteCaptureName records a named group and the pattern position where it
// was seen. The name map is created on first use; a name that is already
// recorded is left untouched.
func (p *parser) noteCaptureName(name string, pos int) {
	if p.capnames == nil {
		p.capnames = map[string]int{}
	}
	if _, known := p.capnames[name]; known {
		return
	}
	p.capnames[name] = pos
	p.capnamelist = append(p.capnamelist, name)
}
// assignNameSlots finishes the capture bookkeeping after prescanning:
// it gives every named group a free slot number, builds capnumlist when the
// slot numbers have gaps, and rebuilds capnamelist/capnames so that every
// slot has a name (unnamed slots get their decimal number as the name).
func (p *parser) assignNameSlots() {
if p.capnames != nil {
for _, name := range p.capnamelist {
// skip slot numbers that are already taken
for p.isCaptureSlot(p.autocap) {
p.autocap++
}
// capnames held the pattern position so far; from here on it holds the slot
pos := p.capnames[name]
p.capnames[name] = p.autocap
p.noteCaptureSlot(p.autocap, pos)
p.autocap++
}
}
// if the caps array has at least one gap, construct the list of used slots
if p.capcount < p.captop {
p.capnumlist = make([]int, p.capcount)
i := 0
for k := range p.caps {
p.capnumlist[i] = k
i++
}
// map iteration order is random, so sort for a stable slot order
sort.Ints(p.capnumlist)
}
// merge capsnumlist into capnamelist
if p.capnames != nil || p.capnumlist != nil {
var oldcapnamelist []string
// next is the slot number of the next named group still to be placed
// (-1 when none remain); k indexes that name in oldcapnamelist
var next int
var k int
if p.capnames == nil {
oldcapnamelist = nil
p.capnames = make(map[string]int)
p.capnamelist = []string{}
next = -1
} else {
oldcapnamelist = p.capnamelist
p.capnamelist = []string{}
next = p.capnames[oldcapnamelist[0]]
}
for i := 0; i < p.capcount; i++ {
// j is the i-th used slot number (i itself when there are no gaps)
j := i
if p.capnumlist != nil {
j = p.capnumlist[i]
}
if next == j {
// this slot belongs to the next named group
p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
k++
if k == len(oldcapnamelist) {
next = -1
} else {
next = p.capnames[oldcapnamelist[k]]
}
} else {
// unnamed slot: use its decimal number as the name
//feature: culture?
str := strconv.Itoa(j)
p.capnamelist = append(p.capnamelist, str)
p.capnames[str] = j
}
}
}
}
// consumeAutocap hands out the current automatic capture number and advances
// the counter so the next call returns the following number.
func (p *parser) consumeAutocap() int {
	p.autocap++
	return p.autocap - 1
}
// countCaptures is a prescanner for deducing the slots used for
// captures by doing a partial tokenization of the pattern.
// Slot 0 is always noted, and automatic numbering starts at 1. When the scan
// is done, assignNameSlots gives named groups their slot numbers.
// The only error returned here comes from scanDecimal on an explicitly
// numbered group; errors from scanBlank, scanCharSet and the other helpers
// called during the prescan are not checked.
func (p *parser) countCaptures() error {
var ch rune
p.noteCaptureSlot(0, 0)
p.autocap = 1
for p.charsRight() > 0 {
pos := p.textpos()
ch = p.moveRightGetChar()
switch ch {
case '\\':
// skip the escaped character so it is not mistaken for syntax
if p.charsRight() > 0 {
p.moveRight(1)
}
case '#':
// with the x option, hand the '#' to scanBlank
if p.useOptionX() {
p.moveLeft()
p.scanBlank()
}
case '[':
// scan-only mode: skip over the character class without building a set
p.scanCharSet(false, true)
case ')':
if !p.emptyOptionsStack() {
p.popOptions()
}
case '(':
if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
// inline comment (?#...): let scanBlank skip it
p.moveLeft()
p.scanBlank()
} else {
p.pushOptions()
if p.charsRight() > 0 && p.rightChar(0) == '?' {
// we have (?...
p.moveRight(1)
if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
// named group: (?<... or (?'...
p.moveRight(1)
ch = p.rightChar(0)
if ch != '0' && IsWordChar(ch) {
if ch >= '1' && ch <= '9' {
// explicitly numbered group
dec, err := p.scanDecimal()
if err != nil {
return err
}
p.noteCaptureSlot(dec, pos)
} else {
p.noteCaptureName(p.scanCapname(), pos)
}
}
} else {
// (?...
// get the options if it's an option construct (?cimsx-cimsx...)
p.scanOptions()
if p.charsRight() > 0 {
if p.rightChar(0) == ')' {
// (?cimsx-cimsx)
p.moveRight(1)
p.popKeepOptions()
} else if p.rightChar(0) == '(' {
// alternation construct: (?(foo)yes|no)
// ignore the next paren so we don't capture the condition
p.ignoreNextParen = true
// break from here so we don't reset ignoreNextParen
continue
}
}
}
} else {
// plain '(': takes the next automatic slot unless explicit-capture
// mode is on or this paren is an alternation condition
if !p.useOptionN() && !p.ignoreNextParen {
p.noteCaptureSlot(p.consumeAutocap(), pos)
}
}
}
p.ignoreNextParen = false
}
}
p.assignNameSlots()
return nil
}
// reset rewinds the parser for a fresh scan: the position returns to the
// start, automatic capture numbering restarts at 1, the options stack is
// emptied (keeping its storage), the node stack is dropped, and topopts
// becomes the active options.
func (p *parser) reset(topopts RegexOptions) {
	p.stack = nil
	p.options = topopts
	if len(p.optionsStack) != 0 {
		p.optionsStack = p.optionsStack[:0]
	}
	p.ignoreNextParen = false
	p.autocap = 1
	p.currentPos = 0
}
// scanRegex is the main scanning loop: it walks the whole pattern and builds
// the node tree, returning the resulting unit.
//
// Each iteration skips blanks/comments, collects a run of ordinary characters
// into the current concatenation, then dispatches on the special character
// that ended the run (set, group open/close, alternation, escape, anchor,
// dot, quantifier). After a unit has been produced, a following quantifier is
// parsed and applied to it.
//
// Every scanBlank call has its error checked, so an unterminated (?#...)
// comment is reported as ErrUnterminatedComment wherever it appears.
func (p *parser) scanRegex() (*regexNode, error) {
	ch := '@' // nonspecial ch, means at beginning
	isQuant := false

	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))

	for p.charsRight() > 0 {
		wasPrevQuantifier := isQuant
		isQuant = false

		if err := p.scanBlank(); err != nil {
			return nil, err
		}

		startpos := p.textpos()

		// move past all of the normal characters. We'll stop when we hit some kind of control character,
		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
		if p.useOptionX() {
			for p.charsRight() > 0 {
				ch = p.rightChar(0)
				// stop at a stopper, except a '{' that does not start a real quantifier
				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
					break
				}
				p.moveRight(1)
			}
		} else {
			for p.charsRight() > 0 {
				ch = p.rightChar(0)
				// stop at a special char, except a '{' that does not start a real quantifier
				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
					break
				}
				p.moveRight(1)
			}
		}

		endpos := p.textpos()

		// An unterminated comment here leaves the position at the end of the
		// pattern, so a dropped error would make the pattern look complete.
		if err := p.scanBlank(); err != nil {
			return nil, err
		}

		if p.charsRight() == 0 {
			ch = '!' // nonspecial, means at end
		} else if ch = p.rightChar(0); isSpecial(ch) {
			isQuant = isQuantifier(ch)
			p.moveRight(1)
		} else {
			ch = ' ' // nonspecial, means at ordinary char
		}

		if startpos < endpos {
			// when a quantifier follows the run, the run's last character is
			// split off as its own unit so the quantifier applies only to it
			cchUnquantified := endpos - startpos
			if isQuant {
				cchUnquantified--
			}
			wasPrevQuantifier = false

			if cchUnquantified > 0 {
				p.addToConcatenate(startpos, cchUnquantified, false)
			}

			if isQuant {
				p.addUnitOne(p.charAt(endpos - 1))
			}
		}

		switch ch {
		case '!':
			goto BreakOuterScan

		case ' ':
			goto ContinueOuterScan

		case '[':
			cc, err := p.scanCharSet(p.useOptionI(), false)
			if err != nil {
				return nil, err
			}
			p.addUnitSet(cc)

		case '(':
			p.pushOptions()

			if grouper, err := p.scanGroupOpen(); err != nil {
				return nil, err
			} else if grouper == nil {
				// options-only construct: keep the changed options, no new group
				p.popKeepOptions()
			} else {
				p.pushGroup()
				p.startGroup(grouper)
			}

			continue

		case '|':
			p.addAlternate()
			goto ContinueOuterScan

		case ')':
			if p.emptyStack() {
				return nil, p.getErr(ErrUnexpectedParen)
			}

			if err := p.addGroup(); err != nil {
				return nil, err
			}
			if err := p.popGroup(); err != nil {
				return nil, err
			}
			p.popOptions()

			if p.unit == nil {
				goto ContinueOuterScan
			}

		case '\\':
			n, err := p.scanBackslash()
			if err != nil {
				return nil, err
			}
			p.addUnitNode(n)

		case '^':
			if p.useOptionM() {
				p.addUnitType(ntBol)
			} else {
				p.addUnitType(ntBeginning)
			}

		case '$':
			if p.useOptionM() {
				p.addUnitType(ntEol)
			} else {
				p.addUnitType(ntEndZ)
			}

		case '.':
			if p.useOptionE() {
				p.addUnitSet(ECMAAnyClass())
			} else if p.useOptionS() {
				p.addUnitSet(AnyClass())
			} else {
				p.addUnitNotone('\n')
			}

		case '{', '*', '+', '?':
			if p.unit == nil {
				if wasPrevQuantifier {
					return nil, p.getErr(ErrInvalidRepeatOp)
				}
				return nil, p.getErr(ErrMissingRepeatArgument)
			}
			// un-consume the quantifier so the quantifier handling below sees it
			p.moveLeft()

		default:
			return nil, p.getErr(ErrInternalError)
		}

		if err := p.scanBlank(); err != nil {
			return nil, err
		}

		if p.charsRight() > 0 {
			isQuant = p.isTrueQuantifier()
		}
		if p.charsRight() == 0 || !isQuant {
			//maintain odd C# assignment order -- not sure if required, could clean up?
			p.addConcatenate()
			goto ContinueOuterScan
		}

		ch = p.moveRightGetChar()

		// Handle quantifiers
		for p.unit != nil {
			var min, max int
			var lazy bool

			switch ch {
			case '*':
				min = 0
				max = math.MaxInt32

			case '?':
				min = 0
				max = 1

			case '+':
				min = 1
				max = math.MaxInt32

			case '{':
				{
					var err error
					startpos = p.textpos()
					if min, err = p.scanDecimal(); err != nil {
						return nil, err
					}
					max = min
					if startpos < p.textpos() {
						if p.charsRight() > 0 && p.rightChar(0) == ',' {
							p.moveRight(1)
							if p.charsRight() == 0 || p.rightChar(0) == '}' {
								// {n,} has no upper bound
								max = math.MaxInt32
							} else {
								if max, err = p.scanDecimal(); err != nil {
									return nil, err
								}
							}
						}
					}

					// not a well-formed {n}, {n,} or {n,m}: rewind to the '{'
					// and let the outer loop rescan from there
					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
						p.addConcatenate()
						p.textto(startpos - 1)
						goto ContinueOuterScan
					}
				}

			default:
				return nil, p.getErr(ErrInternalError)
			}

			if err := p.scanBlank(); err != nil {
				return nil, err
			}

			// a '?' directly after the quantifier makes it lazy
			if p.charsRight() == 0 || p.rightChar(0) != '?' {
				lazy = false
			} else {
				p.moveRight(1)
				lazy = true
			}

			if min > max {
				return nil, p.getErr(ErrInvalidRepeatSize)
			}

			p.addConcatenate3(lazy, min, max)
		}

	ContinueOuterScan:
	}

BreakOuterScan:
	;

	if !p.emptyStack() {
		return nil, p.getErr(ErrMissingParen)
	}

	if err := p.addGroup(); err != nil {
		return nil, err
	}

	return p.unit, nil
}
// scanReplacement does the simple parsing needed for replacement patterns.
// Runs of text up to the next '$' are added to a fresh concatenation node as
// literals; each '$' is handed to scanDollar and the resulting node is
// appended. The finished concatenation is returned.
func (p *parser) scanReplacement() (*regexNode, error) {
	p.concatenation = newRegexNode(ntConcatenate, p.options)

	for remaining := p.charsRight(); remaining != 0; remaining = p.charsRight() {
		literalStart := p.textpos()

		// advance over the literal run that precedes the next '$'
		for ; remaining > 0 && p.rightChar(0) != '$'; remaining-- {
			p.moveRight(1)
		}
		p.addToConcatenate(literalStart, p.textpos()-literalStart, true)

		if remaining <= 0 {
			continue
		}

		if p.moveRightGetChar() == '$' {
			node, err := p.scanDollar()
			if err != nil {
				return nil, err
			}
			p.addUnitNode(node)
		}
		p.addConcatenate()
	}

	return p.concatenation, nil
}
// scanDollar scans the $ patterns recognized within replacement patterns.
// It is called with the position just past a '$' and returns either a
// reference node for the substitution it recognizes or, when nothing is
// recognized, a literal '$' node with the position restored to just after
// the '$'.
func (p *parser) scanDollar() (*regexNode, error) {
// a '$' at the very end is a literal '$'
if p.charsRight() == 0 {
return newRegexNodeCh(ntOne, p.options, '$'), nil
}
ch := p.rightChar(0)
angled := false
backpos := p.textpos()
lastEndPos := backpos
// Note angle
// ("angled" here means the braced form: ${...})
if ch == '{' && p.charsRight() > 1 {
angled = true
p.moveRight(1)
ch = p.rightChar(0)
}
// Try to parse backreference: $1 or ${1} or ${cap}
if ch >= '0' && ch <= '9' {
if !angled && p.useOptionE() {
// ECMAScript, unbraced: read digits one at a time and remember the
// longest prefix that names an existing capture slot
capnum := -1
newcapnum := int(ch - '0')
p.moveRight(1)
if p.isCaptureSlot(newcapnum) {
capnum = newcapnum
lastEndPos = p.textpos()
}
for p.charsRight() > 0 {
ch = p.rightChar(0)
if ch < '0' || ch > '9' {
break
}
digit := int(ch - '0')
// refuse to grow the number past math.MaxInt32
if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
return nil, p.getErr(ErrCaptureGroupOutOfRange)
}
newcapnum = newcapnum*10 + digit
p.moveRight(1)
if p.isCaptureSlot(newcapnum) {
capnum = newcapnum
lastEndPos = p.textpos()
}
}
// back up to just after the longest valid group number
p.textto(lastEndPos)
if capnum >= 0 {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
} else {
capnum, err := p.scanDecimal()
if err != nil {
return nil, err
}
// the braced form additionally requires the closing '}'
if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
if p.isCaptureSlot(capnum) {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
}
}
} else if angled && IsWordChar(ch) {
// ${name}
capname := p.scanCapname()
if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
if p.isCaptureName(capname) {
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
}
}
} else if !angled {
// single-character substitutions; capnum stays 1 when ch is none of them
capnum := 1
switch ch {
case '$':
// "$$" is an escaped literal '$'
p.moveRight(1)
return newRegexNodeCh(ntOne, p.options, '$'), nil
case '&':
capnum = 0
case '`':
capnum = replaceLeftPortion
case '\'':
capnum = replaceRightPortion
case '+':
capnum = replaceLastGroup
case '_':
capnum = replaceWholeString
}
if capnum != 1 {
p.moveRight(1)
return newRegexNodeM(ntRef, p.options, capnum), nil
}
}
// unrecognized $: literalize
p.textto(backpos)
return newRegexNodeCh(ntOne, p.options, '$'), nil
}
// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
// a RegexNode for the type of group scanned, or nil if the group
// simply changed options (?cimsx-cimsx) or was a comment (#...).
// Anything that cannot be recognized is reported as ErrUnrecognizedGrouping
// with the text scanned so far.
func (p *parser) scanGroupOpen() (*regexNode, error) {
var ch rune
var nt nodeType
var err error
// close is the terminator expected for a group name: '>' for (?<name>...),
// switched to '\'' for (?'name'...)
close := '>'
start := p.textpos()
// just return a RegexNode if we have:
// 1. "(" followed by nothing
// 2. "(x" where x != ?
// 3. "(?)"
if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
// non-capturing when explicit-capture mode is on or this paren is an
// alternation condition; otherwise it takes the next automatic number
if p.useOptionN() || p.ignoreNextParen {
p.ignoreNextParen = false
return newRegexNode(ntGroup, p.options), nil
}
return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
}
// skip the '?'
p.moveRight(1)
// NOTE: this for executes at most once — every path through the switch
// returns or jumps to BreakRecognize; the loop form provides the break target.
for {
if p.charsRight() == 0 {
break
}
switch ch = p.moveRightGetChar(); ch {
case ':':
// (?:...)
nt = ntGroup
case '=':
// (?=...)
p.options &= ^RightToLeft
nt = ntRequire
case '!':
// (?!...)
p.options &= ^RightToLeft
nt = ntPrevent
case '>':
// (?>...)
nt = ntGreedy
case '\'':
close = '\''
fallthrough
case '<':
if p.charsRight() == 0 {
goto BreakRecognize
}
switch ch = p.moveRightGetChar(); ch {
case '=':
// (?<=...) — not valid with the quote form
if close == '\'' {
goto BreakRecognize
}
p.options |= RightToLeft
nt = ntRequire
case '!':
// (?<!...) — not valid with the quote form
if close == '\'' {
goto BreakRecognize
}
p.options |= RightToLeft
nt = ntPrevent
default:
// named or numbered group, optionally of the form name1-name2
p.moveLeft()
capnum := -1
uncapnum := -1
proceed := false
// grab part before -
if ch >= '0' && ch <= '9' {
if capnum, err = p.scanDecimal(); err != nil {
return nil, err
}
if !p.isCaptureSlot(capnum) {
capnum = -1
}
// check if we have bogus characters after the number
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
return nil, p.getErr(ErrInvalidGroupName)
}
if capnum == 0 {
return nil, p.getErr(ErrCapNumNotZero)
}
} else if IsWordChar(ch) {
capname := p.scanCapname()
if p.isCaptureName(capname) {
capnum = p.captureSlotFromName(capname)
}
// check if we have bogus character after the name
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
return nil, p.getErr(ErrInvalidGroupName)
}
} else if ch == '-' {
// nothing before the '-': only the second name is given
proceed = true
} else {
// bad group name - starts with something other than a word character and isn't a number
return nil, p.getErr(ErrInvalidGroupName)
}
// grab part after - if any
if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
p.moveRight(1)
//no more chars left, no closing char, etc
if p.charsRight() == 0 {
return nil, p.getErr(ErrInvalidGroupName)
}
ch = p.rightChar(0)
if ch >= '0' && ch <= '9' {
if uncapnum, err = p.scanDecimal(); err != nil {
return nil, err
}
// the second group must already exist
if !p.isCaptureSlot(uncapnum) {
return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
}
// check if we have bogus characters after the number
if p.charsRight() > 0 && p.rightChar(0) != close {
return nil, p.getErr(ErrInvalidGroupName)
}
} else if IsWordChar(ch) {
uncapname := p.scanCapname()
if !p.isCaptureName(uncapname) {
return nil, p.getErr(ErrUndefinedNameRef, uncapname)
}
uncapnum = p.captureSlotFromName(uncapname)
// check if we have bogus character after the name
if p.charsRight() > 0 && p.rightChar(0) != close {
return nil, p.getErr(ErrInvalidGroupName)
}
} else {
// bad group name - starts with something other than a word character and isn't a number
return nil, p.getErr(ErrInvalidGroupName)
}
}
// actually make the node
if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
}
goto BreakRecognize
}
case '(':
// alternation construct (?(...) | )
parenPos := p.textpos()
if p.charsRight() > 0 {
ch = p.rightChar(0)
// check if the alternation condition is a backref
if ch >= '0' && ch <= '9' {
var capnum int
if capnum, err = p.scanDecimal(); err != nil {
return nil, err
}
if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
if p.isCaptureSlot(capnum) {
return newRegexNodeM(ntTestref, p.options, capnum), nil
}
return nil, p.getErr(ErrUndefinedReference, capnum)
}
return nil, p.getErr(ErrMalformedReference, capnum)
} else if IsWordChar(ch) {
// a known group name followed by ')' is a named reference test;
// otherwise fall through and treat the condition as an expression
capname := p.scanCapname()
if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
}
}
}
// not a backref
nt = ntTestgroup
p.textto(parenPos - 1) // jump to the start of the parentheses
p.ignoreNextParen = true // but make sure we don't try to capture the insides
charsRight := p.charsRight()
if charsRight >= 3 && p.rightChar(1) == '?' {
rightchar2 := p.rightChar(2)
// disallow comments in the condition
if rightchar2 == '#' {
return nil, p.getErr(ErrAlternationCantHaveComment)
}
// disallow named capture group (?<..>..) in the condition
if rightchar2 == '\'' {
return nil, p.getErr(ErrAlternationCantCapture)
}
// (?<! and (?<= are still allowed in the condition
if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
return nil, p.getErr(ErrAlternationCantCapture)
}
}
default:
// option construct: (?cimsx-cimsx) or (?cimsx-cimsx:...)
p.moveLeft()
nt = ntGroup
// disallow options in the children of a testgroup node
if p.group.t != ntTestgroup {
p.scanOptions()
}
if p.charsRight() == 0 {
goto BreakRecognize
}
// options only: nil node tells the caller no group was opened
if ch = p.moveRightGetChar(); ch == ')' {
return nil, nil
}
if ch != ':' {
goto BreakRecognize
}
}
return newRegexNode(nt, p.options), nil
}
BreakRecognize:
// break Recognize comes here
return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
}
// scanBackslash scans the construct that follows a backslash: anchors and
// boundaries (\b \B \A \G \Z \z), the shorthand classes (\w \W \s \S \d \D,
// with ECMAScript variants when that option is on) and unicode properties
// (\p{..} \P{..}). Everything else is delegated to scanBasicBackslash.
// A backslash at the end of the pattern is an error.
func (p *parser) scanBackslash() (*regexNode, error) {
	if p.charsRight() == 0 {
		return nil, p.getErr(ErrIllegalEndEscape)
	}

	ch := p.rightChar(0)
	switch ch {
	case 'b', 'B', 'A', 'G', 'Z', 'z':
		p.moveRight(1)
		return newRegexNode(p.typeFromCode(ch), p.options), nil

	case 'w', 'W', 's', 'S', 'd', 'D':
		p.moveRight(1)
		if p.useOptionE() {
			switch ch {
			case 'w':
				return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
			case 'W':
				return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
			case 's':
				return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
			case 'S':
				return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
			case 'd':
				return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
			default: // 'D'
				return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
			}
		}
		switch ch {
		case 'w':
			return newRegexNodeSet(ntSet, p.options, WordClass()), nil
		case 'W':
			return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
		case 's':
			return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
		case 'S':
			return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
		case 'd':
			return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
		default: // 'D'
			return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
		}

	case 'p', 'P':
		p.moveRight(1)
		prop, err := p.parseProperty()
		if err != nil {
			return nil, err
		}
		// \P is the inverted form of \p
		invert := ch != 'p'
		cc := &CharSet{}
		cc.addCategory(prop, invert, p.useOptionI(), p.patternRaw)
		if p.useOptionI() {
			cc.addLowercase()
		}
		return newRegexNodeSet(ntSet, p.options, cc), nil
	}

	return p.scanBasicBackslash()
}
// scanBasicBackslash scans \-style backreferences and character escapes.
// It recognizes \k<name>, \k'name', the older \<name> / \'name' forms and
// numeric references; when none of those match, the position is restored and
// the text is scanned as a single-character escape instead.
func (p *parser) scanBasicBackslash() (*regexNode, error) {
if p.charsRight() == 0 {
return nil, p.getErr(ErrIllegalEndEscape)
}
// angled: the reference is delimited (by <...> or '...'); close is the
// delimiter that must end it
angled := false
close := '\x00'
// remember where we started so a failed reference can be rescanned as an escape
backpos := p.textpos()
ch := p.rightChar(0)
// allow \k<foo> instead of \<foo>, which is now deprecated
if ch == 'k' {
if p.charsRight() >= 2 {
p.moveRight(1)
ch = p.moveRightGetChar()
if ch == '<' || ch == '\'' {
angled = true
if ch == '\'' {
close = '\''
} else {
close = '>'
}
}
}
// \k must be followed by an opening delimiter and at least one more char
if !angled || p.charsRight() <= 0 {
return nil, p.getErr(ErrMalformedNameRef)
}
ch = p.rightChar(0)
} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
angled = true
if ch == '\'' {
close = '\''
} else {
close = '>'
}
p.moveRight(1)
ch = p.rightChar(0)
}
// Try to parse backreference: \<1> or \<cap>
if angled && ch >= '0' && ch <= '9' {
capnum, err := p.scanDecimal()
if err != nil {
return nil, err
}
if p.charsRight() > 0 && p.moveRightGetChar() == close {
if p.isCaptureSlot(capnum) {
return newRegexNodeM(ntRef, p.options, capnum), nil
} else {
return nil, p.getErr(ErrUndefinedBackRef, capnum)
}
}
} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
capnum, err := p.scanDecimal()
if err != nil {
return nil, err
}
// with the ECMAScript option the number is accepted without checking the slot
if p.useOptionE() || p.isCaptureSlot(capnum) {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
// a single digit that is not a group is an error; longer numbers fall
// through and are rescanned as a character escape
if capnum <= 9 {
return nil, p.getErr(ErrUndefinedBackRef, capnum)
}
} else if angled && IsWordChar(ch) {
capname := p.scanCapname()
if p.charsRight() > 0 && p.moveRightGetChar() == close {
if p.isCaptureName(capname) {
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
}
return nil, p.getErr(ErrUndefinedNameRef, capname)
}
}
// Not backreference: must be char code
p.textto(backpos)
ch, err := p.scanCharEscape()
if err != nil {
return nil, err
}
// with the ignore-case option the literal is stored lower-cased
if p.useOptionI() {
ch = unicode.ToLower(ch)
}
return newRegexNodeCh(ntOne, p.options, ch), nil
}
// parseProperty scans the X in \p{X} or \P{X}, starting just after the p/P.
// It requires at least three remaining characters, an opening '{', a name
// made of word characters and '-', and a closing '}'. The name must also be
// accepted by isValidUnicodeCat.
func (p *parser) parseProperty() (string, error) {
	if p.charsRight() < 3 {
		return "", p.getErr(ErrIncompleteSlashP)
	}
	if p.moveRightGetChar() != '{' {
		return "", p.getErr(ErrMalformedSlashP)
	}

	nameStart := p.textpos()
	for p.charsRight() > 0 {
		c := p.moveRightGetChar()
		if !IsWordChar(c) && c != '-' {
			// went one rune too far; put it back
			p.moveLeft()
			break
		}
	}
	name := string(p.pattern[nameStart:p.textpos()])

	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
		return "", p.getErr(ErrIncompleteSlashP)
	}
	if isValidUnicodeCat(name) {
		return name, nil
	}
	return "", p.getErr(ErrUnknownSlashP, name)
}
// typeFromCode maps the letter of a zero-length assertion escape (\b \B \A
// \G \Z \z) to its node type. \b and \B pick the ECMAScript boundary types
// when that option is on. Any other letter yields ntNothing.
func (p *parser) typeFromCode(ch rune) nodeType {
	switch ch {
	case 'A':
		return ntBeginning
	case 'G':
		return ntStart
	case 'Z':
		return ntEndZ
	case 'z':
		return ntEnd
	case 'b':
		if !p.useOptionE() {
			return ntBoundary
		}
		return ntECMABoundary
	case 'B':
		if !p.useOptionE() {
			return ntNonboundary
		}
		return ntNonECMABoundary
	}
	return ntNothing
}
// scanBlank skips text that carries no meaning for the pattern. Inline
// (?#...) comments are always skipped; with the x option, whitespace and
// '#' comments running to the end of the line are skipped too. It returns
// ErrUnterminatedComment when a (?# comment has no closing ')'.
func (p *parser) scanBlank() error {
	xMode := p.useOptionX()

	for {
		if xMode {
			for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
				p.moveRight(1)
			}
			if p.charsRight() == 0 {
				return nil
			}
			if p.rightChar(0) == '#' {
				// line comment: stop at (not past) the newline
				for p.charsRight() > 0 && p.rightChar(0) != '\n' {
					p.moveRight(1)
				}
				continue
			}
		}

		// anything other than the start of "(?#" ends the blank run
		atComment := p.charsRight() >= 3 && p.rightChar(2) == '#' &&
			p.rightChar(1) == '?' && p.rightChar(0) == '('
		if !atComment {
			return nil
		}

		for p.charsRight() > 0 && p.rightChar(0) != ')' {
			p.moveRight(1)
		}
		if p.charsRight() == 0 {
			return p.getErr(ErrUnterminatedComment)
		}
		// step over the closing ')'
		p.moveRight(1)
	}
}
// scanCapname consumes the run of word characters at the current position
// and returns it as a string (empty when the run has no characters).
func (p *parser) scanCapname() string {
	begin := p.textpos()
	for p.charsRight() > 0 {
		if IsWordChar(p.moveRightGetChar()) {
			continue
		}
		// consumed one non-word rune too many; step back onto it
		p.moveLeft()
		break
	}
	return string(p.pattern[begin:p.textpos()])
}
// scanCharSet scans the contents of [] (not including the opening '[', which
// the caller has already consumed) and converts them to a set.
//
// caseInsensitive is passed on to unicode category handling and, when true,
// makes the finished set also include lower-case equivalents. With scanOnly
// the text is only skipped: no set is built (the returned *CharSet is nil)
// and range/class validation is not performed.
//
// Errors: ErrUnterminatedBracket when no closing ']' is found,
// ErrBadClassInCharRange when a class escape (\d \s \w \p and their inverted
// forms) ends a range, ErrReversedCharRange for [z-a], and
// ErrSubtractionMustBeLast when text follows a -[...] subtraction. The class
// letter in ErrBadClassInCharRange is passed as a string so the message shows
// the letter rather than its numeric code point.
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
	ch := '\x00'
	chPrev := '\x00'
	inRange := false
	firstChar := true
	closed := false

	var cc *CharSet
	if !scanOnly {
		cc = &CharSet{}
	}

	// a leading '^' negates the set
	if p.charsRight() > 0 && p.rightChar(0) == '^' {
		p.moveRight(1)
		if !scanOnly {
			cc.negate = true
		}
	}

	for ; p.charsRight() > 0; firstChar = false {
		// fTranslatedChar is set when ch came from an escape sequence and must
		// therefore be treated as a literal
		fTranslatedChar := false
		ch = p.moveRightGetChar()
		if ch == ']' {
			if !firstChar {
				closed = true
				break
			} else if p.useOptionE() {
				// with the ECMAScript option "[]" is an empty set; otherwise a
				// leading ']' falls through and is treated as a literal
				if !scanOnly {
					cc.addRanges(NoneClass().ranges)
				}
				closed = true
				break
			}
		} else if ch == '\\' && p.charsRight() > 0 {
			switch ch = p.moveRightGetChar(); ch {
			case 'D', 'd':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
				}
				continue

			case 'S', 's':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					cc.addSpace(p.useOptionE(), ch == 'S')
				}
				continue

			case 'W', 'w':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					cc.addWord(p.useOptionE(), ch == 'W')
				}
				continue

			case 'p', 'P':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					prop, err := p.parseProperty()
					if err != nil {
						return nil, err
					}
					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
				} else {
					// scan-only: just advance past the property; errors are
					// left for the real scan to report
					p.parseProperty()
				}
				continue

			case '-':
				// escaped '-' is a literal dash
				if !scanOnly {
					cc.addRange(ch, ch)
				}
				continue

			default:
				p.moveLeft()
				var err error
				ch, err = p.scanCharEscape() // non-literal character
				if err != nil {
					return nil, err
				}
				fTranslatedChar = true
			}
		} else if ch == '[' {
			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
			// It currently doesn't do anything other than skip the whole thing!
			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
				savePos := p.textpos()

				p.moveRight(1)
				p.scanCapname() // throwaway the name
				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
					p.textto(savePos)
				}
				// else lookup name (nyi)
			}
		}

		if inRange {
			inRange = false
			if !scanOnly {
				if ch == '[' && !fTranslatedChar && !firstChar {
					// We thought we were in a range, but we're actually starting a subtraction.
					// In that case, we'll add chPrev to our char class, skip the opening [, and
					// scan the new character class recursively.
					cc.addChar(chPrev)
					sub, err := p.scanCharSet(caseInsensitive, false)
					if err != nil {
						return nil, err
					}
					cc.addSubtraction(sub)

					if p.charsRight() > 0 && p.rightChar(0) != ']' {
						return nil, p.getErr(ErrSubtractionMustBeLast)
					}
				} else {
					// a regular range, like a-z
					if chPrev > ch {
						return nil, p.getErr(ErrReversedCharRange)
					}
					cc.addRange(chPrev, ch)
				}
			}
		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
			// this could be the start of a range
			chPrev = ch
			inRange = true
			p.moveRight(1)
		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
			// we aren't in a range, and now there is a subtraction.  Usually this happens
			// only when a subtraction follows a range, like [a-z-[b]]
			if !scanOnly {
				p.moveRight(1)
				sub, err := p.scanCharSet(caseInsensitive, false)
				if err != nil {
					return nil, err
				}
				cc.addSubtraction(sub)

				if p.charsRight() > 0 && p.rightChar(0) != ']' {
					return nil, p.getErr(ErrSubtractionMustBeLast)
				}
			} else {
				p.moveRight(1)
				p.scanCharSet(caseInsensitive, true)
			}
		} else {
			// a single character
			if !scanOnly {
				cc.addRange(ch, ch)
			}
		}
	}

	if !closed {
		return nil, p.getErr(ErrUnterminatedBracket)
	}

	if !scanOnly && caseInsensitive {
		cc.addLowercase()
	}

	return cc, nil
}
// scanDecimal consumes the run of ASCII decimal digits at the current
// position and returns its value; with no digits present it returns 0
// without moving. If appending a digit would take the value past
// math.MaxInt32, scanning stops with ErrCaptureGroupOutOfRange.
func (p *parser) scanDecimal() (int, error) {
	value := 0
	for p.charsRight() > 0 {
		digit := int(p.rightChar(0) - '0')
		if digit < 0 || digit > 9 {
			break
		}
		p.moveRight(1)

		// overflow guard, checked before the digit is appended
		if value > maxValueDiv10 || (value == maxValueDiv10 && digit > maxValueMod10) {
			return 0, p.getErr(ErrCaptureGroupOutOfRange)
		}
		value = value*10 + digit
	}
	return value, nil
}
// isOnlyTopOption reports whether option is one of the flags that may only
// be set at the top level (RightToLeft or ECMAScript).
func isOnlyTopOption(option RegexOptions) bool {
	switch option {
	case RightToLeft, ECMAScript:
		return true
	}
	return false
}
// scanOptions scans a cimsx-cimsx option string and applies it to the active
// options. Letters after a '-' turn options off and letters after a '+' (or
// at the start) turn them on. Scanning stops, without consuming it, at the
// first character that is not an option letter or that names a top-level-only
// option.
func (p *parser) scanOptions() {
	disabling := false
	for ; p.charsRight() > 0; p.moveRight(1) {
		switch ch := p.rightChar(0); ch {
		case '-':
			disabling = true
		case '+':
			disabling = false
		default:
			flag := optionFromCode(ch)
			if flag == 0 || isOnlyTopOption(flag) {
				return
			}
			if disabling {
				p.options &^= flag
			} else {
				p.options |= flag
			}
		}
	}
}
// scanCharEscape scans the escape code after a backslash and returns the
// single character it stands for: octal digits, \xHH, \x{HEX}, \uHHHH, the
// control escape \cX, and the letter escapes a b e f n r t v. Any other
// character stands for itself, except that an unknown word-character escape
// is ErrUnrecognizedEscape unless the ECMAScript option is on.
func (p *parser) scanCharEscape() (rune, error) {
	ch := p.moveRightGetChar()

	// escapes that need further scanning
	switch {
	case ch >= '0' && ch <= '7':
		p.moveLeft()
		return p.scanOctal(), nil
	case ch == 'x':
		// support for \x{HEX} syntax from Perl and PCRE
		if p.charsRight() > 0 && p.rightChar(0) == '{' {
			p.moveRight(1)
			return p.scanHexUntilBrace()
		}
		return p.scanHex(2)
	case ch == 'u':
		return p.scanHex(4)
	case ch == 'c':
		return p.scanControl()
	}

	// fixed single-letter escapes
	switch ch {
	case 'a':
		return '\a', nil
	case 'b':
		return '\b', nil
	case 'e':
		return '\x1b', nil
	case 'f':
		return '\f', nil
	case 'n':
		return '\n', nil
	case 'r':
		return '\r', nil
	case 't':
		return '\t', nil
	case 'v':
		return '\v', nil
	}

	if !p.useOptionE() && IsWordChar(ch) {
		return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
	}
	return ch, nil
}
// scanControl reads the character after \c and converts it to the matching
// ASCII control character. Lowercase letters are treated as their uppercase
// form (\ca is \cA). It returns ErrMissingControl when the pattern ends right
// after \c, and ErrUnrecognizedControl when the character does not map into
// the range [0, ' ').
func (p *parser) scanControl() (rune, error) {
	if p.charsRight() <= 0 {
		return 0, p.getErr(ErrMissingControl)
	}

	ch := p.moveRightGetChar()
	if 'a' <= ch && ch <= 'z' {
		ch -= 'a' - 'A'
	}

	// Control characters are the letter's offset from '@'.
	if ctrl := ch - '@'; ctrl >= 0 && ctrl < ' ' {
		return ctrl, nil
	}
	return 0, p.getErr(ErrUnrecognizedControl)
}
// scanHexUntilBrace scans hex digits up to and including a closing '}' and
// returns the character they encode (the \x{HEX} form).
//
// Errors:
//   - ErrTooFewHex when the brace closes immediately (\x{}),
//   - ErrMissingBrace when a non-hex character appears or the pattern ends
//     before the closing brace,
//   - ErrInvalidHex when the value exceeds unicode.MaxRune.
func (p *parser) scanHexUntilBrace() (rune, error) {
	// PCRE allows any number of hex digits, but Unicode has an upper bound,
	// so the value is checked after every digit.
	value := 0
	sawDigit := false

	for p.charsRight() > 0 {
		ch := p.moveRightGetChar()
		if ch == '}' {
			if sawDigit {
				return rune(value), nil
			}
			// prevent \x{}
			return 0, p.getErr(ErrTooFewHex)
		}

		// Anything that is not the brace must be a hex digit.
		digit := hexDigit(ch)
		if digit < 0 {
			return 0, p.getErr(ErrMissingBrace)
		}
		sawDigit = true

		value = value*0x10 + digit
		if value > unicode.MaxRune {
			return 0, p.getErr(ErrInvalidHex)
		}
	}

	// Ran out of pattern without finding the closing brace.
	return 0, p.getErr(ErrMissingBrace)
}
// scanHex scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) and
// returns the character they encode. It returns ErrTooFewHex when fewer than
// c characters remain or when a non-hex character is met before c digits
// were read.
func (p *parser) scanHex(c int) (rune, error) {
	value := 0
	if p.charsRight() >= c {
		for ; c > 0; c-- {
			digit := hexDigit(p.moveRightGetChar())
			if digit < 0 {
				break
			}
			value = value*0x10 + digit
		}
	}

	// Any digits still owed means the escape was too short.
	if c > 0 {
		return 0, p.getErr(ErrTooFewHex)
	}
	return rune(value), nil
}
// hexDigit returns the numeric value (0-15) of the hex digit ch, accepting
// both upper- and lowercase letters, or -1 if ch is not a hex digit.
func hexDigit(ch rune) int {
	switch {
	case ch >= '0' && ch <= '9':
		return int(ch - '0')
	case ch >= 'a' && ch <= 'f':
		return int(ch-'a') + 0xa
	case ch >= 'A' && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
// scanOctal scans an octal escape of at most three digits starting at the
// current position and returns the character it encodes. The caller must have
// verified that the character at the current position is an octal digit.
//
// Scanning stops at the first character that is not an octal digit. In
// ECMAScript mode it also stops as soon as the accumulated value reaches
// 0x20; the digit that reached that value is consumed.
//
// Values above 0377 are truncated to their low 8 bits.
func (p *parser) scanOctal() rune {
	// Consume at most 3 digits, fewer if the pattern ends sooner.
	c := 3
	if c > p.charsRight() {
		c = p.charsRight()
	}

	// The first char is known to be good because the caller had to check.
	i := 0
	d := int(p.rightChar(0) - '0')

	// d must be range-checked on both sides: characters below '0' (space,
	// '-', '!', ...) produce a negative d and must terminate the escape.
	for c > 0 && d >= 0 && d <= 7 {
		i = i*8 + d
		c--
		p.moveRight(1)

		// Stop only after the digit has been consumed, so it is not parsed
		// again as a literal.
		if p.useOptionE() && i >= 0x20 {
			break
		}
		if !p.rightMost() {
			d = int(p.rightChar(0) - '0')
		}
	}

	// Octal codes only go up to 255. Any larger and the behavior that Perl
	// follows is simply to truncate the high bits.
	return rune(i & 0xFF)
}
// textpos reports the index into the pattern at which parsing currently
// stands.
func (p *parser) textpos() int {
	pos := p.currentPos
	return pos
}
// textto jumps the parser to the absolute pattern index pos. No bounds
// checking is performed.
func (p *parser) textto(pos int) {
	p.currentPos = pos
}
// moveRightGetChar returns the character at the current parsing position and
// then advances the position by one. It panics if the position is already at
// the end of the pattern, so callers must check charsRight first.
func (p *parser) moveRightGetChar() rune {
	pos := p.currentPos
	ch := p.pattern[pos]
	p.currentPos = pos + 1
	return ch
}
// moveRight advances the current parsing position by i characters (callers
// typically pass 1). No bounds checking is performed.
func (p *parser) moveRight(i int) {
	p.currentPos = p.currentPos + i
}
// moveLeft steps the current parsing position back by one character.
func (p *parser) moveLeft() {
	p.currentPos -= 1
}
// charAt returns the pattern character at absolute index i, independent of
// the current parsing position. It panics if i is out of range.
func (p *parser) charAt(i int) rune {
	ch := p.pattern[i]
	return ch
}
// rightChar returns the character i positions to the right of the current
// parsing position without moving it; rightChar(0) is the character about to
// be consumed. It panics if the resulting index is out of range.
func (p *parser) rightChar(i int) rune {
	idx := p.currentPos + i
	return p.pattern[idx]
}
// charsRight returns how many pattern characters remain at or after the
// current parsing position.
func (p *parser) charsRight() int {
	total := len(p.pattern)
	return total - p.currentPos
}
// rightMost reports whether the current parsing position is exactly at the
// end of the pattern.
func (p *parser) rightMost() bool {
	return len(p.pattern) == p.currentPos
}
// captureSlotFromName returns the slot number recorded in p.capnames for the
// capture group called capname. An unknown name yields the map's zero value,
// 0; use isCaptureName to distinguish that case.
func (p *parser) captureSlotFromName(capname string) int {
	slot := p.capnames[capname]
	return slot
}
// isCaptureSlot reports whether capture slot i was noted. When p.caps is set,
// membership in it decides; otherwise slot numbers in [0, p.capsize) are
// accepted.
func (p *parser) isCaptureSlot(i int) bool {
	if p.caps == nil {
		return i >= 0 && i < p.capsize
	}
	_, found := p.caps[i]
	return found
}
// isCaptureName reports whether capname is present in p.capnames. It returns
// false when p.capnames is nil, since a lookup in a nil map finds nothing.
func (p *parser) isCaptureName(capname string) bool {
	_, found := p.capnames[capname]
	return found
}
// option shortcuts
// useOptionN reports whether the ExplicitCapture (N) option, which disables
// '(' autocapture, is currently set.
func (p *parser) useOptionN() bool {
	return p.options&ExplicitCapture != 0
}
// useOptionI reports whether the IgnoreCase (I) option, which enables
// case-insensitive matching, is currently set.
func (p *parser) useOptionI() bool {
	return p.options&IgnoreCase != 0
}
// useOptionM reports whether the Multiline (M) option, which alters the
// meaning of $ and ^, is currently set.
func (p *parser) useOptionM() bool {
	return p.options&Multiline != 0
}
// useOptionS reports whether the Singleline (S) option, which alters the
// meaning of '.', is currently set.
func (p *parser) useOptionS() bool {
	return p.options&Singleline != 0
}
// useOptionX reports whether the IgnorePatternWhitespace (X) option, which
// enables whitespace/comment mode, is currently set.
func (p *parser) useOptionX() bool {
	return p.options&IgnorePatternWhitespace != 0
}
// useOptionE reports whether the ECMAScript (E) option, which enables
// ECMAScript-compatible behavior, is currently set.
func (p *parser) useOptionE() bool {
	return p.options&ECMAScript != 0
}
// emptyOptionsStack reports whether no options are currently saved on the
// options stack.
func (p *parser) emptyOptionsStack() bool {
	depth := len(p.optionsStack)
	return depth == 0
}
// addConcatenate finishes the current unit when no quantifier applies to it:
// the unit is appended unchanged to the current concatenation and then
// cleared.
func (p *parser) addConcatenate() {
	unit := p.unit
	p.concatenation.addChild(unit)
	p.unit = nil
}
// addConcatenate3 finishes the current unit when a quantifier was found: the
// unit is wrapped via makeQuantifier(lazy, min, max), the result is appended
// to the current concatenation, and the unit is cleared.
func (p *parser) addConcatenate3(lazy bool, min, max int) {
	quantified := p.unit.makeQuantifier(lazy, min, max)
	p.concatenation.addChild(quantified)
	p.unit = nil
}
// addUnitOne makes the current unit an ntOne node for ch. When the IgnoreCase
// option is on, ch is lowercased first.
func (p *parser) addUnitOne(ch rune) {
	c := ch
	if p.useOptionI() {
		c = unicode.ToLower(c)
	}
	p.unit = newRegexNodeCh(ntOne, p.options, c)
}
// addUnitNotone makes the current unit an ntNotone (inverse single-char) node
// for ch. When the IgnoreCase option is on, ch is lowercased first.
func (p *parser) addUnitNotone(ch rune) {
	c := ch
	if p.useOptionI() {
		c = unicode.ToLower(c)
	}
	p.unit = newRegexNodeCh(ntNotone, p.options, c)
}
// addUnitSet makes the current unit an ntSet node for the given character
// set, created with the parser's current options.
func (p *parser) addUnitSet(set *CharSet) {
	node := newRegexNodeSet(ntSet, p.options, set)
	p.unit = node
}
// addUnitNode makes an already-built subtree the current unit.
func (p *parser) addUnitNode(node *regexNode) {
	p.unit = node
}
// addUnitType makes the current unit a new node of type t (used for
// assertions), created with the parser's current options.
func (p *parser) addUnitType(t nodeType) {
	node := newRegexNode(t, p.options)
	p.unit = node
}
// addGroup finishes the current group (in response to a ')' or the end of the
// pattern) and makes it the current unit.
//
// For ntTestgroup/ntTestref groups the pending concatenation is added
// directly to the group; ErrTooManyAlternates is returned if that leaves an
// ntTestref with more than 2 children or any such group with more than 3.
// For every other group the concatenation goes into the alternation, which in
// turn is added to the group.
func (p *parser) addGroup() error {
	switch p.group.t {
	case ntTestgroup, ntTestref:
		p.group.addChild(p.concatenation.reverseLeft())

		n := len(p.group.children)
		if n > 3 || (p.group.t == ntTestref && n > 2) {
			return p.getErr(ErrTooManyAlternates)
		}
	default:
		p.alternation.addChild(p.concatenation.reverseLeft())
		p.group.addChild(p.alternation)
	}

	p.unit = p.group
	return nil
}
// popKeepOptions discards the top entry of the options stack while leaving
// the current options untouched. It panics if the stack is empty.
func (p *parser) popKeepOptions() {
	p.optionsStack = p.optionsStack[:len(p.optionsStack)-1]
}
// popOptions restores the current options from the top of the options stack
// and removes that entry. It panics if the stack is empty.
func (p *parser) popOptions() {
	top := len(p.optionsStack) - 1
	// Read the top entry and drop it by reslicing in one step.
	p.options, p.optionsStack = p.optionsStack[top], p.optionsStack[:top]
}
// pushOptions saves the current options on top of the options stack.
func (p *parser) pushOptions() {
	current := p.options
	p.optionsStack = append(p.optionsStack, current)
}
// addToConcatenate appends the cch pattern characters starting at pos to the
// current concatenation: as an ntMulti string node when cch > 1, as an ntOne
// char node when cch == 1, and not at all when cch == 0.
//
// When the IgnoreCase option is on and isReplacement is false, the characters
// are lowercased. The node always receives its own copy of the characters, so
// the lowercasing never rewrites p.pattern and the node does not alias the
// pattern's backing array.
func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
	if cch == 0 {
		return
	}

	foldCase := p.useOptionI() && !isReplacement

	var node *regexNode
	if cch > 1 {
		// Copy first: a plain sub-slice would share storage with p.pattern
		// and the in-place ToLower below would corrupt the pattern text.
		str := make([]rune, cch)
		copy(str, p.pattern[pos:pos+cch])

		if foldCase {
			// We do the ToLower character by character for consistency. With surrogate chars, doing
			// a ToLower on the entire string could actually change the surrogate pair. This is more correct
			// linguistically, but since Regex doesn't support surrogates, it's more important to be
			// consistent.
			for i := range str {
				str[i] = unicode.ToLower(str[i])
			}
		}
		node = newRegexNodeStr(ntMulti, p.options, str)
	} else {
		ch := p.charAt(pos)
		if foldCase {
			ch = unicode.ToLower(ch)
		}
		node = newRegexNodeCh(ntOne, p.options, ch)
	}

	p.concatenation.addChild(node)
}
// pushGroup saves the parser state in response to an open paren. The current
// group, alternation and concatenation are chained through their next
// pointers (concatenation -> alternation -> group -> previous stack top) and
// the concatenation becomes the new top of the stack.
func (p *parser) pushGroup() {
	grp, alt, cat := p.group, p.alternation, p.concatenation

	grp.next = p.stack
	alt.next = grp
	cat.next = alt
	p.stack = cat
}
// popGroup restores the parser state saved by pushGroup (in response to a
// ')'): concatenation, alternation, group and stack are unwound along the
// next pointers.
//
// If the restored group is an ntTestgroup that has no children yet, the
// current unit becomes its first child and is cleared; a missing unit in that
// situation yields ErrConditionalExpression.
func (p *parser) popGroup() error {
	p.concatenation = p.stack
	p.alternation = p.concatenation.next
	p.group = p.alternation.next
	p.stack = p.group.next

	// Only the first () inside a Testgroup goes directly to the group.
	if p.group.t != ntTestgroup || len(p.group.children) != 0 {
		return nil
	}
	if p.unit == nil {
		return p.getErr(ErrConditionalExpression)
	}

	p.group.addChild(p.unit)
	p.unit = nil
	return nil
}
// emptyStack reports whether no group state is currently pushed.
func (p *parser) emptyStack() bool {
	return nil == p.stack
}
// startGroup begins a fresh round of parser state (in response to an open
// paren or the start of the pattern): openGroup becomes the current group and
// new, empty alternation and concatenation nodes are created with the current
// options.
func (p *parser) startGroup(openGroup *regexNode) {
	opts := p.options
	p.group = openGroup
	p.alternation = newRegexNode(ntAlternate, opts)
	p.concatenation = newRegexNode(ntConcatenate, opts)
}
// addAlternate finishes the current concatenation in response to a '|' and
// starts a new, empty one. Inside an ntTestgroup/ntTestref group the finished
// branch is added directly to the group; otherwise it is added to the current
// alternation.
func (p *parser) addAlternate() {
	target := p.alternation
	if p.group.t == ntTestgroup || p.group.t == ntTestref {
		target = p.group
	}
	target.addChild(p.concatenation.reverseLeft())

	p.concatenation = newRegexNode(ntConcatenate, p.options)
}
// Categories used to classify ASCII characters in the _category table.
// The numeric order is significant: the predicates in this file compare with
// >= (isSpecial accepts >= S, isStopperX accepts >= X, isQuantifier accepts
// >= Q), so a higher category is also matched by every lower threshold.
// Only Q carries the explicit type byte; the others are untyped constants.
const (
Q byte = 5 // quantifier
S = 4 // ordinary stopper
Z = 3 // ScanBlank stopper
X = 2 // whitespace
E = 1 // should be escaped
)
// _category is indexed by ASCII code point and holds the category (Q, S, Z, X
// or 0 for none) of that character. Each row below covers 32 consecutive code
// points; the comment above a row labels its columns. Callers range-check the
// character before indexing.
var _category = []byte{
//01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
//@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
//'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
}
// isSpace reports whether ch is classified as whitespace (category X) in the
// _category table. Only characters up to ' ' are considered.
func isSpace(ch rune) bool {
	if ch > ' ' {
		return false
	}
	return _category[ch] == X
}
// isSpecial reports whether ch terminates a run of ordinary characters, i.e.
// its category in the _category table is S or higher.
func isSpecial(ch rune) bool {
	if ch > '|' {
		return false
	}
	return _category[ch] >= S
}
// isStopperX reports whether ch terminates a run of ordinary characters when
// whitespace also counts as a stopper, i.e. its category in the _category
// table is X or higher.
func isStopperX(ch rune) bool {
	if ch > '|' {
		return false
	}
	return _category[ch] >= X
}
// isQuantifier reports whether ch can begin a quantifier, i.e. its category
// in the _category table is Q.
func isQuantifier(ch rune) bool {
	if ch > '{' {
		return false
	}
	return _category[ch] >= Q
}
// isTrueQuantifier reports whether a real quantifier starts at the current
// parsing position, without moving that position.
//
// For any character other than '{' the answer comes from the _category table.
// A '{' counts only when it opens a well-formed "{n}", "{n,}" or "{n,m}" with
// decimal digits; anything else (including "{,m}" or a missing '}') is not a
// quantifier.
func (p *parser) isTrueQuantifier() bool {
	remaining := p.charsRight()
	if remaining == 0 {
		return false
	}

	start := p.textpos()
	ch := p.charAt(start)
	if ch != '{' {
		return ch <= '{' && _category[ch] >= Q
	}

	pos := start

	// skipDigits advances pos past a run of decimal digits, leaving ch at the
	// first non-digit. It stops early, with remaining <= 0, if the pattern
	// runs out.
	skipDigits := func() {
		for {
			remaining--
			if remaining <= 0 {
				return
			}
			pos++
			ch = p.charAt(pos)
			if ch < '0' || ch > '9' {
				return
			}
		}
	}

	// Lower bound: at least one digit is required.
	skipDigits()
	if remaining == 0 || pos-start == 1 {
		return false
	}

	switch ch {
	case '}':
		return true
	case ',':
		// An optional upper bound follows.
	default:
		return false
	}

	skipDigits()
	return remaining > 0 && ch == '}'
}