Pagination, readable error messages to user, syntax highlighting started.
Pagination now works. Temporary hardcoded 10 pastes per page, will be put in configuration later. Maybe. From now user will receive readable error message if error occured. Started to work on syntax highlighting, tried to make lexers detection work but apparently to no avail.
This commit is contained in:
784
vendor/github.com/dlclark/regexp2/syntax/charclass.go
generated
vendored
Normal file
784
vendor/github.com/dlclark/regexp2/syntax/charclass.go
generated
vendored
Normal file
@@ -0,0 +1,784 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"sort"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// CharSet combines start-end rune ranges and unicode categories representing a set of characters
|
||||
type CharSet struct {
|
||||
ranges []singleRange
|
||||
categories []category
|
||||
sub *CharSet //optional subtractor
|
||||
negate bool
|
||||
anything bool
|
||||
}
|
||||
|
||||
type category struct {
|
||||
negate bool
|
||||
cat string
|
||||
}
|
||||
|
||||
type singleRange struct {
|
||||
first rune
|
||||
last rune
|
||||
}
|
||||
|
||||
const (
|
||||
spaceCategoryText = " "
|
||||
wordCategoryText = "W"
|
||||
)
|
||||
|
||||
var (
|
||||
ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
|
||||
ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
|
||||
ecmaDigit = []rune{0x0030, 0x003a}
|
||||
)
|
||||
|
||||
var (
|
||||
AnyClass = getCharSetFromOldString([]rune{0}, false)
|
||||
ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
|
||||
NoneClass = getCharSetFromOldString(nil, false)
|
||||
ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
|
||||
NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
|
||||
ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
|
||||
NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
|
||||
ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
|
||||
NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
|
||||
|
||||
WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
|
||||
NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
|
||||
SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
|
||||
NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
|
||||
DigitClass = getCharSetFromCategoryString(false, false, "Nd")
|
||||
NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
|
||||
)
|
||||
|
||||
var unicodeCategories = func() map[string]*unicode.RangeTable {
|
||||
retVal := make(map[string]*unicode.RangeTable)
|
||||
for k, v := range unicode.Scripts {
|
||||
retVal[k] = v
|
||||
}
|
||||
for k, v := range unicode.Categories {
|
||||
retVal[k] = v
|
||||
}
|
||||
for k, v := range unicode.Properties {
|
||||
retVal[k] = v
|
||||
}
|
||||
return retVal
|
||||
}()
|
||||
|
||||
func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
|
||||
if negateCat && negateSet {
|
||||
panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
|
||||
}
|
||||
|
||||
c := CharSet{negate: negateSet}
|
||||
|
||||
c.categories = make([]category, len(cats))
|
||||
for i, cat := range cats {
|
||||
c.categories[i] = category{cat: cat, negate: negateCat}
|
||||
}
|
||||
return func() *CharSet {
|
||||
//make a copy each time
|
||||
local := c
|
||||
//return that address
|
||||
return &local
|
||||
}
|
||||
}
|
||||
|
||||
func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
|
||||
c := CharSet{}
|
||||
if len(setText) > 0 {
|
||||
fillFirst := false
|
||||
l := len(setText)
|
||||
if negate {
|
||||
if setText[0] == 0 {
|
||||
setText = setText[1:]
|
||||
} else {
|
||||
l++
|
||||
fillFirst = true
|
||||
}
|
||||
}
|
||||
|
||||
if l%2 == 0 {
|
||||
c.ranges = make([]singleRange, l/2)
|
||||
} else {
|
||||
c.ranges = make([]singleRange, l/2+1)
|
||||
}
|
||||
|
||||
first := true
|
||||
if fillFirst {
|
||||
c.ranges[0] = singleRange{first: 0}
|
||||
first = false
|
||||
}
|
||||
|
||||
i := 0
|
||||
for _, r := range setText {
|
||||
if first {
|
||||
// lower bound in a new range
|
||||
c.ranges[i] = singleRange{first: r}
|
||||
first = false
|
||||
} else {
|
||||
c.ranges[i].last = r - 1
|
||||
i++
|
||||
first = true
|
||||
}
|
||||
}
|
||||
if !first {
|
||||
c.ranges[i].last = utf8.MaxRune
|
||||
}
|
||||
}
|
||||
|
||||
return func() *CharSet {
|
||||
local := c
|
||||
return &local
|
||||
}
|
||||
}
|
||||
|
||||
// Copy makes a deep copy to prevent accidental mutation of a set
|
||||
func (c CharSet) Copy() CharSet {
|
||||
ret := CharSet{
|
||||
anything: c.anything,
|
||||
negate: c.negate,
|
||||
}
|
||||
|
||||
ret.ranges = append(ret.ranges, c.ranges...)
|
||||
ret.categories = append(ret.categories, c.categories...)
|
||||
|
||||
if c.sub != nil {
|
||||
sub := c.sub.Copy()
|
||||
ret.sub = &sub
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// gets a human-readable description for a set string
|
||||
func (c CharSet) String() string {
|
||||
buf := &bytes.Buffer{}
|
||||
buf.WriteRune('[')
|
||||
|
||||
if c.IsNegated() {
|
||||
buf.WriteRune('^')
|
||||
}
|
||||
|
||||
for _, r := range c.ranges {
|
||||
|
||||
buf.WriteString(CharDescription(r.first))
|
||||
if r.first != r.last {
|
||||
if r.last-r.first != 1 {
|
||||
//groups that are 1 char apart skip the dash
|
||||
buf.WriteRune('-')
|
||||
}
|
||||
buf.WriteString(CharDescription(r.last))
|
||||
}
|
||||
}
|
||||
|
||||
for _, c := range c.categories {
|
||||
buf.WriteString(c.String())
|
||||
}
|
||||
|
||||
if c.sub != nil {
|
||||
buf.WriteRune('-')
|
||||
buf.WriteString(c.sub.String())
|
||||
}
|
||||
|
||||
buf.WriteRune(']')
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// mapHashFill converts a charset into a buffer for use in maps
|
||||
func (c CharSet) mapHashFill(buf *bytes.Buffer) {
|
||||
if c.negate {
|
||||
buf.WriteByte(0)
|
||||
} else {
|
||||
buf.WriteByte(1)
|
||||
}
|
||||
|
||||
binary.Write(buf, binary.LittleEndian, len(c.ranges))
|
||||
binary.Write(buf, binary.LittleEndian, len(c.categories))
|
||||
for _, r := range c.ranges {
|
||||
buf.WriteRune(r.first)
|
||||
buf.WriteRune(r.last)
|
||||
}
|
||||
for _, ct := range c.categories {
|
||||
buf.WriteString(ct.cat)
|
||||
if ct.negate {
|
||||
buf.WriteByte(1)
|
||||
} else {
|
||||
buf.WriteByte(0)
|
||||
}
|
||||
}
|
||||
|
||||
if c.sub != nil {
|
||||
c.sub.mapHashFill(buf)
|
||||
}
|
||||
}
|
||||
|
||||
// CharIn returns true if the rune is in our character set (either ranges or categories).
|
||||
// It handles negations and subtracted sub-charsets.
|
||||
func (c CharSet) CharIn(ch rune) bool {
|
||||
val := false
|
||||
// in s && !s.subtracted
|
||||
|
||||
//check ranges
|
||||
for _, r := range c.ranges {
|
||||
if ch < r.first {
|
||||
continue
|
||||
}
|
||||
if ch <= r.last {
|
||||
val = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
//check categories if we haven't already found a range
|
||||
if !val && len(c.categories) > 0 {
|
||||
for _, ct := range c.categories {
|
||||
// special categories...then unicode
|
||||
if ct.cat == spaceCategoryText {
|
||||
if unicode.IsSpace(ch) {
|
||||
// we found a space so we're done
|
||||
// negate means this is a "bad" thing
|
||||
val = !ct.negate
|
||||
break
|
||||
} else if ct.negate {
|
||||
val = true
|
||||
break
|
||||
}
|
||||
} else if ct.cat == wordCategoryText {
|
||||
if IsWordChar(ch) {
|
||||
val = !ct.negate
|
||||
break
|
||||
} else if ct.negate {
|
||||
val = true
|
||||
break
|
||||
}
|
||||
} else if unicode.Is(unicodeCategories[ct.cat], ch) {
|
||||
// if we're in this unicode category then we're done
|
||||
// if negate=true on this category then we "failed" our test
|
||||
// otherwise we're good that we found it
|
||||
val = !ct.negate
|
||||
break
|
||||
} else if ct.negate {
|
||||
val = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// negate the whole char set
|
||||
if c.negate {
|
||||
val = !val
|
||||
}
|
||||
|
||||
// get subtracted recurse
|
||||
if val && c.sub != nil {
|
||||
val = !c.sub.CharIn(ch)
|
||||
}
|
||||
|
||||
//log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
|
||||
return val
|
||||
}
|
||||
|
||||
func (c category) String() string {
|
||||
switch c.cat {
|
||||
case spaceCategoryText:
|
||||
if c.negate {
|
||||
return "\\S"
|
||||
}
|
||||
return "\\s"
|
||||
case wordCategoryText:
|
||||
if c.negate {
|
||||
return "\\W"
|
||||
}
|
||||
return "\\w"
|
||||
}
|
||||
if _, ok := unicodeCategories[c.cat]; ok {
|
||||
|
||||
if c.negate {
|
||||
return "\\P{" + c.cat + "}"
|
||||
}
|
||||
return "\\p{" + c.cat + "}"
|
||||
}
|
||||
return "Unknown category: " + c.cat
|
||||
}
|
||||
|
||||
// CharDescription Produces a human-readable description for a single character.
|
||||
func CharDescription(ch rune) string {
|
||||
/*if ch == '\\' {
|
||||
return "\\\\"
|
||||
}
|
||||
|
||||
if ch > ' ' && ch <= '~' {
|
||||
return string(ch)
|
||||
} else if ch == '\n' {
|
||||
return "\\n"
|
||||
} else if ch == ' ' {
|
||||
return "\\ "
|
||||
}*/
|
||||
|
||||
b := &bytes.Buffer{}
|
||||
escape(b, ch, false) //fmt.Sprintf("%U", ch)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
|
||||
// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
|
||||
// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
|
||||
// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
|
||||
func IsWordChar(r rune) bool {
|
||||
//"L", "Mn", "Nd", "Pc"
|
||||
return unicode.In(r,
|
||||
unicode.Categories["L"], unicode.Categories["Mn"],
|
||||
unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
|
||||
//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
|
||||
}
|
||||
|
||||
func IsECMAWordChar(r rune) bool {
|
||||
return unicode.In(r,
|
||||
unicode.Categories["L"], unicode.Categories["Mn"],
|
||||
unicode.Categories["Nd"], unicode.Categories["Pc"])
|
||||
|
||||
//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
|
||||
}
|
||||
|
||||
// SingletonChar will return the char from the first range without validation.
|
||||
// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
|
||||
func (c CharSet) SingletonChar() rune {
|
||||
return c.ranges[0].first
|
||||
}
|
||||
|
||||
func (c CharSet) IsSingleton() bool {
|
||||
return !c.negate && //negated is multiple chars
|
||||
len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
||||
c.sub == nil && // subtraction means we've got multiple chars
|
||||
c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
||||
}
|
||||
|
||||
func (c CharSet) IsSingletonInverse() bool {
|
||||
return c.negate && //same as above, but requires negated
|
||||
len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
||||
c.sub == nil && // subtraction means we've got multiple chars
|
||||
c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
||||
}
|
||||
|
||||
func (c CharSet) IsMergeable() bool {
|
||||
return !c.IsNegated() && !c.HasSubtraction()
|
||||
}
|
||||
|
||||
func (c CharSet) IsNegated() bool {
|
||||
return c.negate
|
||||
}
|
||||
|
||||
func (c CharSet) HasSubtraction() bool {
|
||||
return c.sub != nil
|
||||
}
|
||||
|
||||
func (c CharSet) IsEmpty() bool {
|
||||
return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
|
||||
}
|
||||
|
||||
func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
|
||||
if ecma {
|
||||
if negate {
|
||||
c.addRanges(NotECMADigitClass().ranges)
|
||||
} else {
|
||||
c.addRanges(ECMADigitClass().ranges)
|
||||
}
|
||||
} else {
|
||||
c.addCategories(category{cat: "Nd", negate: negate})
|
||||
}
|
||||
}
|
||||
|
||||
func (c *CharSet) addChar(ch rune) {
|
||||
c.addRange(ch, ch)
|
||||
}
|
||||
|
||||
func (c *CharSet) addSpace(ecma, negate bool) {
|
||||
if ecma {
|
||||
if negate {
|
||||
c.addRanges(NotECMASpaceClass().ranges)
|
||||
} else {
|
||||
c.addRanges(ECMASpaceClass().ranges)
|
||||
}
|
||||
} else {
|
||||
c.addCategories(category{cat: spaceCategoryText, negate: negate})
|
||||
}
|
||||
}
|
||||
|
||||
func (c *CharSet) addWord(ecma, negate bool) {
|
||||
if ecma {
|
||||
if negate {
|
||||
c.addRanges(NotECMAWordClass().ranges)
|
||||
} else {
|
||||
c.addRanges(ECMAWordClass().ranges)
|
||||
}
|
||||
} else {
|
||||
c.addCategories(category{cat: wordCategoryText, negate: negate})
|
||||
}
|
||||
}
|
||||
|
||||
// Add set ranges and categories into ours -- no deduping or anything
|
||||
func (c *CharSet) addSet(set CharSet) {
|
||||
if c.anything {
|
||||
return
|
||||
}
|
||||
if set.anything {
|
||||
c.makeAnything()
|
||||
return
|
||||
}
|
||||
// just append here to prevent double-canon
|
||||
c.ranges = append(c.ranges, set.ranges...)
|
||||
c.addCategories(set.categories...)
|
||||
c.canonicalize()
|
||||
}
|
||||
|
||||
func (c *CharSet) makeAnything() {
|
||||
c.anything = true
|
||||
c.categories = []category{}
|
||||
c.ranges = AnyClass().ranges
|
||||
}
|
||||
|
||||
func (c *CharSet) addCategories(cats ...category) {
|
||||
// don't add dupes and remove positive+negative
|
||||
if c.anything {
|
||||
// if we've had a previous positive+negative group then
|
||||
// just return, we're as broad as we can get
|
||||
return
|
||||
}
|
||||
|
||||
for _, ct := range cats {
|
||||
found := false
|
||||
for _, ct2 := range c.categories {
|
||||
if ct.cat == ct2.cat {
|
||||
if ct.negate != ct2.negate {
|
||||
// oposite negations...this mean we just
|
||||
// take us as anything and move on
|
||||
c.makeAnything()
|
||||
return
|
||||
}
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
c.categories = append(c.categories, ct)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merges new ranges to our own
|
||||
func (c *CharSet) addRanges(ranges []singleRange) {
|
||||
if c.anything {
|
||||
return
|
||||
}
|
||||
c.ranges = append(c.ranges, ranges...)
|
||||
c.canonicalize()
|
||||
}
|
||||
|
||||
func isValidUnicodeCat(catName string) bool {
|
||||
_, ok := unicodeCategories[catName]
|
||||
return ok
|
||||
}
|
||||
|
||||
func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
|
||||
if !isValidUnicodeCat(categoryName) {
|
||||
// unknown unicode category, script, or property "blah"
|
||||
panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
|
||||
|
||||
}
|
||||
|
||||
if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
|
||||
// when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
|
||||
c.addCategories(
|
||||
category{cat: "Ll", negate: negate},
|
||||
category{cat: "Lu", negate: negate},
|
||||
category{cat: "Lt", negate: negate})
|
||||
}
|
||||
c.addCategories(category{cat: categoryName, negate: negate})
|
||||
}
|
||||
|
||||
func (c *CharSet) addSubtraction(sub *CharSet) {
|
||||
c.sub = sub
|
||||
}
|
||||
|
||||
func (c *CharSet) addRange(chMin, chMax rune) {
|
||||
c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
|
||||
c.canonicalize()
|
||||
}
|
||||
|
||||
type singleRangeSorter []singleRange
|
||||
|
||||
func (p singleRangeSorter) Len() int { return len(p) }
|
||||
func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
|
||||
func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
|
||||
|
||||
// Logic to reduce a character class to a unique, sorted form.
|
||||
func (c *CharSet) canonicalize() {
|
||||
var i, j int
|
||||
var last rune
|
||||
|
||||
//
|
||||
// Find and eliminate overlapping or abutting ranges
|
||||
//
|
||||
|
||||
if len(c.ranges) > 1 {
|
||||
sort.Sort(singleRangeSorter(c.ranges))
|
||||
|
||||
done := false
|
||||
|
||||
for i, j = 1, 0; ; i++ {
|
||||
for last = c.ranges[j].last; ; i++ {
|
||||
if i == len(c.ranges) || last == utf8.MaxRune {
|
||||
done = true
|
||||
break
|
||||
}
|
||||
|
||||
CurrentRange := c.ranges[i]
|
||||
if CurrentRange.first > last+1 {
|
||||
break
|
||||
}
|
||||
|
||||
if last < CurrentRange.last {
|
||||
last = CurrentRange.last
|
||||
}
|
||||
}
|
||||
|
||||
c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
|
||||
|
||||
j++
|
||||
|
||||
if done {
|
||||
break
|
||||
}
|
||||
|
||||
if j < i {
|
||||
c.ranges[j] = c.ranges[i]
|
||||
}
|
||||
}
|
||||
|
||||
c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
|
||||
}
|
||||
}
|
||||
|
||||
// Adds to the class any lowercase versions of characters already
|
||||
// in the class. Used for case-insensitivity.
|
||||
func (c *CharSet) addLowercase() {
|
||||
if c.anything {
|
||||
return
|
||||
}
|
||||
toAdd := []singleRange{}
|
||||
for i := 0; i < len(c.ranges); i++ {
|
||||
r := c.ranges[i]
|
||||
if r.first == r.last {
|
||||
lower := unicode.ToLower(r.first)
|
||||
c.ranges[i] = singleRange{first: lower, last: lower}
|
||||
} else {
|
||||
toAdd = append(toAdd, r)
|
||||
}
|
||||
}
|
||||
|
||||
for _, r := range toAdd {
|
||||
c.addLowercaseRange(r.first, r.last)
|
||||
}
|
||||
c.canonicalize()
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
Let U be the set of Unicode character values and let L be the lowercase
|
||||
function, mapping from U to U. To perform case insensitive matching of
|
||||
character sets, we need to be able to map an interval I in U, say
|
||||
|
||||
I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
|
||||
|
||||
to a set A such that A contains L(I) and A is contained in the union of
|
||||
I and L(I).
|
||||
|
||||
The table below partitions U into intervals on which L is non-decreasing.
|
||||
Thus, for any interval J = [a, b] contained in one of these intervals,
|
||||
L(J) is contained in [L(a), L(b)].
|
||||
|
||||
It is also true that for any such J, [L(a), L(b)] is contained in the
|
||||
union of J and L(J). This does not follow from L being non-decreasing on
|
||||
these intervals. It follows from the nature of the L on each interval.
|
||||
On each interval, L has one of the following forms:
|
||||
|
||||
(1) L(ch) = constant (LowercaseSet)
|
||||
(2) L(ch) = ch + offset (LowercaseAdd)
|
||||
(3) L(ch) = ch | 1 (LowercaseBor)
|
||||
(4) L(ch) = ch + (ch & 1) (LowercaseBad)
|
||||
|
||||
It is easy to verify that for any of these forms [L(a), L(b)] is
|
||||
contained in the union of [a, b] and L([a, b]).
|
||||
***************************************************************************/
|
||||
|
||||
const (
|
||||
LowercaseSet = 0 // Set to arg.
|
||||
LowercaseAdd = 1 // Add arg.
|
||||
LowercaseBor = 2 // Bitwise or with 1.
|
||||
LowercaseBad = 3 // Bitwise and with 1 and add original.
|
||||
)
|
||||
|
||||
type lcMap struct {
|
||||
chMin, chMax rune
|
||||
op, data int32
|
||||
}
|
||||
|
||||
var lcTable = []lcMap{
|
||||
lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
|
||||
lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
|
||||
lcMap{'\u0100', '\u012E', LowercaseBor, 0},
|
||||
lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
|
||||
lcMap{'\u0132', '\u0136', LowercaseBor, 0},
|
||||
lcMap{'\u0139', '\u0147', LowercaseBad, 0},
|
||||
lcMap{'\u014A', '\u0176', LowercaseBor, 0},
|
||||
lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
|
||||
lcMap{'\u0179', '\u017D', LowercaseBad, 0},
|
||||
lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
|
||||
lcMap{'\u0182', '\u0184', LowercaseBor, 0},
|
||||
lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
|
||||
lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
|
||||
lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
|
||||
lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
|
||||
lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
|
||||
lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
|
||||
lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
|
||||
lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
|
||||
lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
|
||||
lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
|
||||
lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
|
||||
lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
|
||||
lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
|
||||
lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
|
||||
lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
|
||||
lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
|
||||
lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
|
||||
lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
|
||||
lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
|
||||
lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
|
||||
lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
|
||||
lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
|
||||
lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
|
||||
lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
|
||||
lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
|
||||
lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
|
||||
lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
|
||||
lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
|
||||
lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
|
||||
lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
|
||||
lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
|
||||
lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
|
||||
lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
|
||||
lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
|
||||
lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
|
||||
lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
|
||||
lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
|
||||
lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
|
||||
lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
|
||||
lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
|
||||
lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
|
||||
lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
|
||||
lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
|
||||
lcMap{'\u0460', '\u0480', LowercaseBor, 0},
|
||||
lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
|
||||
lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
|
||||
lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
|
||||
lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
|
||||
lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
|
||||
lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
|
||||
lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
|
||||
lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
|
||||
lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
|
||||
lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
|
||||
lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
|
||||
lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
|
||||
lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
|
||||
lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
|
||||
lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
|
||||
lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
|
||||
lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
|
||||
lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
|
||||
lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
|
||||
lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
|
||||
lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
|
||||
lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
|
||||
lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
|
||||
lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
|
||||
lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
|
||||
lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
|
||||
lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
|
||||
lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
|
||||
lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
|
||||
lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
|
||||
lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
|
||||
lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
|
||||
lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
|
||||
}
|
||||
|
||||
func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
|
||||
var i, iMax, iMid int
|
||||
var chMinT, chMaxT rune
|
||||
var lc lcMap
|
||||
|
||||
for i, iMax = 0, len(lcTable); i < iMax; {
|
||||
iMid = (i + iMax) / 2
|
||||
if lcTable[iMid].chMax < chMin {
|
||||
i = iMid + 1
|
||||
} else {
|
||||
iMax = iMid
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < len(lcTable); i++ {
|
||||
lc = lcTable[i]
|
||||
if lc.chMin > chMax {
|
||||
return
|
||||
}
|
||||
chMinT = lc.chMin
|
||||
if chMinT < chMin {
|
||||
chMinT = chMin
|
||||
}
|
||||
|
||||
chMaxT = lc.chMax
|
||||
if chMaxT > chMax {
|
||||
chMaxT = chMax
|
||||
}
|
||||
|
||||
switch lc.op {
|
||||
case LowercaseSet:
|
||||
chMinT = rune(lc.data)
|
||||
chMaxT = rune(lc.data)
|
||||
break
|
||||
case LowercaseAdd:
|
||||
chMinT += lc.data
|
||||
chMaxT += lc.data
|
||||
break
|
||||
case LowercaseBor:
|
||||
chMinT |= 1
|
||||
chMaxT |= 1
|
||||
break
|
||||
case LowercaseBad:
|
||||
chMinT += (chMinT & 1)
|
||||
chMaxT += (chMaxT & 1)
|
||||
break
|
||||
}
|
||||
|
||||
if chMinT < chMin || chMaxT > chMax {
|
||||
c.addRange(chMinT, chMaxT)
|
||||
}
|
||||
}
|
||||
}
|
274
vendor/github.com/dlclark/regexp2/syntax/code.go
generated
vendored
Normal file
274
vendor/github.com/dlclark/regexp2/syntax/code.go
generated
vendored
Normal file
@@ -0,0 +1,274 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
// similar to prog.go in the go regex package...also with comment 'may not belong in this package'
|
||||
|
||||
// File provides operator constants for use by the Builder and the Machine.
|
||||
|
||||
// Implementation notes:
|
||||
//
|
||||
// Regexps are built into RegexCodes, which contain an operation array,
|
||||
// a string table, and some constants.
|
||||
//
|
||||
// Each operation is one of the codes below, followed by the integer
|
||||
// operands specified for each op.
|
||||
//
|
||||
// Strings and sets are indices into a string table.
|
||||
|
||||
type InstOp int
|
||||
|
||||
const (
|
||||
// lef/back operands description
|
||||
|
||||
Onerep InstOp = 0 // lef,back char,min,max a {n}
|
||||
Notonerep = 1 // lef,back char,min,max .{n}
|
||||
Setrep = 2 // lef,back set,min,max [\d]{n}
|
||||
|
||||
Oneloop = 3 // lef,back char,min,max a {,n}
|
||||
Notoneloop = 4 // lef,back char,min,max .{,n}
|
||||
Setloop = 5 // lef,back set,min,max [\d]{,n}
|
||||
|
||||
Onelazy = 6 // lef,back char,min,max a {,n}?
|
||||
Notonelazy = 7 // lef,back char,min,max .{,n}?
|
||||
Setlazy = 8 // lef,back set,min,max [\d]{,n}?
|
||||
|
||||
One = 9 // lef char a
|
||||
Notone = 10 // lef char [^a]
|
||||
Set = 11 // lef set [a-z\s] \w \s \d
|
||||
|
||||
Multi = 12 // lef string abcd
|
||||
Ref = 13 // lef group \#
|
||||
|
||||
Bol = 14 // ^
|
||||
Eol = 15 // $
|
||||
Boundary = 16 // \b
|
||||
Nonboundary = 17 // \B
|
||||
Beginning = 18 // \A
|
||||
Start = 19 // \G
|
||||
EndZ = 20 // \Z
|
||||
End = 21 // \Z
|
||||
|
||||
Nothing = 22 // Reject!
|
||||
|
||||
// Primitive control structures
|
||||
|
||||
Lazybranch = 23 // back jump straight first
|
||||
Branchmark = 24 // back jump branch first for loop
|
||||
Lazybranchmark = 25 // back jump straight first for loop
|
||||
Nullcount = 26 // back val set counter, null mark
|
||||
Setcount = 27 // back val set counter, make mark
|
||||
Branchcount = 28 // back jump,limit branch++ if zero<=c<limit
|
||||
Lazybranchcount = 29 // back jump,limit same, but straight first
|
||||
Nullmark = 30 // back save position
|
||||
Setmark = 31 // back save position
|
||||
Capturemark = 32 // back group define group
|
||||
Getmark = 33 // back recall position
|
||||
Setjump = 34 // back save backtrack state
|
||||
Backjump = 35 // zap back to saved state
|
||||
Forejump = 36 // zap backtracking state
|
||||
Testref = 37 // backtrack if ref undefined
|
||||
Goto = 38 // jump just go
|
||||
|
||||
Prune = 39 // prune it baby
|
||||
Stop = 40 // done!
|
||||
|
||||
ECMABoundary = 41 // \b
|
||||
NonECMABoundary = 42 // \B
|
||||
|
||||
// Modifiers for alternate modes
|
||||
|
||||
Mask = 63 // Mask to get unmodified ordinary operator
|
||||
Rtl = 64 // bit to indicate that we're reverse scanning.
|
||||
Back = 128 // bit to indicate that we're backtracking.
|
||||
Back2 = 256 // bit to indicate that we're backtracking on a second branch.
|
||||
Ci = 512 // bit to indicate that we're case-insensitive.
|
||||
)
|
||||
|
||||
type Code struct {
|
||||
Codes []int // the code
|
||||
Strings [][]rune // string table
|
||||
Sets []*CharSet //character set table
|
||||
TrackCount int // how many instructions use backtracking
|
||||
Caps map[int]int // mapping of user group numbers -> impl group slots
|
||||
Capsize int // number of impl group slots
|
||||
FcPrefix *Prefix // the set of candidate first characters (may be null)
|
||||
BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null)
|
||||
Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
|
||||
RightToLeft bool // true if right to left
|
||||
}
|
||||
|
||||
func opcodeBacktracks(op InstOp) bool {
|
||||
op &= Mask
|
||||
|
||||
switch op {
|
||||
case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
|
||||
Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
|
||||
Forejump, Goto:
|
||||
return true
|
||||
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func opcodeSize(op InstOp) int {
|
||||
op &= Mask
|
||||
|
||||
switch op {
|
||||
case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
|
||||
End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
|
||||
return 1
|
||||
|
||||
case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
|
||||
Prune, Set:
|
||||
return 2
|
||||
|
||||
case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
|
||||
Setlazy, Setrep, Setloop:
|
||||
return 3
|
||||
|
||||
default:
|
||||
panic(fmt.Errorf("Unexpected op code: %v", op))
|
||||
}
|
||||
}
|
||||
|
||||
var codeStr = []string{
|
||||
"Onerep", "Notonerep", "Setrep",
|
||||
"Oneloop", "Notoneloop", "Setloop",
|
||||
"Onelazy", "Notonelazy", "Setlazy",
|
||||
"One", "Notone", "Set",
|
||||
"Multi", "Ref",
|
||||
"Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
|
||||
"Nothing",
|
||||
"Lazybranch", "Branchmark", "Lazybranchmark",
|
||||
"Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
|
||||
"Nullmark", "Setmark", "Capturemark", "Getmark",
|
||||
"Setjump", "Backjump", "Forejump", "Testref", "Goto",
|
||||
"Prune", "Stop",
|
||||
"ECMABoundary", "NonECMABoundary",
|
||||
}
|
||||
|
||||
func operatorDescription(op InstOp) string {
|
||||
desc := codeStr[op&Mask]
|
||||
if (op & Ci) != 0 {
|
||||
desc += "-Ci"
|
||||
}
|
||||
if (op & Rtl) != 0 {
|
||||
desc += "-Rtl"
|
||||
}
|
||||
if (op & Back) != 0 {
|
||||
desc += "-Back"
|
||||
}
|
||||
if (op & Back2) != 0 {
|
||||
desc += "-Back2"
|
||||
}
|
||||
|
||||
return desc
|
||||
}
|
||||
|
||||
// OpcodeDescription is a humman readable string of the specific offset
|
||||
func (c *Code) OpcodeDescription(offset int) string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
op := InstOp(c.Codes[offset])
|
||||
fmt.Fprintf(buf, "%06d ", offset)
|
||||
|
||||
if opcodeBacktracks(op & Mask) {
|
||||
buf.WriteString("*")
|
||||
} else {
|
||||
buf.WriteString(" ")
|
||||
}
|
||||
buf.WriteString(operatorDescription(op))
|
||||
buf.WriteString("(")
|
||||
op &= Mask
|
||||
|
||||
switch op {
|
||||
case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
|
||||
buf.WriteString("Ch = ")
|
||||
buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
|
||||
|
||||
case Set, Setrep, Setloop, Setlazy:
|
||||
buf.WriteString("Set = ")
|
||||
buf.WriteString(c.Sets[c.Codes[offset+1]].String())
|
||||
|
||||
case Multi:
|
||||
fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
|
||||
|
||||
case Ref, Testref:
|
||||
fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
|
||||
|
||||
case Capturemark:
|
||||
fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
|
||||
if c.Codes[offset+2] != -1 {
|
||||
fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
|
||||
}
|
||||
|
||||
case Nullcount, Setcount:
|
||||
fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
|
||||
|
||||
case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
|
||||
fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
|
||||
}
|
||||
|
||||
switch op {
|
||||
case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
|
||||
buf.WriteString(", Rep = ")
|
||||
if c.Codes[offset+2] == math.MaxInt32 {
|
||||
buf.WriteString("inf")
|
||||
} else {
|
||||
fmt.Fprintf(buf, "%d", c.Codes[offset+2])
|
||||
}
|
||||
|
||||
case Branchcount, Lazybranchcount:
|
||||
buf.WriteString(", Limit = ")
|
||||
if c.Codes[offset+2] == math.MaxInt32 {
|
||||
buf.WriteString("inf")
|
||||
} else {
|
||||
fmt.Fprintf(buf, "%d", c.Codes[offset+2])
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
buf.WriteString(")")
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func (c *Code) Dump() string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
if c.RightToLeft {
|
||||
fmt.Fprintln(buf, "Direction: right-to-left")
|
||||
} else {
|
||||
fmt.Fprintln(buf, "Direction: left-to-right")
|
||||
}
|
||||
if c.FcPrefix == nil {
|
||||
fmt.Fprintln(buf, "Firstchars: n/a")
|
||||
} else {
|
||||
fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
|
||||
}
|
||||
|
||||
if c.BmPrefix == nil {
|
||||
fmt.Fprintln(buf, "Prefix: n/a")
|
||||
} else {
|
||||
fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
|
||||
}
|
||||
|
||||
fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
|
||||
fmt.Fprintln(buf)
|
||||
|
||||
if c.BmPrefix != nil {
|
||||
fmt.Fprintln(buf, "BoyerMoore:")
|
||||
fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
|
||||
}
|
||||
for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
|
||||
fmt.Fprintln(buf, c.OpcodeDescription(i))
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
94
vendor/github.com/dlclark/regexp2/syntax/escape.go
generated
vendored
Normal file
94
vendor/github.com/dlclark/regexp2/syntax/escape.go
generated
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
func Escape(input string) string {
|
||||
b := &bytes.Buffer{}
|
||||
for _, r := range input {
|
||||
escape(b, r, false)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
const meta = `\.+*?()|[]{}^$# `
|
||||
|
||||
func escape(b *bytes.Buffer, r rune, force bool) {
|
||||
if unicode.IsPrint(r) {
|
||||
if strings.IndexRune(meta, r) >= 0 || force {
|
||||
b.WriteRune('\\')
|
||||
}
|
||||
b.WriteRune(r)
|
||||
return
|
||||
}
|
||||
|
||||
switch r {
|
||||
case '\a':
|
||||
b.WriteString(`\a`)
|
||||
case '\f':
|
||||
b.WriteString(`\f`)
|
||||
case '\n':
|
||||
b.WriteString(`\n`)
|
||||
case '\r':
|
||||
b.WriteString(`\r`)
|
||||
case '\t':
|
||||
b.WriteString(`\t`)
|
||||
case '\v':
|
||||
b.WriteString(`\v`)
|
||||
default:
|
||||
if r < 0x100 {
|
||||
b.WriteString(`\x`)
|
||||
s := strconv.FormatInt(int64(r), 16)
|
||||
if len(s) == 1 {
|
||||
b.WriteRune('0')
|
||||
}
|
||||
b.WriteString(s)
|
||||
break
|
||||
}
|
||||
b.WriteString(`\u`)
|
||||
b.WriteString(strconv.FormatInt(int64(r), 16))
|
||||
}
|
||||
}
|
||||
|
||||
func Unescape(input string) (string, error) {
|
||||
idx := strings.IndexRune(input, '\\')
|
||||
// no slashes means no unescape needed
|
||||
if idx == -1 {
|
||||
return input, nil
|
||||
}
|
||||
|
||||
buf := bytes.NewBufferString(input[:idx])
|
||||
// get the runes for the rest of the string -- we're going full parser scan on this
|
||||
|
||||
p := parser{}
|
||||
p.setPattern(input[idx+1:])
|
||||
for {
|
||||
if p.rightMost() {
|
||||
return "", p.getErr(ErrIllegalEndEscape)
|
||||
}
|
||||
r, err := p.scanCharEscape()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
buf.WriteRune(r)
|
||||
// are we done?
|
||||
if p.rightMost() {
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
r = p.moveRightGetChar()
|
||||
for r != '\\' {
|
||||
buf.WriteRune(r)
|
||||
if p.rightMost() {
|
||||
// we're done, no more slashes
|
||||
return buf.String(), nil
|
||||
}
|
||||
// keep scanning until we get another slash
|
||||
r = p.moveRightGetChar()
|
||||
}
|
||||
}
|
||||
}
|
20
vendor/github.com/dlclark/regexp2/syntax/fuzz.go
generated
vendored
Normal file
20
vendor/github.com/dlclark/regexp2/syntax/fuzz.go
generated
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
// +build gofuzz
|
||||
|
||||
package syntax
|
||||
|
||||
// Fuzz is the input point for go-fuzz
|
||||
func Fuzz(data []byte) int {
|
||||
sdata := string(data)
|
||||
tree, err := Parse(sdata, RegexOptions(0))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
// translate it to code
|
||||
_, err = Write(tree)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return 1
|
||||
}
|
2125
vendor/github.com/dlclark/regexp2/syntax/parser.go
generated
vendored
Normal file
2125
vendor/github.com/dlclark/regexp2/syntax/parser.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
896
vendor/github.com/dlclark/regexp2/syntax/prefix.go
generated
vendored
Normal file
896
vendor/github.com/dlclark/regexp2/syntax/prefix.go
generated
vendored
Normal file
@@ -0,0 +1,896 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
type Prefix struct {
|
||||
PrefixStr []rune
|
||||
PrefixSet CharSet
|
||||
CaseInsensitive bool
|
||||
}
|
||||
|
||||
// It takes a RegexTree and computes the set of chars that can start it.
|
||||
func getFirstCharsPrefix(tree *RegexTree) *Prefix {
|
||||
s := regexFcd{
|
||||
fcStack: make([]regexFc, 32),
|
||||
intStack: make([]int, 32),
|
||||
}
|
||||
fc := s.regexFCFromRegexTree(tree)
|
||||
|
||||
if fc == nil || fc.nullable || fc.cc.IsEmpty() {
|
||||
return nil
|
||||
}
|
||||
fcSet := fc.getFirstChars()
|
||||
return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
|
||||
}
|
||||
|
||||
type regexFcd struct {
|
||||
intStack []int
|
||||
intDepth int
|
||||
fcStack []regexFc
|
||||
fcDepth int
|
||||
skipAllChildren bool // don't process any more children at the current level
|
||||
skipchild bool // don't process the current child.
|
||||
failed bool
|
||||
}
|
||||
|
||||
/*
|
||||
* The main FC computation. It does a shortcutted depth-first walk
|
||||
* through the tree and calls CalculateFC to emits code before
|
||||
* and after each child of an interior node, and at each leaf.
|
||||
*/
|
||||
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
|
||||
curNode := tree.root
|
||||
curChild := 0
|
||||
|
||||
for {
|
||||
if len(curNode.children) == 0 {
|
||||
// This is a leaf node
|
||||
s.calculateFC(curNode.t, curNode, 0)
|
||||
} else if curChild < len(curNode.children) && !s.skipAllChildren {
|
||||
// This is an interior node, and we have more children to analyze
|
||||
s.calculateFC(curNode.t|beforeChild, curNode, curChild)
|
||||
|
||||
if !s.skipchild {
|
||||
curNode = curNode.children[curChild]
|
||||
// this stack is how we get a depth first walk of the tree.
|
||||
s.pushInt(curChild)
|
||||
curChild = 0
|
||||
} else {
|
||||
curChild++
|
||||
s.skipchild = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// This is an interior node where we've finished analyzing all the children, or
|
||||
// the end of a leaf node.
|
||||
s.skipAllChildren = false
|
||||
|
||||
if s.intIsEmpty() {
|
||||
break
|
||||
}
|
||||
|
||||
curChild = s.popInt()
|
||||
curNode = curNode.next
|
||||
|
||||
s.calculateFC(curNode.t|afterChild, curNode, curChild)
|
||||
if s.failed {
|
||||
return nil
|
||||
}
|
||||
|
||||
curChild++
|
||||
}
|
||||
|
||||
if s.fcIsEmpty() {
|
||||
return nil
|
||||
}
|
||||
|
||||
return s.popFC()
|
||||
}
|
||||
|
||||
// To avoid recursion, we use a simple integer stack.
|
||||
// This is the push.
|
||||
func (s *regexFcd) pushInt(I int) {
|
||||
if s.intDepth >= len(s.intStack) {
|
||||
expanded := make([]int, s.intDepth*2)
|
||||
copy(expanded, s.intStack)
|
||||
s.intStack = expanded
|
||||
}
|
||||
|
||||
s.intStack[s.intDepth] = I
|
||||
s.intDepth++
|
||||
}
|
||||
|
||||
// True if the stack is empty.
|
||||
func (s *regexFcd) intIsEmpty() bool {
|
||||
return s.intDepth == 0
|
||||
}
|
||||
|
||||
// This is the pop.
|
||||
func (s *regexFcd) popInt() int {
|
||||
s.intDepth--
|
||||
return s.intStack[s.intDepth]
|
||||
}
|
||||
|
||||
// We also use a stack of RegexFC objects.
|
||||
// This is the push.
|
||||
func (s *regexFcd) pushFC(fc regexFc) {
|
||||
if s.fcDepth >= len(s.fcStack) {
|
||||
expanded := make([]regexFc, s.fcDepth*2)
|
||||
copy(expanded, s.fcStack)
|
||||
s.fcStack = expanded
|
||||
}
|
||||
|
||||
s.fcStack[s.fcDepth] = fc
|
||||
s.fcDepth++
|
||||
}
|
||||
|
||||
// True if the stack is empty.
|
||||
func (s *regexFcd) fcIsEmpty() bool {
|
||||
return s.fcDepth == 0
|
||||
}
|
||||
|
||||
// This is the pop.
|
||||
func (s *regexFcd) popFC() *regexFc {
|
||||
s.fcDepth--
|
||||
return &s.fcStack[s.fcDepth]
|
||||
}
|
||||
|
||||
// This is the top.
|
||||
func (s *regexFcd) topFC() *regexFc {
|
||||
return &s.fcStack[s.fcDepth-1]
|
||||
}
|
||||
|
||||
// Called in Beforechild to prevent further processing of the current child
|
||||
func (s *regexFcd) skipChild() {
|
||||
s.skipchild = true
|
||||
}
|
||||
|
||||
// FC computation and shortcut cases for each node type
|
||||
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
|
||||
//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
|
||||
ci := false
|
||||
rtl := false
|
||||
|
||||
if nt <= ntRef {
|
||||
if (node.options & IgnoreCase) != 0 {
|
||||
ci = true
|
||||
}
|
||||
if (node.options & RightToLeft) != 0 {
|
||||
rtl = true
|
||||
}
|
||||
}
|
||||
|
||||
switch nt {
|
||||
case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
|
||||
break
|
||||
|
||||
case ntTestgroup | beforeChild:
|
||||
if CurIndex == 0 {
|
||||
s.skipChild()
|
||||
}
|
||||
break
|
||||
|
||||
case ntEmpty:
|
||||
s.pushFC(regexFc{nullable: true})
|
||||
break
|
||||
|
||||
case ntConcatenate | afterChild:
|
||||
if CurIndex != 0 {
|
||||
child := s.popFC()
|
||||
cumul := s.topFC()
|
||||
|
||||
s.failed = !cumul.addFC(*child, true)
|
||||
}
|
||||
|
||||
fc := s.topFC()
|
||||
if !fc.nullable {
|
||||
s.skipAllChildren = true
|
||||
}
|
||||
break
|
||||
|
||||
case ntTestgroup | afterChild:
|
||||
if CurIndex > 1 {
|
||||
child := s.popFC()
|
||||
cumul := s.topFC()
|
||||
|
||||
s.failed = !cumul.addFC(*child, false)
|
||||
}
|
||||
break
|
||||
|
||||
case ntAlternate | afterChild, ntTestref | afterChild:
|
||||
if CurIndex != 0 {
|
||||
child := s.popFC()
|
||||
cumul := s.topFC()
|
||||
|
||||
s.failed = !cumul.addFC(*child, false)
|
||||
}
|
||||
break
|
||||
|
||||
case ntLoop | afterChild, ntLazyloop | afterChild:
|
||||
if node.m == 0 {
|
||||
fc := s.topFC()
|
||||
fc.nullable = true
|
||||
}
|
||||
break
|
||||
|
||||
case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
|
||||
break
|
||||
|
||||
case ntRequire | beforeChild, ntPrevent | beforeChild:
|
||||
s.skipChild()
|
||||
s.pushFC(regexFc{nullable: true})
|
||||
break
|
||||
|
||||
case ntRequire | afterChild, ntPrevent | afterChild:
|
||||
break
|
||||
|
||||
case ntOne, ntNotone:
|
||||
s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
|
||||
break
|
||||
|
||||
case ntOneloop, ntOnelazy:
|
||||
s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
|
||||
break
|
||||
|
||||
case ntNotoneloop, ntNotonelazy:
|
||||
s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
|
||||
break
|
||||
|
||||
case ntMulti:
|
||||
if len(node.str) == 0 {
|
||||
s.pushFC(regexFc{nullable: true})
|
||||
} else if !rtl {
|
||||
s.pushFC(newRegexFc(node.str[0], false, false, ci))
|
||||
} else {
|
||||
s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
|
||||
}
|
||||
break
|
||||
|
||||
case ntSet:
|
||||
s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
|
||||
break
|
||||
|
||||
case ntSetloop, ntSetlazy:
|
||||
s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
|
||||
break
|
||||
|
||||
case ntRef:
|
||||
s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
|
||||
break
|
||||
|
||||
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
||||
s.pushFC(regexFc{nullable: true})
|
||||
break
|
||||
|
||||
default:
|
||||
panic(fmt.Sprintf("unexpected op code: %v", nt))
|
||||
}
|
||||
}
|
||||
|
||||
type regexFc struct {
|
||||
cc CharSet
|
||||
nullable bool
|
||||
caseInsensitive bool
|
||||
}
|
||||
|
||||
func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
|
||||
r := regexFc{
|
||||
caseInsensitive: caseInsensitive,
|
||||
nullable: nullable,
|
||||
}
|
||||
if not {
|
||||
if ch > 0 {
|
||||
r.cc.addRange('\x00', ch-1)
|
||||
}
|
||||
if ch < 0xFFFF {
|
||||
r.cc.addRange(ch+1, utf8.MaxRune)
|
||||
}
|
||||
} else {
|
||||
r.cc.addRange(ch, ch)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *regexFc) getFirstChars() CharSet {
|
||||
if r.caseInsensitive {
|
||||
r.cc.addLowercase()
|
||||
}
|
||||
|
||||
return r.cc
|
||||
}
|
||||
|
||||
func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
|
||||
if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
|
||||
return false
|
||||
}
|
||||
|
||||
if concatenate {
|
||||
if !r.nullable {
|
||||
return true
|
||||
}
|
||||
|
||||
if !fc.nullable {
|
||||
r.nullable = false
|
||||
}
|
||||
} else {
|
||||
if fc.nullable {
|
||||
r.nullable = true
|
||||
}
|
||||
}
|
||||
|
||||
r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
|
||||
r.cc.addSet(fc.cc)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// This is a related computation: it takes a RegexTree and computes the
|
||||
// leading substring if it sees one. It's quite trivial and gives up easily.
|
||||
func getPrefix(tree *RegexTree) *Prefix {
|
||||
var concatNode *regexNode
|
||||
nextChild := 0
|
||||
|
||||
curNode := tree.root
|
||||
|
||||
for {
|
||||
switch curNode.t {
|
||||
case ntConcatenate:
|
||||
if len(curNode.children) > 0 {
|
||||
concatNode = curNode
|
||||
nextChild = 0
|
||||
}
|
||||
|
||||
case ntGreedy, ntCapture:
|
||||
curNode = curNode.children[0]
|
||||
concatNode = nil
|
||||
continue
|
||||
|
||||
case ntOneloop, ntOnelazy:
|
||||
if curNode.m > 0 {
|
||||
return &Prefix{
|
||||
PrefixStr: repeat(curNode.ch, curNode.m),
|
||||
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
case ntOne:
|
||||
return &Prefix{
|
||||
PrefixStr: []rune{curNode.ch},
|
||||
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
||||
}
|
||||
|
||||
case ntMulti:
|
||||
return &Prefix{
|
||||
PrefixStr: curNode.str,
|
||||
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
||||
}
|
||||
|
||||
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
|
||||
ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
|
||||
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
|
||||
if concatNode == nil || nextChild >= len(concatNode.children) {
|
||||
return nil
|
||||
}
|
||||
|
||||
curNode = concatNode.children[nextChild]
|
||||
nextChild++
|
||||
}
|
||||
}
|
||||
|
||||
// repeat the rune r, c times... up to the max of MaxPrefixSize
|
||||
func repeat(r rune, c int) []rune {
|
||||
if c > MaxPrefixSize {
|
||||
c = MaxPrefixSize
|
||||
}
|
||||
|
||||
ret := make([]rune, c)
|
||||
|
||||
// binary growth using copy for speed
|
||||
ret[0] = r
|
||||
bp := 1
|
||||
for bp < len(ret) {
|
||||
copy(ret[bp:], ret[:bp])
|
||||
bp *= 2
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// BmPrefix precomputes the Boyer-Moore
|
||||
// tables for fast string scanning. These tables allow
|
||||
// you to scan for the first occurrence of a string within
|
||||
// a large body of text without examining every character.
|
||||
// The performance of the heuristic depends on the actual
|
||||
// string and the text being searched, but usually, the longer
|
||||
// the string that is being searched for, the fewer characters
|
||||
// need to be examined.
|
||||
type BmPrefix struct {
|
||||
positive []int
|
||||
negativeASCII []int
|
||||
negativeUnicode [][]int
|
||||
pattern []rune
|
||||
lowASCII rune
|
||||
highASCII rune
|
||||
rightToLeft bool
|
||||
caseInsensitive bool
|
||||
}
|
||||
|
||||
func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
|
||||
|
||||
b := &BmPrefix{
|
||||
rightToLeft: rightToLeft,
|
||||
caseInsensitive: caseInsensitive,
|
||||
pattern: pattern,
|
||||
}
|
||||
|
||||
if caseInsensitive {
|
||||
for i := 0; i < len(b.pattern); i++ {
|
||||
// We do the ToLower character by character for consistency. With surrogate chars, doing
|
||||
// a ToLower on the entire string could actually change the surrogate pair. This is more correct
|
||||
// linguistically, but since Regex doesn't support surrogates, it's more important to be
|
||||
// consistent.
|
||||
|
||||
b.pattern[i] = unicode.ToLower(b.pattern[i])
|
||||
}
|
||||
}
|
||||
|
||||
var beforefirst, last, bump int
|
||||
var scan, match int
|
||||
|
||||
if !rightToLeft {
|
||||
beforefirst = -1
|
||||
last = len(b.pattern) - 1
|
||||
bump = 1
|
||||
} else {
|
||||
beforefirst = len(b.pattern)
|
||||
last = 0
|
||||
bump = -1
|
||||
}
|
||||
|
||||
// PART I - the good-suffix shift table
|
||||
//
|
||||
// compute the positive requirement:
|
||||
// if char "i" is the first one from the right that doesn't match,
|
||||
// then we know the matcher can advance by _positive[i].
|
||||
//
|
||||
// This algorithm is a simplified variant of the standard
|
||||
// Boyer-Moore good suffix calculation.
|
||||
|
||||
b.positive = make([]int, len(b.pattern))
|
||||
|
||||
examine := last
|
||||
ch := b.pattern[examine]
|
||||
b.positive[examine] = bump
|
||||
examine -= bump
|
||||
|
||||
Outerloop:
|
||||
for {
|
||||
// find an internal char (examine) that matches the tail
|
||||
|
||||
for {
|
||||
if examine == beforefirst {
|
||||
break Outerloop
|
||||
}
|
||||
if b.pattern[examine] == ch {
|
||||
break
|
||||
}
|
||||
examine -= bump
|
||||
}
|
||||
|
||||
match = last
|
||||
scan = examine
|
||||
|
||||
// find the length of the match
|
||||
for {
|
||||
if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
|
||||
// at the end of the match, note the difference in _positive
|
||||
// this is not the length of the match, but the distance from the internal match
|
||||
// to the tail suffix.
|
||||
if b.positive[match] == 0 {
|
||||
b.positive[match] = match - scan
|
||||
}
|
||||
|
||||
// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
scan -= bump
|
||||
match -= bump
|
||||
}
|
||||
|
||||
examine -= bump
|
||||
}
|
||||
|
||||
match = last - bump
|
||||
|
||||
// scan for the chars for which there are no shifts that yield a different candidate
|
||||
|
||||
// The inside of the if statement used to say
|
||||
// "_positive[match] = last - beforefirst;"
|
||||
// This is slightly less aggressive in how much we skip, but at worst it
|
||||
// should mean a little more work rather than skipping a potential match.
|
||||
for match != beforefirst {
|
||||
if b.positive[match] == 0 {
|
||||
b.positive[match] = bump
|
||||
}
|
||||
|
||||
match -= bump
|
||||
}
|
||||
|
||||
// PART II - the bad-character shift table
|
||||
//
|
||||
// compute the negative requirement:
|
||||
// if char "ch" is the reject character when testing position "i",
|
||||
// we can slide up by _negative[ch];
|
||||
// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
|
||||
//
|
||||
// the lookup table is divided into ASCII and Unicode portions;
|
||||
// only those parts of the Unicode 16-bit code set that actually
|
||||
// appear in the string are in the table. (Maximum size with
|
||||
// Unicode is 65K; ASCII only case is 512 bytes.)
|
||||
|
||||
b.negativeASCII = make([]int, 128)
|
||||
|
||||
for i := 0; i < len(b.negativeASCII); i++ {
|
||||
b.negativeASCII[i] = last - beforefirst
|
||||
}
|
||||
|
||||
b.lowASCII = 127
|
||||
b.highASCII = 0
|
||||
|
||||
for examine = last; examine != beforefirst; examine -= bump {
|
||||
ch = b.pattern[examine]
|
||||
|
||||
switch {
|
||||
case ch < 128:
|
||||
if b.lowASCII > ch {
|
||||
b.lowASCII = ch
|
||||
}
|
||||
|
||||
if b.highASCII < ch {
|
||||
b.highASCII = ch
|
||||
}
|
||||
|
||||
if b.negativeASCII[ch] == last-beforefirst {
|
||||
b.negativeASCII[ch] = last - examine
|
||||
}
|
||||
case ch <= 0xffff:
|
||||
i, j := ch>>8, ch&0xFF
|
||||
|
||||
if b.negativeUnicode == nil {
|
||||
b.negativeUnicode = make([][]int, 256)
|
||||
}
|
||||
|
||||
if b.negativeUnicode[i] == nil {
|
||||
newarray := make([]int, 256)
|
||||
|
||||
for k := 0; k < len(newarray); k++ {
|
||||
newarray[k] = last - beforefirst
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
copy(newarray, b.negativeASCII)
|
||||
//TODO: this line needed?
|
||||
b.negativeASCII = newarray
|
||||
}
|
||||
|
||||
b.negativeUnicode[i] = newarray
|
||||
}
|
||||
|
||||
if b.negativeUnicode[i][j] == last-beforefirst {
|
||||
b.negativeUnicode[i][j] = last - examine
|
||||
}
|
||||
default:
|
||||
// we can't do the filter because this algo doesn't support
|
||||
// unicode chars >0xffff
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return b
|
||||
}
|
||||
|
||||
func (b *BmPrefix) String() string {
|
||||
return string(b.pattern)
|
||||
}
|
||||
|
||||
// Dump returns the contents of the filter as a human readable string
|
||||
func (b *BmPrefix) Dump(indent string) string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
|
||||
for i := 0; i < len(b.positive); i++ {
|
||||
buf.WriteString(strconv.Itoa(b.positive[i]))
|
||||
buf.WriteRune(' ')
|
||||
}
|
||||
buf.WriteRune('\n')
|
||||
|
||||
if b.negativeASCII != nil {
|
||||
buf.WriteString(indent)
|
||||
buf.WriteString("Negative table\n")
|
||||
for i := 0; i < len(b.negativeASCII); i++ {
|
||||
if b.negativeASCII[i] != len(b.pattern) {
|
||||
fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// Scan uses the Boyer-Moore algorithm to find the first occurrence
|
||||
// of the specified string within text, beginning at index, and
|
||||
// constrained within beglimit and endlimit.
|
||||
//
|
||||
// The direction and case-sensitivity of the match is determined
|
||||
// by the arguments to the RegexBoyerMoore constructor.
|
||||
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
|
||||
var (
|
||||
defadv, test, test2 int
|
||||
match, startmatch, endmatch int
|
||||
bump, advance int
|
||||
chTest rune
|
||||
unicodeLookup []int
|
||||
)
|
||||
|
||||
if !b.rightToLeft {
|
||||
defadv = len(b.pattern)
|
||||
startmatch = len(b.pattern) - 1
|
||||
endmatch = 0
|
||||
test = index + defadv - 1
|
||||
bump = 1
|
||||
} else {
|
||||
defadv = -len(b.pattern)
|
||||
startmatch = 0
|
||||
endmatch = -defadv - 1
|
||||
test = index + defadv
|
||||
bump = -1
|
||||
}
|
||||
|
||||
chMatch := b.pattern[startmatch]
|
||||
|
||||
for {
|
||||
if test >= endlimit || test < beglimit {
|
||||
return -1
|
||||
}
|
||||
|
||||
chTest = text[test]
|
||||
|
||||
if b.caseInsensitive {
|
||||
chTest = unicode.ToLower(chTest)
|
||||
}
|
||||
|
||||
if chTest != chMatch {
|
||||
if chTest < 128 {
|
||||
advance = b.negativeASCII[chTest]
|
||||
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
||||
unicodeLookup = b.negativeUnicode[chTest>>8]
|
||||
if len(unicodeLookup) > 0 {
|
||||
advance = unicodeLookup[chTest&0xFF]
|
||||
} else {
|
||||
advance = defadv
|
||||
}
|
||||
} else {
|
||||
advance = defadv
|
||||
}
|
||||
|
||||
test += advance
|
||||
} else { // if (chTest == chMatch)
|
||||
test2 = test
|
||||
match = startmatch
|
||||
|
||||
for {
|
||||
if match == endmatch {
|
||||
if b.rightToLeft {
|
||||
return test2 + 1
|
||||
} else {
|
||||
return test2
|
||||
}
|
||||
}
|
||||
|
||||
match -= bump
|
||||
test2 -= bump
|
||||
|
||||
chTest = text[test2]
|
||||
|
||||
if b.caseInsensitive {
|
||||
chTest = unicode.ToLower(chTest)
|
||||
}
|
||||
|
||||
if chTest != b.pattern[match] {
|
||||
advance = b.positive[match]
|
||||
if (chTest & 0xFF80) == 0 {
|
||||
test2 = (match - startmatch) + b.negativeASCII[chTest]
|
||||
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
||||
unicodeLookup = b.negativeUnicode[chTest>>8]
|
||||
if len(unicodeLookup) > 0 {
|
||||
test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
|
||||
} else {
|
||||
test += advance
|
||||
break
|
||||
}
|
||||
} else {
|
||||
test += advance
|
||||
break
|
||||
}
|
||||
|
||||
if b.rightToLeft {
|
||||
if test2 < advance {
|
||||
advance = test2
|
||||
}
|
||||
} else if test2 > advance {
|
||||
advance = test2
|
||||
}
|
||||
|
||||
test += advance
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
|
||||
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
|
||||
if !b.rightToLeft {
|
||||
if index < beglimit || endlimit-index < len(b.pattern) {
|
||||
return false
|
||||
}
|
||||
|
||||
return b.matchPattern(text, index)
|
||||
} else {
|
||||
if index > endlimit || index-beglimit < len(b.pattern) {
|
||||
return false
|
||||
}
|
||||
|
||||
return b.matchPattern(text, index-len(b.pattern))
|
||||
}
|
||||
}
|
||||
|
||||
func (b *BmPrefix) matchPattern(text []rune, index int) bool {
|
||||
if len(text)-index < len(b.pattern) {
|
||||
return false
|
||||
}
|
||||
|
||||
if b.caseInsensitive {
|
||||
for i := 0; i < len(b.pattern); i++ {
|
||||
//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
|
||||
if unicode.ToLower(text[index+i]) != b.pattern[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
} else {
|
||||
for i := 0; i < len(b.pattern); i++ {
|
||||
if text[index+i] != b.pattern[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
type AnchorLoc int16
|
||||
|
||||
// where the regex can be pegged
|
||||
const (
|
||||
AnchorBeginning AnchorLoc = 0x0001
|
||||
AnchorBol = 0x0002
|
||||
AnchorStart = 0x0004
|
||||
AnchorEol = 0x0008
|
||||
AnchorEndZ = 0x0010
|
||||
AnchorEnd = 0x0020
|
||||
AnchorBoundary = 0x0040
|
||||
AnchorECMABoundary = 0x0080
|
||||
)
|
||||
|
||||
func getAnchors(tree *RegexTree) AnchorLoc {
|
||||
|
||||
var concatNode *regexNode
|
||||
nextChild, result := 0, AnchorLoc(0)
|
||||
|
||||
curNode := tree.root
|
||||
|
||||
for {
|
||||
switch curNode.t {
|
||||
case ntConcatenate:
|
||||
if len(curNode.children) > 0 {
|
||||
concatNode = curNode
|
||||
nextChild = 0
|
||||
}
|
||||
|
||||
case ntGreedy, ntCapture:
|
||||
curNode = curNode.children[0]
|
||||
concatNode = nil
|
||||
continue
|
||||
|
||||
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
|
||||
ntStart, ntEndZ, ntEnd:
|
||||
return result | anchorFromType(curNode.t)
|
||||
|
||||
case ntEmpty, ntRequire, ntPrevent:
|
||||
|
||||
default:
|
||||
return result
|
||||
}
|
||||
|
||||
if concatNode == nil || nextChild >= len(concatNode.children) {
|
||||
return result
|
||||
}
|
||||
|
||||
curNode = concatNode.children[nextChild]
|
||||
nextChild++
|
||||
}
|
||||
}
|
||||
|
||||
func anchorFromType(t nodeType) AnchorLoc {
|
||||
switch t {
|
||||
case ntBol:
|
||||
return AnchorBol
|
||||
case ntEol:
|
||||
return AnchorEol
|
||||
case ntBoundary:
|
||||
return AnchorBoundary
|
||||
case ntECMABoundary:
|
||||
return AnchorECMABoundary
|
||||
case ntBeginning:
|
||||
return AnchorBeginning
|
||||
case ntStart:
|
||||
return AnchorStart
|
||||
case ntEndZ:
|
||||
return AnchorEndZ
|
||||
case ntEnd:
|
||||
return AnchorEnd
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// anchorDescription returns a human-readable description of the anchors
|
||||
func (anchors AnchorLoc) String() string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
if 0 != (anchors & AnchorBeginning) {
|
||||
buf.WriteString(", Beginning")
|
||||
}
|
||||
if 0 != (anchors & AnchorStart) {
|
||||
buf.WriteString(", Start")
|
||||
}
|
||||
if 0 != (anchors & AnchorBol) {
|
||||
buf.WriteString(", Bol")
|
||||
}
|
||||
if 0 != (anchors & AnchorBoundary) {
|
||||
buf.WriteString(", Boundary")
|
||||
}
|
||||
if 0 != (anchors & AnchorECMABoundary) {
|
||||
buf.WriteString(", ECMABoundary")
|
||||
}
|
||||
if 0 != (anchors & AnchorEol) {
|
||||
buf.WriteString(", Eol")
|
||||
}
|
||||
if 0 != (anchors & AnchorEnd) {
|
||||
buf.WriteString(", End")
|
||||
}
|
||||
if 0 != (anchors & AnchorEndZ) {
|
||||
buf.WriteString(", EndZ")
|
||||
}
|
||||
|
||||
// trim off comma
|
||||
if buf.Len() >= 2 {
|
||||
return buf.String()[2:]
|
||||
}
|
||||
return "None"
|
||||
}
|
87
vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
generated
vendored
Normal file
87
vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
generated
vendored
Normal file
@@ -0,0 +1,87 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
)
|
||||
|
||||
type ReplacerData struct {
|
||||
Rep string
|
||||
Strings []string
|
||||
Rules []int
|
||||
}
|
||||
|
||||
const (
|
||||
replaceSpecials = 4
|
||||
replaceLeftPortion = -1
|
||||
replaceRightPortion = -2
|
||||
replaceLastGroup = -3
|
||||
replaceWholeString = -4
|
||||
)
|
||||
|
||||
//ErrReplacementError is a general error during parsing the replacement text
|
||||
var ErrReplacementError = errors.New("Replacement pattern error.")
|
||||
|
||||
// NewReplacerData will populate a reusable replacer data struct based on the given replacement string
|
||||
// and the capture group data from a regexp
|
||||
func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) {
|
||||
p := parser{
|
||||
options: op,
|
||||
caps: caps,
|
||||
capsize: capsize,
|
||||
capnames: capnames,
|
||||
}
|
||||
p.setPattern(rep)
|
||||
concat, err := p.scanReplacement()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if concat.t != ntConcatenate {
|
||||
panic(ErrReplacementError)
|
||||
}
|
||||
|
||||
sb := &bytes.Buffer{}
|
||||
var (
|
||||
strings []string
|
||||
rules []int
|
||||
)
|
||||
|
||||
for _, child := range concat.children {
|
||||
switch child.t {
|
||||
case ntMulti:
|
||||
child.writeStrToBuf(sb)
|
||||
|
||||
case ntOne:
|
||||
sb.WriteRune(child.ch)
|
||||
|
||||
case ntRef:
|
||||
if sb.Len() > 0 {
|
||||
rules = append(rules, len(strings))
|
||||
strings = append(strings, sb.String())
|
||||
sb.Reset()
|
||||
}
|
||||
slot := child.m
|
||||
|
||||
if len(caps) > 0 && slot >= 0 {
|
||||
slot = caps[slot]
|
||||
}
|
||||
|
||||
rules = append(rules, -replaceSpecials-1-slot)
|
||||
|
||||
default:
|
||||
panic(ErrReplacementError)
|
||||
}
|
||||
}
|
||||
|
||||
if sb.Len() > 0 {
|
||||
rules = append(rules, len(strings))
|
||||
strings = append(strings, sb.String())
|
||||
}
|
||||
|
||||
return &ReplacerData{
|
||||
Rep: rep,
|
||||
Strings: strings,
|
||||
Rules: rules,
|
||||
}, nil
|
||||
}
|
654
vendor/github.com/dlclark/regexp2/syntax/tree.go
generated
vendored
Normal file
654
vendor/github.com/dlclark/regexp2/syntax/tree.go
generated
vendored
Normal file
@@ -0,0 +1,654 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
type RegexTree struct {
|
||||
root *regexNode
|
||||
caps map[int]int
|
||||
capnumlist []int
|
||||
captop int
|
||||
Capnames map[string]int
|
||||
Caplist []string
|
||||
options RegexOptions
|
||||
}
|
||||
|
||||
// It is built into a parsed tree for a regular expression.
|
||||
|
||||
// Implementation notes:
|
||||
//
|
||||
// Since the node tree is a temporary data structure only used
|
||||
// during compilation of the regexp to integer codes, it's
|
||||
// designed for clarity and convenience rather than
|
||||
// space efficiency.
|
||||
//
|
||||
// RegexNodes are built into a tree, linked by the n.children list.
|
||||
// Each node also has a n.parent and n.ichild member indicating
|
||||
// its parent and which child # it is in its parent's list.
|
||||
//
|
||||
// RegexNodes come in as many types as there are constructs in
|
||||
// a regular expression, for example, "concatenate", "alternate",
|
||||
// "one", "rept", "group". There are also node types for basic
|
||||
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
|
||||
//
|
||||
// Because perl 5 allows "lookback" groups that scan backwards,
|
||||
// each node also gets a "direction". Normally the value of
|
||||
// boolean n.backward = false.
|
||||
//
|
||||
// During parsing, top-level nodes are also stacked onto a parse
|
||||
// stack (a stack of trees). For this purpose we have a n.next
|
||||
// pointer. [Note that to save a few bytes, we could overload the
|
||||
// n.parent pointer instead.]
|
||||
//
|
||||
// On the parse stack, each tree has a "role" - basically, the
|
||||
// nonterminal in the grammar that the parser has currently
|
||||
// assigned to the tree. That code is stored in n.role.
|
||||
//
|
||||
// Finally, some of the different kinds of nodes have data.
|
||||
// Two integers (for the looping constructs) are stored in
|
||||
// n.operands, an an object (either a string or a set)
|
||||
// is stored in n.data
|
||||
type regexNode struct {
|
||||
t nodeType
|
||||
children []*regexNode
|
||||
str []rune
|
||||
set *CharSet
|
||||
ch rune
|
||||
m int
|
||||
n int
|
||||
options RegexOptions
|
||||
next *regexNode
|
||||
}
|
||||
|
||||
type nodeType int32
|
||||
|
||||
const (
|
||||
// The following are leaves, and correspond to primitive operations
|
||||
|
||||
ntOnerep nodeType = 0 // lef,back char,min,max a {n}
|
||||
ntNotonerep = 1 // lef,back char,min,max .{n}
|
||||
ntSetrep = 2 // lef,back set,min,max [\d]{n}
|
||||
ntOneloop = 3 // lef,back char,min,max a {,n}
|
||||
ntNotoneloop = 4 // lef,back char,min,max .{,n}
|
||||
ntSetloop = 5 // lef,back set,min,max [\d]{,n}
|
||||
ntOnelazy = 6 // lef,back char,min,max a {,n}?
|
||||
ntNotonelazy = 7 // lef,back char,min,max .{,n}?
|
||||
ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
|
||||
ntOne = 9 // lef char a
|
||||
ntNotone = 10 // lef char [^a]
|
||||
ntSet = 11 // lef set [a-z\s] \w \s \d
|
||||
ntMulti = 12 // lef string abcd
|
||||
ntRef = 13 // lef group \#
|
||||
ntBol = 14 // ^
|
||||
ntEol = 15 // $
|
||||
ntBoundary = 16 // \b
|
||||
ntNonboundary = 17 // \B
|
||||
ntBeginning = 18 // \A
|
||||
ntStart = 19 // \G
|
||||
ntEndZ = 20 // \Z
|
||||
ntEnd = 21 // \Z
|
||||
|
||||
// Interior nodes do not correspond to primitive operations, but
|
||||
// control structures compositing other operations
|
||||
|
||||
// Concat and alternate take n children, and can run forward or backwards
|
||||
|
||||
ntNothing = 22 // []
|
||||
ntEmpty = 23 // ()
|
||||
ntAlternate = 24 // a|b
|
||||
ntConcatenate = 25 // ab
|
||||
ntLoop = 26 // m,x * + ? {,}
|
||||
ntLazyloop = 27 // m,x *? +? ?? {,}?
|
||||
ntCapture = 28 // n ()
|
||||
ntGroup = 29 // (?:)
|
||||
ntRequire = 30 // (?=) (?<=)
|
||||
ntPrevent = 31 // (?!) (?<!)
|
||||
ntGreedy = 32 // (?>) (?<)
|
||||
ntTestref = 33 // (?(n) | )
|
||||
ntTestgroup = 34 // (?(...) | )
|
||||
|
||||
ntECMABoundary = 41 // \b
|
||||
ntNonECMABoundary = 42 // \B
|
||||
)
|
||||
|
||||
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
ch: ch,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
str: str,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
set: set,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
m: m,
|
||||
}
|
||||
}
|
||||
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
m: m,
|
||||
n: n,
|
||||
}
|
||||
}
|
||||
|
||||
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
|
||||
for i := 0; i < len(n.str); i++ {
|
||||
buf.WriteRune(n.str[i])
|
||||
}
|
||||
}
|
||||
|
||||
func (n *regexNode) addChild(child *regexNode) {
|
||||
reduced := child.reduce()
|
||||
n.children = append(n.children, reduced)
|
||||
reduced.next = n
|
||||
}
|
||||
|
||||
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
|
||||
newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
|
||||
n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
|
||||
}
|
||||
|
||||
// removes children including the start but not the end index
|
||||
func (n *regexNode) removeChildren(startIndex, endIndex int) {
|
||||
n.children = append(n.children[:startIndex], n.children[endIndex:]...)
|
||||
}
|
||||
|
||||
// Pass type as OneLazy or OneLoop
|
||||
func (n *regexNode) makeRep(t nodeType, min, max int) {
|
||||
n.t += (t - ntOne)
|
||||
n.m = min
|
||||
n.n = max
|
||||
}
|
||||
|
||||
func (n *regexNode) reduce() *regexNode {
|
||||
switch n.t {
|
||||
case ntAlternate:
|
||||
return n.reduceAlternation()
|
||||
|
||||
case ntConcatenate:
|
||||
return n.reduceConcatenation()
|
||||
|
||||
case ntLoop, ntLazyloop:
|
||||
return n.reduceRep()
|
||||
|
||||
case ntGroup:
|
||||
return n.reduceGroup()
|
||||
|
||||
case ntSet, ntSetloop:
|
||||
return n.reduceSet()
|
||||
|
||||
default:
|
||||
return n
|
||||
}
|
||||
}
|
||||
|
||||
// Basic optimization. Single-letter alternations can be replaced
|
||||
// by faster set specifications, and nested alternations with no
|
||||
// intervening operators can be flattened:
|
||||
//
|
||||
// a|b|c|def|g|h -> [a-c]|def|[gh]
|
||||
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
|
||||
func (n *regexNode) reduceAlternation() *regexNode {
|
||||
if len(n.children) == 0 {
|
||||
return newRegexNode(ntNothing, n.options)
|
||||
}
|
||||
|
||||
wasLastSet := false
|
||||
lastNodeCannotMerge := false
|
||||
var optionsLast RegexOptions
|
||||
var i, j int
|
||||
|
||||
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
|
||||
at := n.children[i]
|
||||
|
||||
if j < i {
|
||||
n.children[j] = at
|
||||
}
|
||||
|
||||
for {
|
||||
if at.t == ntAlternate {
|
||||
for k := 0; k < len(at.children); k++ {
|
||||
at.children[k].next = n
|
||||
}
|
||||
n.insertChildren(i+1, at.children)
|
||||
|
||||
j--
|
||||
} else if at.t == ntSet || at.t == ntOne {
|
||||
// Cannot merge sets if L or I options differ, or if either are negated.
|
||||
optionsAt := at.options & (RightToLeft | IgnoreCase)
|
||||
|
||||
if at.t == ntSet {
|
||||
if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
|
||||
wasLastSet = true
|
||||
lastNodeCannotMerge = !at.set.IsMergeable()
|
||||
optionsLast = optionsAt
|
||||
break
|
||||
}
|
||||
} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
|
||||
wasLastSet = true
|
||||
lastNodeCannotMerge = false
|
||||
optionsLast = optionsAt
|
||||
break
|
||||
}
|
||||
|
||||
// The last node was a Set or a One, we're a Set or One and our options are the same.
|
||||
// Merge the two nodes.
|
||||
j--
|
||||
prev := n.children[j]
|
||||
|
||||
var prevCharClass *CharSet
|
||||
if prev.t == ntOne {
|
||||
prevCharClass = &CharSet{}
|
||||
prevCharClass.addChar(prev.ch)
|
||||
} else {
|
||||
prevCharClass = prev.set
|
||||
}
|
||||
|
||||
if at.t == ntOne {
|
||||
prevCharClass.addChar(at.ch)
|
||||
} else {
|
||||
prevCharClass.addSet(*at.set)
|
||||
}
|
||||
|
||||
prev.t = ntSet
|
||||
prev.set = prevCharClass
|
||||
} else if at.t == ntNothing {
|
||||
j--
|
||||
} else {
|
||||
wasLastSet = false
|
||||
lastNodeCannotMerge = false
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if j < i {
|
||||
n.removeChildren(j, i)
|
||||
}
|
||||
|
||||
return n.stripEnation(ntNothing)
|
||||
}
|
||||
|
||||
// Basic optimization. Adjacent strings can be concatenated.
|
||||
//
|
||||
// (?:abc)(?:def) -> abcdef
|
||||
func (n *regexNode) reduceConcatenation() *regexNode {
|
||||
// Eliminate empties and concat adjacent strings/chars
|
||||
|
||||
var optionsLast RegexOptions
|
||||
var optionsAt RegexOptions
|
||||
var i, j int
|
||||
|
||||
if len(n.children) == 0 {
|
||||
return newRegexNode(ntEmpty, n.options)
|
||||
}
|
||||
|
||||
wasLastString := false
|
||||
|
||||
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
|
||||
var at, prev *regexNode
|
||||
|
||||
at = n.children[i]
|
||||
|
||||
if j < i {
|
||||
n.children[j] = at
|
||||
}
|
||||
|
||||
if at.t == ntConcatenate &&
|
||||
((at.options & RightToLeft) == (n.options & RightToLeft)) {
|
||||
for k := 0; k < len(at.children); k++ {
|
||||
at.children[k].next = n
|
||||
}
|
||||
|
||||
//insert at.children at i+1 index in n.children
|
||||
n.insertChildren(i+1, at.children)
|
||||
|
||||
j--
|
||||
} else if at.t == ntMulti || at.t == ntOne {
|
||||
// Cannot merge strings if L or I options differ
|
||||
optionsAt = at.options & (RightToLeft | IgnoreCase)
|
||||
|
||||
if !wasLastString || optionsLast != optionsAt {
|
||||
wasLastString = true
|
||||
optionsLast = optionsAt
|
||||
continue
|
||||
}
|
||||
|
||||
j--
|
||||
prev = n.children[j]
|
||||
|
||||
if prev.t == ntOne {
|
||||
prev.t = ntMulti
|
||||
prev.str = []rune{prev.ch}
|
||||
}
|
||||
|
||||
if (optionsAt & RightToLeft) == 0 {
|
||||
if at.t == ntOne {
|
||||
prev.str = append(prev.str, at.ch)
|
||||
} else {
|
||||
prev.str = append(prev.str, at.str...)
|
||||
}
|
||||
} else {
|
||||
if at.t == ntOne {
|
||||
// insert at the front by expanding our slice, copying the data over, and then setting the value
|
||||
prev.str = append(prev.str, 0)
|
||||
copy(prev.str[1:], prev.str)
|
||||
prev.str[0] = at.ch
|
||||
} else {
|
||||
//insert at the front...this one we'll make a new slice and copy both into it
|
||||
merge := make([]rune, len(prev.str)+len(at.str))
|
||||
copy(merge, at.str)
|
||||
copy(merge[len(at.str):], prev.str)
|
||||
prev.str = merge
|
||||
}
|
||||
}
|
||||
} else if at.t == ntEmpty {
|
||||
j--
|
||||
} else {
|
||||
wasLastString = false
|
||||
}
|
||||
}
|
||||
|
||||
if j < i {
|
||||
// remove indices j through i from the children
|
||||
n.removeChildren(j, i)
|
||||
}
|
||||
|
||||
return n.stripEnation(ntEmpty)
|
||||
}
|
||||
|
||||
// Nested repeaters just get multiplied with each other if they're not
|
||||
// too lumpy
|
||||
func (n *regexNode) reduceRep() *regexNode {
|
||||
|
||||
u := n
|
||||
t := n.t
|
||||
min := n.m
|
||||
max := n.n
|
||||
|
||||
for {
|
||||
if len(u.children) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
child := u.children[0]
|
||||
|
||||
// multiply reps of the same type only
|
||||
if child.t != t {
|
||||
childType := child.t
|
||||
|
||||
if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
|
||||
childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
|
||||
// [but things like (a {2,})+ are not too lumpy...]
|
||||
if u.m == 0 && child.m > 1 || child.n < child.m*2 {
|
||||
break
|
||||
}
|
||||
|
||||
u = child
|
||||
if u.m > 0 {
|
||||
if (math.MaxInt32-1)/u.m < min {
|
||||
u.m = math.MaxInt32
|
||||
} else {
|
||||
u.m = u.m * min
|
||||
}
|
||||
}
|
||||
if u.n > 0 {
|
||||
if (math.MaxInt32-1)/u.n < max {
|
||||
u.n = math.MaxInt32
|
||||
} else {
|
||||
u.n = u.n * max
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if math.MaxInt32 == min {
|
||||
return newRegexNode(ntNothing, n.options)
|
||||
}
|
||||
return u
|
||||
|
||||
}
|
||||
|
||||
// Simple optimization. If a concatenation or alternation has only
|
||||
// one child strip out the intermediate node. If it has zero children,
|
||||
// turn it into an empty.
|
||||
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
|
||||
switch len(n.children) {
|
||||
case 0:
|
||||
return newRegexNode(emptyType, n.options)
|
||||
case 1:
|
||||
return n.children[0]
|
||||
default:
|
||||
return n
|
||||
}
|
||||
}
|
||||
|
||||
func (n *regexNode) reduceGroup() *regexNode {
|
||||
u := n
|
||||
|
||||
for u.t == ntGroup {
|
||||
u = u.children[0]
|
||||
}
|
||||
|
||||
return u
|
||||
}
|
||||
|
||||
// Simple optimization. If a set is a singleton, an inverse singleton,
|
||||
// or empty, it's transformed accordingly.
|
||||
func (n *regexNode) reduceSet() *regexNode {
|
||||
// Extract empty-set, one and not-one case as special
|
||||
|
||||
if n.set == nil {
|
||||
n.t = ntNothing
|
||||
} else if n.set.IsSingleton() {
|
||||
n.ch = n.set.SingletonChar()
|
||||
n.set = nil
|
||||
n.t += (ntOne - ntSet)
|
||||
} else if n.set.IsSingletonInverse() {
|
||||
n.ch = n.set.SingletonChar()
|
||||
n.set = nil
|
||||
n.t += (ntNotone - ntSet)
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
func (n *regexNode) reverseLeft() *regexNode {
|
||||
if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
|
||||
//reverse children order
|
||||
for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
|
||||
n.children[left], n.children[right] = n.children[right], n.children[left]
|
||||
}
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
|
||||
if min == 0 && max == 0 {
|
||||
return newRegexNode(ntEmpty, n.options)
|
||||
}
|
||||
|
||||
if min == 1 && max == 1 {
|
||||
return n
|
||||
}
|
||||
|
||||
switch n.t {
|
||||
case ntOne, ntNotone, ntSet:
|
||||
if lazy {
|
||||
n.makeRep(Onelazy, min, max)
|
||||
} else {
|
||||
n.makeRep(Oneloop, min, max)
|
||||
}
|
||||
return n
|
||||
|
||||
default:
|
||||
var t nodeType
|
||||
if lazy {
|
||||
t = ntLazyloop
|
||||
} else {
|
||||
t = ntLoop
|
||||
}
|
||||
result := newRegexNodeMN(t, n.options, min, max)
|
||||
result.addChild(n)
|
||||
return result
|
||||
}
|
||||
}
|
||||
|
||||
// debug functions
|
||||
|
||||
var typeStr = []string{
|
||||
"Onerep", "Notonerep", "Setrep",
|
||||
"Oneloop", "Notoneloop", "Setloop",
|
||||
"Onelazy", "Notonelazy", "Setlazy",
|
||||
"One", "Notone", "Set",
|
||||
"Multi", "Ref",
|
||||
"Bol", "Eol", "Boundary", "Nonboundary",
|
||||
"Beginning", "Start", "EndZ", "End",
|
||||
"Nothing", "Empty",
|
||||
"Alternate", "Concatenate",
|
||||
"Loop", "Lazyloop",
|
||||
"Capture", "Group", "Require", "Prevent", "Greedy",
|
||||
"Testref", "Testgroup",
|
||||
"Unknown", "Unknown", "Unknown",
|
||||
"Unknown", "Unknown", "Unknown",
|
||||
"ECMABoundary", "NonECMABoundary",
|
||||
}
|
||||
|
||||
func (n *regexNode) description() string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
buf.WriteString(typeStr[n.t])
|
||||
|
||||
if (n.options & ExplicitCapture) != 0 {
|
||||
buf.WriteString("-C")
|
||||
}
|
||||
if (n.options & IgnoreCase) != 0 {
|
||||
buf.WriteString("-I")
|
||||
}
|
||||
if (n.options & RightToLeft) != 0 {
|
||||
buf.WriteString("-L")
|
||||
}
|
||||
if (n.options & Multiline) != 0 {
|
||||
buf.WriteString("-M")
|
||||
}
|
||||
if (n.options & Singleline) != 0 {
|
||||
buf.WriteString("-S")
|
||||
}
|
||||
if (n.options & IgnorePatternWhitespace) != 0 {
|
||||
buf.WriteString("-X")
|
||||
}
|
||||
if (n.options & ECMAScript) != 0 {
|
||||
buf.WriteString("-E")
|
||||
}
|
||||
|
||||
switch n.t {
|
||||
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
|
||||
buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
|
||||
break
|
||||
case ntCapture:
|
||||
buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
|
||||
break
|
||||
case ntRef, ntTestref:
|
||||
buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
|
||||
break
|
||||
case ntMulti:
|
||||
fmt.Fprintf(buf, "(String = %s)", string(n.str))
|
||||
break
|
||||
case ntSet, ntSetloop, ntSetlazy:
|
||||
buf.WriteString("(Set = " + n.set.String() + ")")
|
||||
break
|
||||
}
|
||||
|
||||
switch n.t {
|
||||
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
|
||||
buf.WriteString("(Min = ")
|
||||
buf.WriteString(strconv.Itoa(n.m))
|
||||
buf.WriteString(", Max = ")
|
||||
if n.n == math.MaxInt32 {
|
||||
buf.WriteString("inf")
|
||||
} else {
|
||||
buf.WriteString(strconv.Itoa(n.n))
|
||||
}
|
||||
buf.WriteString(")")
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
var padSpace = []byte(" ")
|
||||
|
||||
func (t *RegexTree) Dump() string {
|
||||
return t.root.dump()
|
||||
}
|
||||
|
||||
func (n *regexNode) dump() string {
|
||||
var stack []int
|
||||
CurNode := n
|
||||
CurChild := 0
|
||||
|
||||
buf := bytes.NewBufferString(CurNode.description())
|
||||
buf.WriteRune('\n')
|
||||
|
||||
for {
|
||||
if CurNode.children != nil && CurChild < len(CurNode.children) {
|
||||
stack = append(stack, CurChild+1)
|
||||
CurNode = CurNode.children[CurChild]
|
||||
CurChild = 0
|
||||
|
||||
Depth := len(stack)
|
||||
if Depth > 32 {
|
||||
Depth = 32
|
||||
}
|
||||
buf.Write(padSpace[:Depth])
|
||||
buf.WriteString(CurNode.description())
|
||||
buf.WriteRune('\n')
|
||||
} else {
|
||||
if len(stack) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
CurChild = stack[len(stack)-1]
|
||||
stack = stack[:len(stack)-1]
|
||||
CurNode = CurNode.next
|
||||
}
|
||||
}
|
||||
return buf.String()
|
||||
}
|
500
vendor/github.com/dlclark/regexp2/syntax/writer.go
generated
vendored
Normal file
500
vendor/github.com/dlclark/regexp2/syntax/writer.go
generated
vendored
Normal file
@@ -0,0 +1,500 @@
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
)
|
||||
|
||||
func Write(tree *RegexTree) (*Code, error) {
|
||||
w := writer{
|
||||
intStack: make([]int, 0, 32),
|
||||
emitted: make([]int, 2),
|
||||
stringhash: make(map[string]int),
|
||||
sethash: make(map[string]int),
|
||||
}
|
||||
|
||||
code, err := w.codeFromTree(tree)
|
||||
|
||||
if tree.options&Debug > 0 && code != nil {
|
||||
os.Stdout.WriteString(code.Dump())
|
||||
os.Stdout.WriteString("\n")
|
||||
}
|
||||
|
||||
return code, err
|
||||
}
|
||||
|
||||
type writer struct {
|
||||
emitted []int
|
||||
|
||||
intStack []int
|
||||
curpos int
|
||||
stringhash map[string]int
|
||||
stringtable [][]rune
|
||||
sethash map[string]int
|
||||
settable []*CharSet
|
||||
counting bool
|
||||
count int
|
||||
trackcount int
|
||||
caps map[int]int
|
||||
}
|
||||
|
||||
const (
|
||||
beforeChild nodeType = 64
|
||||
afterChild = 128
|
||||
//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
|
||||
MaxPrefixSize = 50
|
||||
)
|
||||
|
||||
// The top level RegexCode generator. It does a depth-first walk
|
||||
// through the tree and calls EmitFragment to emits code before
|
||||
// and after each child of an interior node, and at each leaf.
|
||||
//
|
||||
// It runs two passes, first to count the size of the generated
|
||||
// code, and second to generate the code.
|
||||
//
|
||||
// We should time it against the alternative, which is
|
||||
// to just generate the code and grow the array as we go.
|
||||
func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
|
||||
var (
|
||||
curNode *regexNode
|
||||
curChild int
|
||||
capsize int
|
||||
)
|
||||
// construct sparse capnum mapping if some numbers are unused
|
||||
|
||||
if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
|
||||
capsize = tree.captop
|
||||
w.caps = nil
|
||||
} else {
|
||||
capsize = len(tree.capnumlist)
|
||||
w.caps = tree.caps
|
||||
for i := 0; i < len(tree.capnumlist); i++ {
|
||||
w.caps[tree.capnumlist[i]] = i
|
||||
}
|
||||
}
|
||||
|
||||
w.counting = true
|
||||
|
||||
for {
|
||||
if !w.counting {
|
||||
w.emitted = make([]int, w.count)
|
||||
}
|
||||
|
||||
curNode = tree.root
|
||||
curChild = 0
|
||||
|
||||
w.emit1(Lazybranch, 0)
|
||||
|
||||
for {
|
||||
if len(curNode.children) == 0 {
|
||||
w.emitFragment(curNode.t, curNode, 0)
|
||||
} else if curChild < len(curNode.children) {
|
||||
w.emitFragment(curNode.t|beforeChild, curNode, curChild)
|
||||
|
||||
curNode = curNode.children[curChild]
|
||||
|
||||
w.pushInt(curChild)
|
||||
curChild = 0
|
||||
continue
|
||||
}
|
||||
|
||||
if w.emptyStack() {
|
||||
break
|
||||
}
|
||||
|
||||
curChild = w.popInt()
|
||||
curNode = curNode.next
|
||||
|
||||
w.emitFragment(curNode.t|afterChild, curNode, curChild)
|
||||
curChild++
|
||||
}
|
||||
|
||||
w.patchJump(0, w.curPos())
|
||||
w.emit(Stop)
|
||||
|
||||
if !w.counting {
|
||||
break
|
||||
}
|
||||
|
||||
w.counting = false
|
||||
}
|
||||
|
||||
fcPrefix := getFirstCharsPrefix(tree)
|
||||
prefix := getPrefix(tree)
|
||||
rtl := (tree.options & RightToLeft) != 0
|
||||
|
||||
var bmPrefix *BmPrefix
|
||||
//TODO: benchmark string prefixes
|
||||
if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
|
||||
if len(prefix.PrefixStr) > MaxPrefixSize {
|
||||
// limit prefix changes to 10k
|
||||
prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
|
||||
}
|
||||
bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
|
||||
} else {
|
||||
bmPrefix = nil
|
||||
}
|
||||
|
||||
return &Code{
|
||||
Codes: w.emitted,
|
||||
Strings: w.stringtable,
|
||||
Sets: w.settable,
|
||||
TrackCount: w.trackcount,
|
||||
Caps: w.caps,
|
||||
Capsize: capsize,
|
||||
FcPrefix: fcPrefix,
|
||||
BmPrefix: bmPrefix,
|
||||
Anchors: getAnchors(tree),
|
||||
RightToLeft: rtl,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// The main RegexCode generator. It does a depth-first walk
|
||||
// through the tree and calls EmitFragment to emits code before
|
||||
// and after each child of an interior node, and at each leaf.
|
||||
func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
|
||||
bits := InstOp(0)
|
||||
|
||||
if nodetype <= ntRef {
|
||||
if (node.options & RightToLeft) != 0 {
|
||||
bits |= Rtl
|
||||
}
|
||||
if (node.options & IgnoreCase) != 0 {
|
||||
bits |= Ci
|
||||
}
|
||||
}
|
||||
ntBits := nodeType(bits)
|
||||
|
||||
switch nodetype {
|
||||
case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
|
||||
break
|
||||
|
||||
case ntAlternate | beforeChild:
|
||||
if curIndex < len(node.children)-1 {
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Lazybranch, 0)
|
||||
}
|
||||
|
||||
case ntAlternate | afterChild:
|
||||
if curIndex < len(node.children)-1 {
|
||||
lbPos := w.popInt()
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Goto, 0)
|
||||
w.patchJump(lbPos, w.curPos())
|
||||
} else {
|
||||
for i := 0; i < curIndex; i++ {
|
||||
w.patchJump(w.popInt(), w.curPos())
|
||||
}
|
||||
}
|
||||
break
|
||||
|
||||
case ntTestref | beforeChild:
|
||||
if curIndex == 0 {
|
||||
w.emit(Setjump)
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Lazybranch, 0)
|
||||
w.emit1(Testref, w.mapCapnum(node.m))
|
||||
w.emit(Forejump)
|
||||
}
|
||||
|
||||
case ntTestref | afterChild:
|
||||
if curIndex == 0 {
|
||||
branchpos := w.popInt()
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Goto, 0)
|
||||
w.patchJump(branchpos, w.curPos())
|
||||
w.emit(Forejump)
|
||||
if len(node.children) <= 1 {
|
||||
w.patchJump(w.popInt(), w.curPos())
|
||||
}
|
||||
} else if curIndex == 1 {
|
||||
w.patchJump(w.popInt(), w.curPos())
|
||||
}
|
||||
|
||||
case ntTestgroup | beforeChild:
|
||||
if curIndex == 0 {
|
||||
w.emit(Setjump)
|
||||
w.emit(Setmark)
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Lazybranch, 0)
|
||||
}
|
||||
|
||||
case ntTestgroup | afterChild:
|
||||
if curIndex == 0 {
|
||||
w.emit(Getmark)
|
||||
w.emit(Forejump)
|
||||
} else if curIndex == 1 {
|
||||
Branchpos := w.popInt()
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Goto, 0)
|
||||
w.patchJump(Branchpos, w.curPos())
|
||||
w.emit(Getmark)
|
||||
w.emit(Forejump)
|
||||
if len(node.children) <= 2 {
|
||||
w.patchJump(w.popInt(), w.curPos())
|
||||
}
|
||||
} else if curIndex == 2 {
|
||||
w.patchJump(w.popInt(), w.curPos())
|
||||
}
|
||||
|
||||
case ntLoop | beforeChild, ntLazyloop | beforeChild:
|
||||
|
||||
if node.n < math.MaxInt32 || node.m > 1 {
|
||||
if node.m == 0 {
|
||||
w.emit1(Nullcount, 0)
|
||||
} else {
|
||||
w.emit1(Setcount, 1-node.m)
|
||||
}
|
||||
} else if node.m == 0 {
|
||||
w.emit(Nullmark)
|
||||
} else {
|
||||
w.emit(Setmark)
|
||||
}
|
||||
|
||||
if node.m == 0 {
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Goto, 0)
|
||||
}
|
||||
w.pushInt(w.curPos())
|
||||
|
||||
case ntLoop | afterChild, ntLazyloop | afterChild:
|
||||
|
||||
startJumpPos := w.curPos()
|
||||
lazy := (nodetype - (ntLoop | afterChild))
|
||||
|
||||
if node.n < math.MaxInt32 || node.m > 1 {
|
||||
if node.n == math.MaxInt32 {
|
||||
w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
|
||||
} else {
|
||||
w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
|
||||
}
|
||||
} else {
|
||||
w.emit1(InstOp(Branchmark+lazy), w.popInt())
|
||||
}
|
||||
|
||||
if node.m == 0 {
|
||||
w.patchJump(w.popInt(), startJumpPos)
|
||||
}
|
||||
|
||||
case ntGroup | beforeChild, ntGroup | afterChild:
|
||||
|
||||
case ntCapture | beforeChild:
|
||||
w.emit(Setmark)
|
||||
|
||||
case ntCapture | afterChild:
|
||||
w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
|
||||
|
||||
case ntRequire | beforeChild:
|
||||
// NOTE: the following line causes lookahead/lookbehind to be
|
||||
// NON-BACKTRACKING. It can be commented out with (*)
|
||||
w.emit(Setjump)
|
||||
|
||||
w.emit(Setmark)
|
||||
|
||||
case ntRequire | afterChild:
|
||||
w.emit(Getmark)
|
||||
|
||||
// NOTE: the following line causes lookahead/lookbehind to be
|
||||
// NON-BACKTRACKING. It can be commented out with (*)
|
||||
w.emit(Forejump)
|
||||
|
||||
case ntPrevent | beforeChild:
|
||||
w.emit(Setjump)
|
||||
w.pushInt(w.curPos())
|
||||
w.emit1(Lazybranch, 0)
|
||||
|
||||
case ntPrevent | afterChild:
|
||||
w.emit(Backjump)
|
||||
w.patchJump(w.popInt(), w.curPos())
|
||||
w.emit(Forejump)
|
||||
|
||||
case ntGreedy | beforeChild:
|
||||
w.emit(Setjump)
|
||||
|
||||
case ntGreedy | afterChild:
|
||||
w.emit(Forejump)
|
||||
|
||||
case ntOne, ntNotone:
|
||||
w.emit1(InstOp(node.t|ntBits), int(node.ch))
|
||||
|
||||
case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
|
||||
if node.m > 0 {
|
||||
if node.t == ntOneloop || node.t == ntOnelazy {
|
||||
w.emit2(Onerep|bits, int(node.ch), node.m)
|
||||
} else {
|
||||
w.emit2(Notonerep|bits, int(node.ch), node.m)
|
||||
}
|
||||
}
|
||||
if node.n > node.m {
|
||||
if node.n == math.MaxInt32 {
|
||||
w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
|
||||
} else {
|
||||
w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
|
||||
}
|
||||
}
|
||||
|
||||
case ntSetloop, ntSetlazy:
|
||||
if node.m > 0 {
|
||||
w.emit2(Setrep|bits, w.setCode(node.set), node.m)
|
||||
}
|
||||
if node.n > node.m {
|
||||
if node.n == math.MaxInt32 {
|
||||
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
|
||||
} else {
|
||||
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
|
||||
}
|
||||
}
|
||||
|
||||
case ntMulti:
|
||||
w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
|
||||
|
||||
case ntSet:
|
||||
w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
|
||||
|
||||
case ntRef:
|
||||
w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
|
||||
|
||||
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
||||
w.emit(InstOp(node.t))
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// To avoid recursion, we use a simple integer stack.
|
||||
// This is the push.
|
||||
func (w *writer) pushInt(i int) {
|
||||
w.intStack = append(w.intStack, i)
|
||||
}
|
||||
|
||||
// Returns true if the stack is empty.
|
||||
func (w *writer) emptyStack() bool {
|
||||
return len(w.intStack) == 0
|
||||
}
|
||||
|
||||
// This is the pop.
|
||||
func (w *writer) popInt() int {
|
||||
//get our item
|
||||
idx := len(w.intStack) - 1
|
||||
i := w.intStack[idx]
|
||||
//trim our slice
|
||||
w.intStack = w.intStack[:idx]
|
||||
return i
|
||||
}
|
||||
|
||||
// Returns the current position in the emitted code.
|
||||
func (w *writer) curPos() int {
|
||||
return w.curpos
|
||||
}
|
||||
|
||||
// Fixes up a jump instruction at the specified offset
|
||||
// so that it jumps to the specified jumpDest.
|
||||
func (w *writer) patchJump(offset, jumpDest int) {
|
||||
w.emitted[offset+1] = jumpDest
|
||||
}
|
||||
|
||||
// Returns an index in the set table for a charset
|
||||
// uses a map to eliminate duplicates.
|
||||
func (w *writer) setCode(set *CharSet) int {
|
||||
if w.counting {
|
||||
return 0
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
set.mapHashFill(buf)
|
||||
hash := buf.String()
|
||||
i, ok := w.sethash[hash]
|
||||
if !ok {
|
||||
i = len(w.sethash)
|
||||
w.sethash[hash] = i
|
||||
w.settable = append(w.settable, set)
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// Returns an index in the string table for a string.
|
||||
// uses a map to eliminate duplicates.
|
||||
func (w *writer) stringCode(str []rune) int {
|
||||
if w.counting {
|
||||
return 0
|
||||
}
|
||||
|
||||
hash := string(str)
|
||||
i, ok := w.stringhash[hash]
|
||||
if !ok {
|
||||
i = len(w.stringhash)
|
||||
w.stringhash[hash] = i
|
||||
w.stringtable = append(w.stringtable, str)
|
||||
}
|
||||
|
||||
return i
|
||||
}
|
||||
|
||||
// When generating code on a regex that uses a sparse set
|
||||
// of capture slots, we hash them to a dense set of indices
|
||||
// for an array of capture slots. Instead of doing the hash
|
||||
// at match time, it's done at compile time, here.
|
||||
func (w *writer) mapCapnum(capnum int) int {
|
||||
if capnum == -1 {
|
||||
return -1
|
||||
}
|
||||
|
||||
if w.caps != nil {
|
||||
return w.caps[capnum]
|
||||
}
|
||||
|
||||
return capnum
|
||||
}
|
||||
|
||||
// Emits a zero-argument operation. Note that the emit
|
||||
// functions all run in two modes: they can emit code, or
|
||||
// they can just count the size of the code.
|
||||
func (w *writer) emit(op InstOp) {
|
||||
if w.counting {
|
||||
w.count++
|
||||
if opcodeBacktracks(op) {
|
||||
w.trackcount++
|
||||
}
|
||||
return
|
||||
}
|
||||
w.emitted[w.curpos] = int(op)
|
||||
w.curpos++
|
||||
}
|
||||
|
||||
// Emits a one-argument operation.
|
||||
func (w *writer) emit1(op InstOp, opd1 int) {
|
||||
if w.counting {
|
||||
w.count += 2
|
||||
if opcodeBacktracks(op) {
|
||||
w.trackcount++
|
||||
}
|
||||
return
|
||||
}
|
||||
w.emitted[w.curpos] = int(op)
|
||||
w.curpos++
|
||||
w.emitted[w.curpos] = opd1
|
||||
w.curpos++
|
||||
}
|
||||
|
||||
// Emits a two-argument operation.
|
||||
func (w *writer) emit2(op InstOp, opd1, opd2 int) {
|
||||
if w.counting {
|
||||
w.count += 3
|
||||
if opcodeBacktracks(op) {
|
||||
w.trackcount++
|
||||
}
|
||||
return
|
||||
}
|
||||
w.emitted[w.curpos] = int(op)
|
||||
w.curpos++
|
||||
w.emitted[w.curpos] = opd1
|
||||
w.curpos++
|
||||
w.emitted[w.curpos] = opd2
|
||||
w.curpos++
|
||||
}
|
Reference in New Issue
Block a user