package syntax import ( "bytes" "encoding/binary" "fmt" "sort" "unicode" "unicode/utf8" ) // CharSet combines start-end rune ranges and unicode categories representing a set of characters type CharSet struct { ranges []singleRange categories []category sub *CharSet //optional subtractor negate bool anything bool } type category struct { negate bool cat string } type singleRange struct { first rune last rune } const ( spaceCategoryText = " " wordCategoryText = "W" ) var ( ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00} ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b} ecmaDigit = []rune{0x0030, 0x003a} ) var ( AnyClass = getCharSetFromOldString([]rune{0}, false) ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) NoneClass = getCharSetFromOldString(nil, false) ECMAWordClass = getCharSetFromOldString(ecmaWord, false) NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) WordClass = getCharSetFromCategoryString(false, false, wordCategoryText) NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText) SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText) NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText) DigitClass = getCharSetFromCategoryString(false, false, "Nd") NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") ) var unicodeCategories = func() map[string]*unicode.RangeTable { retVal := make(map[string]*unicode.RangeTable) for k, v := range unicode.Scripts { retVal[k] = v } for k, v := range unicode.Categories { retVal[k] = v } for k, v := range unicode.Properties { retVal[k] = v } return retVal }() func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet { if negateCat && negateSet { panic("BUG! You should only negate the set OR the category in a constant setup, but not both") } c := CharSet{negate: negateSet} c.categories = make([]category, len(cats)) for i, cat := range cats { c.categories[i] = category{cat: cat, negate: negateCat} } return func() *CharSet { //make a copy each time local := c //return that address return &local } } func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet { c := CharSet{} if len(setText) > 0 { fillFirst := false l := len(setText) if negate { if setText[0] == 0 { setText = setText[1:] } else { l++ fillFirst = true } } if l%2 == 0 { c.ranges = make([]singleRange, l/2) } else { c.ranges = make([]singleRange, l/2+1) } first := true if fillFirst { c.ranges[0] = singleRange{first: 0} first = false } i := 0 for _, r := range setText { if first { // lower bound in a new range c.ranges[i] = singleRange{first: r} first = false } else { c.ranges[i].last = r - 1 i++ first = true } } if !first { c.ranges[i].last = utf8.MaxRune } } return func() *CharSet { local := c return &local } } // Copy makes a deep copy to prevent accidental mutation of a set func (c CharSet) Copy() CharSet { ret := CharSet{ anything: c.anything, negate: c.negate, } ret.ranges = append(ret.ranges, c.ranges...) ret.categories = append(ret.categories, c.categories...) if c.sub != nil { sub := c.sub.Copy() ret.sub = &sub } return ret } // gets a human-readable description for a set string func (c CharSet) String() string { buf := &bytes.Buffer{} buf.WriteRune('[') if c.IsNegated() { buf.WriteRune('^') } for _, r := range c.ranges { buf.WriteString(CharDescription(r.first)) if r.first != r.last { if r.last-r.first != 1 { //groups that are 1 char apart skip the dash buf.WriteRune('-') } buf.WriteString(CharDescription(r.last)) } } for _, c := range c.categories { buf.WriteString(c.String()) } if c.sub != nil { buf.WriteRune('-') buf.WriteString(c.sub.String()) } buf.WriteRune(']') return buf.String() } // mapHashFill converts a charset into a buffer for use in maps func (c CharSet) mapHashFill(buf *bytes.Buffer) { if c.negate { buf.WriteByte(0) } else { buf.WriteByte(1) } binary.Write(buf, binary.LittleEndian, len(c.ranges)) binary.Write(buf, binary.LittleEndian, len(c.categories)) for _, r := range c.ranges { buf.WriteRune(r.first) buf.WriteRune(r.last) } for _, ct := range c.categories { buf.WriteString(ct.cat) if ct.negate { buf.WriteByte(1) } else { buf.WriteByte(0) } } if c.sub != nil { c.sub.mapHashFill(buf) } } // CharIn returns true if the rune is in our character set (either ranges or categories). // It handles negations and subtracted sub-charsets. func (c CharSet) CharIn(ch rune) bool { val := false // in s && !s.subtracted //check ranges for _, r := range c.ranges { if ch < r.first { continue } if ch <= r.last { val = true break } } //check categories if we haven't already found a range if !val && len(c.categories) > 0 { for _, ct := range c.categories { // special categories...then unicode if ct.cat == spaceCategoryText { if unicode.IsSpace(ch) { // we found a space so we're done // negate means this is a "bad" thing val = !ct.negate break } else if ct.negate { val = true break } } else if ct.cat == wordCategoryText { if IsWordChar(ch) { val = !ct.negate break } else if ct.negate { val = true break } } else if unicode.Is(unicodeCategories[ct.cat], ch) { // if we're in this unicode category then we're done // if negate=true on this category then we "failed" our test // otherwise we're good that we found it val = !ct.negate break } else if ct.negate { val = true break } } } // negate the whole char set if c.negate { val = !val } // get subtracted recurse if val && c.sub != nil { val = !c.sub.CharIn(ch) } //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val) return val } func (c category) String() string { switch c.cat { case spaceCategoryText: if c.negate { return "\\S" } return "\\s" case wordCategoryText: if c.negate { return "\\W" } return "\\w" } if _, ok := unicodeCategories[c.cat]; ok { if c.negate { return "\\P{" + c.cat + "}" } return "\\p{" + c.cat + "}" } return "Unknown category: " + c.cat } // CharDescription Produces a human-readable description for a single character. func CharDescription(ch rune) string { /*if ch == '\\' { return "\\\\" } if ch > ' ' && ch <= '~' { return string(ch) } else if ch == '\n' { return "\\n" } else if ch == ' ' { return "\\ " }*/ b := &bytes.Buffer{} escape(b, ch, false) //fmt.Sprintf("%U", ch) return b.String() } // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) // RL 1.4 Simple Word Boundaries The class of includes all Alphabetic // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. func IsWordChar(r rune) bool { //"L", "Mn", "Nd", "Pc" return unicode.In(r, unicode.Categories["L"], unicode.Categories["Mn"], unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C' //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' } func IsECMAWordChar(r rune) bool { return unicode.In(r, unicode.Categories["L"], unicode.Categories["Mn"], unicode.Categories["Nd"], unicode.Categories["Pc"]) //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' } // SingletonChar will return the char from the first range without validation. // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input func (c CharSet) SingletonChar() rune { return c.ranges[0].first } func (c CharSet) IsSingleton() bool { return !c.negate && //negated is multiple chars len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars c.sub == nil && // subtraction means we've got multiple chars c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char } func (c CharSet) IsSingletonInverse() bool { return c.negate && //same as above, but requires negated len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars c.sub == nil && // subtraction means we've got multiple chars c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char } func (c CharSet) IsMergeable() bool { return !c.IsNegated() && !c.HasSubtraction() } func (c CharSet) IsNegated() bool { return c.negate } func (c CharSet) HasSubtraction() bool { return c.sub != nil } func (c CharSet) IsEmpty() bool { return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil } func (c *CharSet) addDigit(ecma, negate bool, pattern string) { if ecma { if negate { c.addRanges(NotECMADigitClass().ranges) } else { c.addRanges(ECMADigitClass().ranges) } } else { c.addCategories(category{cat: "Nd", negate: negate}) } } func (c *CharSet) addChar(ch rune) { c.addRange(ch, ch) } func (c *CharSet) addSpace(ecma, negate bool) { if ecma { if negate { c.addRanges(NotECMASpaceClass().ranges) } else { c.addRanges(ECMASpaceClass().ranges) } } else { c.addCategories(category{cat: spaceCategoryText, negate: negate}) } } func (c *CharSet) addWord(ecma, negate bool) { if ecma { if negate { c.addRanges(NotECMAWordClass().ranges) } else { c.addRanges(ECMAWordClass().ranges) } } else { c.addCategories(category{cat: wordCategoryText, negate: negate}) } } // Add set ranges and categories into ours -- no deduping or anything func (c *CharSet) addSet(set CharSet) { if c.anything { return } if set.anything { c.makeAnything() return } // just append here to prevent double-canon c.ranges = append(c.ranges, set.ranges...) c.addCategories(set.categories...) c.canonicalize() } func (c *CharSet) makeAnything() { c.anything = true c.categories = []category{} c.ranges = AnyClass().ranges } func (c *CharSet) addCategories(cats ...category) { // don't add dupes and remove positive+negative if c.anything { // if we've had a previous positive+negative group then // just return, we're as broad as we can get return } for _, ct := range cats { found := false for _, ct2 := range c.categories { if ct.cat == ct2.cat { if ct.negate != ct2.negate { // oposite negations...this mean we just // take us as anything and move on c.makeAnything() return } found = true break } } if !found { c.categories = append(c.categories, ct) } } } // Merges new ranges to our own func (c *CharSet) addRanges(ranges []singleRange) { if c.anything { return } c.ranges = append(c.ranges, ranges...) c.canonicalize() } func isValidUnicodeCat(catName string) bool { _, ok := unicodeCategories[catName] return ok } func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) { if !isValidUnicodeCat(categoryName) { // unknown unicode category, script, or property "blah" panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName)) } if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") { // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match c.addCategories( category{cat: "Ll", negate: negate}, category{cat: "Lu", negate: negate}, category{cat: "Lt", negate: negate}) } c.addCategories(category{cat: categoryName, negate: negate}) } func (c *CharSet) addSubtraction(sub *CharSet) { c.sub = sub } func (c *CharSet) addRange(chMin, chMax rune) { c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax}) c.canonicalize() } type singleRangeSorter []singleRange func (p singleRangeSorter) Len() int { return len(p) } func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first } func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] } // Logic to reduce a character class to a unique, sorted form. func (c *CharSet) canonicalize() { var i, j int var last rune // // Find and eliminate overlapping or abutting ranges // if len(c.ranges) > 1 { sort.Sort(singleRangeSorter(c.ranges)) done := false for i, j = 1, 0; ; i++ { for last = c.ranges[j].last; ; i++ { if i == len(c.ranges) || last == utf8.MaxRune { done = true break } CurrentRange := c.ranges[i] if CurrentRange.first > last+1 { break } if last < CurrentRange.last { last = CurrentRange.last } } c.ranges[j] = singleRange{first: c.ranges[j].first, last: last} j++ if done { break } if j < i { c.ranges[j] = c.ranges[i] } } c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...) } } // Adds to the class any lowercase versions of characters already // in the class. Used for case-insensitivity. func (c *CharSet) addLowercase() { if c.anything { return } toAdd := []singleRange{} for i := 0; i < len(c.ranges); i++ { r := c.ranges[i] if r.first == r.last { lower := unicode.ToLower(r.first) c.ranges[i] = singleRange{first: lower, last: lower} } else { toAdd = append(toAdd, r) } } for _, r := range toAdd { c.addLowercaseRange(r.first, r.last) } c.canonicalize() } /************************************************************************** Let U be the set of Unicode character values and let L be the lowercase function, mapping from U to U. To perform case insensitive matching of character sets, we need to be able to map an interval I in U, say I = [chMin, chMax] = { ch : chMin <= ch <= chMax } to a set A such that A contains L(I) and A is contained in the union of I and L(I). The table below partitions U into intervals on which L is non-decreasing. Thus, for any interval J = [a, b] contained in one of these intervals, L(J) is contained in [L(a), L(b)]. It is also true that for any such J, [L(a), L(b)] is contained in the union of J and L(J). This does not follow from L being non-decreasing on these intervals. It follows from the nature of the L on each interval. On each interval, L has one of the following forms: (1) L(ch) = constant (LowercaseSet) (2) L(ch) = ch + offset (LowercaseAdd) (3) L(ch) = ch | 1 (LowercaseBor) (4) L(ch) = ch + (ch & 1) (LowercaseBad) It is easy to verify that for any of these forms [L(a), L(b)] is contained in the union of [a, b] and L([a, b]). ***************************************************************************/ const ( LowercaseSet = 0 // Set to arg. LowercaseAdd = 1 // Add arg. LowercaseBor = 2 // Bitwise or with 1. LowercaseBad = 3 // Bitwise and with 1 and add original. ) type lcMap struct { chMin, chMax rune op, data int32 } var lcTable = []lcMap{ lcMap{'\u0041', '\u005A', LowercaseAdd, 32}, lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32}, lcMap{'\u0100', '\u012E', LowercaseBor, 0}, lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069}, lcMap{'\u0132', '\u0136', LowercaseBor, 0}, lcMap{'\u0139', '\u0147', LowercaseBad, 0}, lcMap{'\u014A', '\u0176', LowercaseBor, 0}, lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF}, lcMap{'\u0179', '\u017D', LowercaseBad, 0}, lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253}, lcMap{'\u0182', '\u0184', LowercaseBor, 0}, lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254}, lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188}, lcMap{'\u0189', '\u018A', LowercaseAdd, 205}, lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C}, lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD}, lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259}, lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B}, lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192}, lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260}, lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263}, lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269}, lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268}, lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199}, lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F}, lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272}, lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275}, lcMap{'\u01A0', '\u01A4', LowercaseBor, 0}, lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8}, lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283}, lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD}, lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288}, lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0}, lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217}, lcMap{'\u01B3', '\u01B5', LowercaseBad, 0}, lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292}, lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9}, lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD}, lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6}, lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9}, lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC}, lcMap{'\u01CD', '\u01DB', LowercaseBad, 0}, lcMap{'\u01DE', '\u01EE', LowercaseBor, 0}, lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3}, lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5}, lcMap{'\u01FA', '\u0216', LowercaseBor, 0}, lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC}, lcMap{'\u0388', '\u038A', LowercaseAdd, 37}, lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC}, lcMap{'\u038E', '\u038F', LowercaseAdd, 63}, lcMap{'\u0391', '\u03AB', LowercaseAdd, 32}, lcMap{'\u03E2', '\u03EE', LowercaseBor, 0}, lcMap{'\u0401', '\u040F', LowercaseAdd, 80}, lcMap{'\u0410', '\u042F', LowercaseAdd, 32}, lcMap{'\u0460', '\u0480', LowercaseBor, 0}, lcMap{'\u0490', '\u04BE', LowercaseBor, 0}, lcMap{'\u04C1', '\u04C3', LowercaseBad, 0}, lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8}, lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC}, lcMap{'\u04D0', '\u04EA', LowercaseBor, 0}, lcMap{'\u04EE', '\u04F4', LowercaseBor, 0}, lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9}, lcMap{'\u0531', '\u0556', LowercaseAdd, 48}, lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48}, lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0}, lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8}, lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8}, lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8}, lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8}, lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8}, lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51}, lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53}, lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55}, lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57}, lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8}, lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8}, lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8}, lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8}, lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8}, lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74}, lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3}, lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86}, lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3}, lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8}, lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100}, lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8}, lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112}, lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5}, lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128}, lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126}, lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3}, lcMap{'\u2160', '\u216F', LowercaseAdd, 16}, lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26}, lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32}, } func (c *CharSet) addLowercaseRange(chMin, chMax rune) { var i, iMax, iMid int var chMinT, chMaxT rune var lc lcMap for i, iMax = 0, len(lcTable); i < iMax; { iMid = (i + iMax) / 2 if lcTable[iMid].chMax < chMin { i = iMid + 1 } else { iMax = iMid } } for ; i < len(lcTable); i++ { lc = lcTable[i] if lc.chMin > chMax { return } chMinT = lc.chMin if chMinT < chMin { chMinT = chMin } chMaxT = lc.chMax if chMaxT > chMax { chMaxT = chMax } switch lc.op { case LowercaseSet: chMinT = rune(lc.data) chMaxT = rune(lc.data) break case LowercaseAdd: chMinT += lc.data chMaxT += lc.data break case LowercaseBor: chMinT |= 1 chMaxT |= 1 break case LowercaseBad: chMinT += (chMinT & 1) chMaxT += (chMaxT & 1) break } if chMinT < chMin || chMaxT > chMax { c.addRange(chMinT, chMaxT) } } }