package tokenizer

import (
	"io"
	"sort"
	"sync"
)

const newLine = '\n'

// TokenKey is a token type identifier.
type TokenKey int

const (
	// TokenUnknown means that the token is not a built-in token and not user-defined.
	TokenUnknown TokenKey = -6
	// TokenStringFragment means that the token is a fragment of a quoted string that contains injections.
	// For example, in "one {{ two }} three" the fragments "one " and " three" are TokenStringFragment tokens.
	TokenStringFragment TokenKey = -5
	// TokenString means that the token is a quoted string.
	// For example, "one two"
	TokenString TokenKey = -4
	// TokenFloat means that the token is a floating-point number with a decimal point and/or an exponent.
	// For example, 1.2, 1e6, 1E-6
	TokenFloat TokenKey = -3
	// TokenInteger means that the token is an integer number.
	// For example, 3, 49983
	TokenInteger TokenKey = -2
	// TokenKeyword means that the token is a word.
	// For example, one, two, три
	TokenKeyword TokenKey = -1
	// TokenUndef means that the token doesn't exist.
	// When the stream is out of range of the token list, any getter or checker returns a TokenUndef token.
	TokenUndef TokenKey = 0
)

const (
	fStopOnUnknown          uint16 = 0b1
	fAllowKeywordUnderscore uint16 = 0b10
	fAllowNumberUnderscore  uint16 = 0b100
	fAllowNumberInKeyword   uint16 = 0b1000
)

// BackSlash is the backslash byte.
const BackSlash = '\\'

var defaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}

// DefaultStringEscapes is the default mapping of escaped symbols. These symbols are widely used.
var DefaultStringEscapes = map[byte]byte{
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'\\': '\\',
}

// tokenRef describes one token.
type tokenRef struct {
	// Token type. Not unique.
	Key TokenKey
	// Token value as is. Should be unique.
	Token []byte
}

// QuoteInjectSettings describes the open and close tokens of an injection.
type QuoteInjectSettings struct {
	// Token type which opens the injection in a quoted string.
	StartKey TokenKey
	// Token type which closes the injection in a quoted string.
	EndKey TokenKey
}

// StringSettings describes framed (quoted) string tokens, like quoted strings.
type StringSettings struct {
	Key          TokenKey
	StartToken   []byte
	EndToken     []byte
	EscapeSymbol byte
	SpecSymbols  map[byte]byte
	Injects      []QuoteInjectSettings
}

// AddInjection configures an injection into the string.
// An injection is a parsable fragment of a framed (quoted) string.
// It is often used for parsing placeholders or template expressions inside the framed string.
func (q *StringSettings) AddInjection(startTokenKey, endTokenKey TokenKey) *StringSettings {
	q.Injects = append(q.Injects, QuoteInjectSettings{StartKey: startTokenKey, EndKey: endTokenKey})
	return q
}

// SetEscapeSymbol sets the escape symbol for the framed (quoted) string.
// The escape symbol allows the close token of the framed string to be ignored.
// It also allows special symbols, like \n and \t, to be used inside framed strings.
func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
	q.EscapeSymbol = symbol
	return q
}

// SetSpecialSymbols sets the mapping of all escapable symbols for the escape symbol, like \n, \t, \r.
func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
	q.SpecSymbols = special
	return q
}

// Tokenizer stores all token configuration and behavior.
type Tokenizer struct {
	// bit flags
	flags uint16
	// all defined custom tokens {key: [token1, token2, ...], ...}
	tokens map[TokenKey][]*tokenRef
	// custom tokens indexed by their first byte, longest token first
	index   map[byte][]*tokenRef
	quotes  []*StringSettings
	wSpaces []byte
	pool    sync.Pool
}
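// exampleStringInjection is a usage sketch added for illustration; it is not part of the
// original source. It shows how a framed (quoted) string with injections is typically
// configured: with the settings below, a string like "hello {{ name }}" is split into
// TokenStringFragment parts around the {{ ... }} injection. The token keys and the {{ }}
// delimiters are assumptions for this sketch, not values defined by the library.
func exampleStringInjection() *Tokenizer {
	const (
		tokenCurlyOpen  TokenKey = iota + 1 // "{{" opens an injection
		tokenCurlyClose                     // "}}" closes an injection
		tokenQuoted                         // the framed (quoted) string itself
	)
	t := New()
	t.DefineTokens(tokenCurlyOpen, []string{"{{"})
	t.DefineTokens(tokenCurlyClose, []string{"}}"})
	t.DefineStringToken(tokenQuoted, `"`, `"`).
		SetEscapeSymbol(BackSlash).
		SetSpecialSymbols(DefaultStringEscapes).
		AddInjection(tokenCurlyOpen, tokenCurlyClose)
	return t
}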
// New creates a new tokenizer.
func New() *Tokenizer {
	t := Tokenizer{
		flags:   0,
		tokens:  map[TokenKey][]*tokenRef{},
		index:   map[byte][]*tokenRef{},
		quotes:  []*StringSettings{},
		wSpaces: defaultWhiteSpaces,
	}
	t.pool.New = func() interface{} {
		return new(Token)
	}
	return &t
}

// SetWhiteSpaces sets custom whitespace symbols between tokens.
// By default: {' ', '\t', '\n', '\r'}
func (t *Tokenizer) SetWhiteSpaces(ws []byte) *Tokenizer {
	t.wSpaces = ws
	return t
}

// StopOnUndefinedToken stops parsing when an unknown token is detected.
func (t *Tokenizer) StopOnUndefinedToken() *Tokenizer {
	t.flags |= fStopOnUnknown
	return t
}

// AllowKeywordUnderscore allows the underscore symbol in keywords, like `one_two` or `_three`.
func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
	t.flags |= fAllowKeywordUnderscore
	return t
}

// AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`.
// The keyword itself must not start with a number,
// and there must be no spaces between letters and numbers.
func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
	t.flags |= fAllowNumberInKeyword
	return t
}

// DefineTokens adds custom tokens.
// Here `key` is the unique identifier of `tokens`, and `tokens` is a slice of token strings.
// If the key already exists, its tokens will be overwritten.
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
	var tks []*tokenRef
	if key < 1 {
		return t
	}
	for _, token := range tokens {
		ref := tokenRef{
			Key:   key,
			Token: s2b(token),
		}
		head := ref.Token[0]
		tks = append(tks, &ref)
		if t.index[head] == nil {
			t.index[head] = []*tokenRef{}
		}
		t.index[head] = append(t.index[head], &ref)
		// Keep longer tokens first so the parser matches the longest prefix.
		sort.Slice(t.index[head], func(i, j int) bool {
			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
		})
	}
	t.tokens[key] = tks
	return t
}

// DefineStringToken defines a framed (quoted) string token.
// For example, a piece of data surrounded by quotes: "string in quotes" or 'string in single quotes'.
// Arguments startToken and endToken define the open and close "quotes".
//   - t.DefineStringToken(key, "`", "`"): the string "one `two three`" will be parsed as
//     [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
//   - t.DefineStringToken(key, "//", "\n"): the string "parse // like comment\n" will be parsed as
//     [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
	q := &StringSettings{
		Key:        key,
		StartToken: s2b(startToken),
		EndToken:   s2b(endToken),
	}
	if q.StartToken == nil {
		return q
	}
	t.quotes = append(t.quotes, q)
	return q
}

func (t *Tokenizer) allocToken() *Token {
	return t.pool.Get().(*Token)
}

func (t *Tokenizer) freeToken(token *Token) {
	token.next = nil
	token.prev = nil
	token.value = nil
	token.indent = nil
	token.offset = 0
	token.line = 0
	token.id = 0
	token.key = 0
	token.string = nil
	t.pool.Put(token)
}

// ParseString parses the string into tokens.
func (t *Tokenizer) ParseString(str string) *Stream {
	return t.ParseBytes(s2b(str))
}

// ParseBytes parses the byte slice into tokens.
func (t *Tokenizer) ParseBytes(str []byte) *Stream {
	p := newParser(t, str)
	p.parse()
	return NewStream(p)
}

// ParseStream parses the reader's data into tokens.
func (t *Tokenizer) ParseStream(r io.Reader, bufferSize uint) *Stream {
	p := newInfParser(t, r, bufferSize)
	p.preload()
	p.parse()
	return NewInfStream(p)
}
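// exampleUsage is a usage sketch added for illustration; it is not part of the original
// source. It shows how the pieces defined above fit together: define custom tokens,
// define a quoted-string token, then parse either an in-memory string or an io.Reader.
// The token keys, the sample expression, and the buffer size are assumptions for this
// sketch; the Token and Stream types are defined elsewhere in this package.
func exampleUsage(r io.Reader) (*Stream, *Stream) {
	const (
		tokenEquality TokenKey = iota + 1 // comparison operators
		tokenDQuote                       // double-quoted strings
	)
	t := New().
		AllowKeywordUnderscore(). // accept identifiers like user_id
		AllowNumbersInKeyword()   // accept identifiers like r2d2
	t.DefineTokens(tokenEquality, []string{"<", "<=", "==", ">=", ">", "!="})
	t.DefineStringToken(tokenDQuote, `"`, `"`).SetEscapeSymbol(BackSlash)

	// Parse a whole string held in memory.
	inMemory := t.ParseString(`user_id == 119 and modified > "2022-06-01"`)
	// Parse from a reader, tokenizing as data is read (4 KiB buffer).
	streamed := t.ParseStream(r, 4096)
	return inMemory, streamed
}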