package tokenizer

import (
	"io"
	"sort"
	"sync"
)

const newLine = '\n'

// TokenKey token type identifier
type TokenKey int

const (
	// TokenUnknown means that this token is neither an embedded token nor a user-defined one.
	TokenUnknown TokenKey = -6
	// TokenStringFragment means that this token is a fragment of a quoted string with injections.
	// For example, in "one {{ two }} three", the parts "one " and " three" are TokenStringFragment tokens.
	TokenStringFragment TokenKey = -5
	// TokenString means that this token is a quoted string.
	// For example, "one two"
	TokenString TokenKey = -4
	// TokenFloat means that this token is a float number with a decimal point and/or an exponent.
	// For example, 1.2, 1e6, 1E-6
	TokenFloat TokenKey = -3
	// TokenInteger means that this token is an integer number.
	// For example, 3, 49983
	TokenInteger TokenKey = -2
	// TokenKeyword means that this token is a word.
	// For example, one, two, три
	TokenKeyword TokenKey = -1
	// TokenUndef means that the token doesn't exist.
	// When the stream is out of range of the token list, any getter or checker returns a TokenUndef token.
	TokenUndef TokenKey = 0
)
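
// User-defined token keys must be positive: DefineTokens ignores keys less than 1.
// A minimal sketch of declaring custom keys (the names below are illustrative):
//
//	const (
//		TokenCurlyOpen TokenKey = iota + 1
//		TokenCurlyClose
//	)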

const (
	fStopOnUnknown          uint16 = 0b1
	fAllowKeywordUnderscore uint16 = 0b10
	fAllowNumberUnderscore  uint16 = 0b100
	fAllowNumberInKeyword   uint16 = 0b1000
)

// BackSlash is the backslash byte.
const BackSlash = '\\'

var defaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}

// DefaultStringEscapes is the default set of escaped symbols. These escapes are commonly used.
var DefaultStringEscapes = map[byte]byte{
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'\\': '\\',
}

// tokenRef describes one token.
type tokenRef struct {
	// Token type. Not unique.
	Key TokenKey
	// Token value as is. Should be unique.
	Token []byte
}

// QuoteInjectSettings describes the open and close tokens of an injection.
type QuoteInjectSettings struct {
	// Token type which opens the quoted string.
	StartKey TokenKey
	// Token type which closes the quoted string.
	EndKey TokenKey
}

// StringSettings describes the configuration of framed (quoted) string tokens.
type StringSettings struct {
	Key          TokenKey
	StartToken   []byte
	EndToken     []byte
	EscapeSymbol byte
	SpecSymbols  map[byte]byte
	Injects      []QuoteInjectSettings
}

// AddInjection configures an injection into the framed (quoted) string.
// An injection is a parsable fragment of a framed (quoted) string.
// It is often used for parsing placeholders or template expressions inside the framed string.
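//
// A minimal sketch, reusing the illustrative TokenCurlyOpen and TokenCurlyClose
// keys from the TokenKey section above:
//
//	t := New()
//	t.DefineTokens(TokenCurlyOpen, []string{"{{"})
//	t.DefineTokens(TokenCurlyClose, []string{"}}"})
//	t.DefineStringToken(TokenString, `"`, `"`).AddInjection(TokenCurlyOpen, TokenCurlyClose)
//
// With this configuration, "one {{ two }} three" yields TokenStringFragment
// tokens for "one " and " three" around the injected tokens.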
func (q *StringSettings) AddInjection(startTokenKey, endTokenKey TokenKey) *StringSettings {
	q.Injects = append(q.Injects, QuoteInjectSettings{StartKey: startTokenKey, EndKey: endTokenKey})
	return q
}

// SetEscapeSymbol sets the escape symbol for the framed (quoted) string.
// The escape symbol allows ignoring the close token inside the framed string.
// It also allows using special symbols in framed strings, like \n and \t.
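//
// For example:
//
//	t.DefineStringToken(TokenString, `"`, `"`).SetEscapeSymbol(BackSlash)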
func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
	q.EscapeSymbol = symbol
	return q
}

// SetSpecialSymbols sets the mapping of all escapable symbols for the escape symbol, like \n, \t, \r.
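//
// For example, to enable the common escapes from DefaultStringEscapes:
//
//	t.DefineStringToken(TokenString, `"`, `"`).
//		SetEscapeSymbol(BackSlash).
//		SetSpecialSymbols(DefaultStringEscapes)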
func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
	q.SpecSymbols = special
	return q
}

// Tokenizer stores all token configuration and behavior.
type Tokenizer struct {
	// bit flags
	flags uint16
	// all defined custom tokens {key: [token1, token2, ...], ...}
	tokens  map[TokenKey][]*tokenRef
	// index maps the first byte of a token to all tokens starting with that byte, sorted longest first
	index map[byte][]*tokenRef
	// quotes holds the framed (quoted) string settings
	quotes []*StringSettings
	// wSpaces lists the whitespace symbols between tokens
	wSpaces []byte
	// pool recycles Token objects to reduce allocations
	pool sync.Pool
}

// New creates a new tokenizer.
func New() *Tokenizer {
	t := Tokenizer{
		flags:   0,
		tokens:  map[TokenKey][]*tokenRef{},
		index:   map[byte][]*tokenRef{},
		quotes:  []*StringSettings{},
		wSpaces: defaultWhiteSpaces,
	}
	t.pool.New = func() interface{} {
		return new(Token)
	}
	return &t
}

// SetWhiteSpaces sets custom whitespace symbols between tokens.
// By default: {' ', '\t', '\n', '\r'}
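//
// For example, to also treat the vertical tab as whitespace:
//
//	t.SetWhiteSpaces([]byte{' ', '\t', '\n', '\r', '\v'})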
func (t *Tokenizer) SetWhiteSpaces(ws []byte) *Tokenizer {
	t.wSpaces = ws
	return t
}

// StopOnUndefinedToken stops parsing if an unknown token is detected.
func (t *Tokenizer) StopOnUndefinedToken() *Tokenizer {
	t.flags |= fStopOnUnknown
	return t
}

// AllowKeywordUnderscore allows the underscore symbol in keywords, like `one_two` or `_three`.
func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
	t.flags |= fAllowKeywordUnderscore
	return t
}

// AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`.
// The keyword itself must not start with a number,
// and there must be no spaces between letters and numbers.
func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
	t.flags |= fAllowNumberInKeyword
	return t
}

// DefineTokens adds custom tokens.
// Here `key` is the unique identifier of `tokens`, and `tokens` is a slice of token strings.
// If the key already exists, its tokens will be rewritten.
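//
// A minimal sketch, assuming TokenOperator is a user-defined token key:
//
//	t := New()
//	t.DefineTokens(TokenOperator, []string{"+", "-", "*", "/"})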
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
	var tks []*tokenRef
	if key < 1 {
		return t
	}
	for _, token := range tokens {
		if len(token) == 0 {
			// skip empty tokens: there is no head byte to index
			continue
		}
		ref := tokenRef{
			Key:   key,
			Token: s2b(token),
		}
		head := ref.Token[0]
		tks = append(tks, &ref)
		if t.index[head] == nil {
			t.index[head] = []*tokenRef{}
		}
		t.index[head] = append(t.index[head], &ref)
		sort.Slice(t.index[head], func(i, j int) bool {
			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
		})
	}
	t.tokens[key] = tks

	return t
}

// DefineStringToken defines a framed string token.
// For example, a piece of data surrounded by quotes: "string in quotes" or 'string in single quotes'.
// The arguments startToken and endToken define the open and close "quotes".
//  - t.DefineStringToken(TokenString, "`", "`") parses the string "one `two three`" as
//    [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
//  - t.DefineStringToken(TokenString, "//", "\n") parses the string "parse // like comment\n" as
//    [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
	q := &StringSettings{
		Key:        key,
		StartToken: s2b(startToken),
		EndToken:   s2b(endToken),
	}
	if len(q.StartToken) == 0 {
		// an empty start token cannot open a framed string; do not register it
		return q
	}
	t.quotes = append(t.quotes, q)

	return q
}

// allocToken takes a reusable Token from the pool.
func (t *Tokenizer) allocToken() *Token {
	return t.pool.Get().(*Token)
}

// freeToken resets the token and returns it to the pool.
func (t *Tokenizer) freeToken(token *Token) {
	token.next = nil
	token.prev = nil
	token.value = nil
	token.indent = nil
	token.offset = 0
	token.line = 0
	token.id = 0
	token.key = 0
	token.string = nil
	t.pool.Put(token)
}

// ParseString parses the string into tokens.
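//
// A usage sketch; IsValid, CurrentToken, GoNext and Close are assumed from
// this package's Stream and Token API, which is defined outside this file:
//
//	stream := t.ParseString(`one "two three"`)
//	defer stream.Close()
//	for stream.IsValid() {
//		// inspect stream.CurrentToken() here
//		stream.GoNext()
//	}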
func (t *Tokenizer) ParseString(str string) *Stream {
	return t.ParseBytes(s2b(str))
}

// ParseBytes parses the byte slice into tokens.
func (t *Tokenizer) ParseBytes(str []byte) *Stream {
	p := newParser(t, str)
	p.parse()
	return NewStream(p)
}

// ParseStream parses the reader's data into tokens.
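//
// A minimal sketch (Stream.Close is assumed from this package's stream API;
// bufferSize is the size of the read buffer in bytes):
//
//	stream := t.ParseStream(strings.NewReader("one two three"), 4096)
//	defer stream.Close()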
func (t *Tokenizer) ParseStream(r io.Reader, bufferSize uint) *Stream {
	p := newInfParser(t, r, bufferSize)
	p.preload()
	p.parse()
	return NewInfStream(p)
}