tokenizer.go
    package tokenizer
    
    import (
    	"io"
    	"sort"
    	"sync"
    )
    
    const newLine = '\n'
    
    // TokenKey is a token type identifier.
    type TokenKey int
    
    const (
    	// TokenUnknown means that this token is neither an embedded token nor a user-defined one.
    	TokenUnknown TokenKey = -6
    	// TokenStringFragment means that this token is only a fragment of a quoted string with injections.
    	// For example, in "one {{ two }} three", the parts "one " and " three" are TokenStringFragment.
    	TokenStringFragment TokenKey = -5
    	// TokenString means that this token is a quoted string.
    	// For example, "one two"
    	TokenString TokenKey = -4
    	// TokenFloat means that this token is a floating-point number with a decimal point and/or an exponent.
    	// For example, 1.2, 1e6, 1E-6
    	TokenFloat TokenKey = -3
    	// TokenInteger means that this token is an integer number.
    	// For example, 3, 49983
    	TokenInteger TokenKey = -2
    	// TokenKeyword means that this token is a word.
    	// For example, one, two, три
    	TokenKeyword TokenKey = -1
    	// TokenUndef means that the token doesn't exist.
    	// When the stream runs out of the token list's range, any getter or checker will return the TokenUndef token.
    	TokenUndef TokenKey = 0
    )
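
    // Note: the keys above are reserved for built-in token types. User-defined token keys must
    // be positive (>= 1), since DefineTokens silently ignores smaller keys. A minimal sketch of
    // declaring custom keys (the names are illustrative, not part of this package):
    //
    //	const (
    //		TokenCurlyOpen  TokenKey = iota + 1 // e.g. "{{"
    //		TokenCurlyClose                     // e.g. "}}"
    //	)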
    
    const (
    	fStopOnUnknown          uint16 = 0b1
    	fAllowKeywordUnderscore uint16 = 0b10
    	fAllowNumberUnderscore  uint16 = 0b100
    	fAllowNumberInKeyword   uint16 = 0b1000
    )
    
    // BackSlash is the backslash byte.
    const BackSlash = '\\'
    
    var defaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}
    
    // DefaultStringEscapes is the default mapping of escaped symbols. These escapes are commonly used.
    var DefaultStringEscapes = map[byte]byte{
    	'n':  '\n',
    	'r':  '\r',
    	't':  '\t',
    	'\\': '\\',
    }
    
    // tokenRef describes one token.
    type tokenRef struct {
    	// Token type. Not unique.
    	Key TokenKey
    	// Token value as is. Should be unique.
    	Token []byte
    }
    
    // QuoteInjectSettings describes the open and close tokens of an injection inside a quoted string.
    type QuoteInjectSettings struct {
    	// Token type which opens the injection.
    	StartKey TokenKey
    	// Token type which closes the injection.
    	EndKey TokenKey
    }
    
    // StringSettings describes framed (quoted) string tokens, such as strings in quotes.
    type StringSettings struct {
    	Key          TokenKey
    	StartToken   []byte
    	EndToken     []byte
    	EscapeSymbol byte
    	SpecSymbols  map[byte]byte
    	Injects      []QuoteInjectSettings
    }
    
    // AddInjection configures an injection into the string.
    // An injection is a parsable fragment of a framed (quoted) string.
    // It is often used to parse placeholders or template expressions inside the framed string.
    func (q *StringSettings) AddInjection(startTokenKey, endTokenKey TokenKey) *StringSettings {
    	q.Injects = append(q.Injects, QuoteInjectSettings{StartKey: startTokenKey, EndKey: endTokenKey})
    	return q
    }
    
    // SetEscapeSymbol sets the escape symbol for the framed (quoted) string.
    // The escape symbol allows ignoring the close token of the framed string.
    // It also allows using special symbols in framed strings, like \n and \t.
    func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
    	q.EscapeSymbol = symbol
    	return q
    }
    
    // SetSpecialSymbols sets the mapping of all escapable symbols for the escape symbol, like \n, \t, \r.
    func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
    	q.SpecSymbols = special
    	return q
    }
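
    // A minimal sketch of configuring escapes on a framed string (TokenQuoted and the
    // tokenizer t are illustrative assumptions, not defined in this file):
    //
    //	t.DefineStringToken(TokenQuoted, `"`, `"`).
    //		SetEscapeSymbol(BackSlash).
    //		SetSpecialSymbols(DefaultStringEscapes)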
    
    // Tokenizer stores the configuration of all tokens and parsing behaviors.
    type Tokenizer struct {
    	// bit flags
    	flags uint16
    	// all defined custom tokens {key: [token1, token2, ...], ...}
    	tokens  map[TokenKey][]*tokenRef
    	index   map[byte][]*tokenRef
    	quotes  []*StringSettings
    	wSpaces []byte
    	pool    sync.Pool
    }
    
    // New creates a new tokenizer.
    func New() *Tokenizer {
    	t := Tokenizer{
    		flags:   0,
    		tokens:  map[TokenKey][]*tokenRef{},
    		index:   map[byte][]*tokenRef{},
    		quotes:  []*StringSettings{},
    		wSpaces: defaultWhiteSpaces,
    	}
    	t.pool.New = func() interface{} {
    		return new(Token)
    	}
    	return &t
    }
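
    // A minimal end-to-end sketch (TokenEquals is an illustrative user-defined key; how the
    // returned Stream is consumed is defined elsewhere in this package):
    //
    //	const TokenEquals TokenKey = 1
    //	t := New()
    //	t.DefineTokens(TokenEquals, []string{"=", ":="})
    //	stream := t.ParseString("answer = 42")
    //	_ = stream // walk the tokens via the Stream accessors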
    
    // SetWhiteSpaces sets custom whitespace symbols between tokens.
    // By default: {' ', '\t', '\n', '\r'}
    func (t *Tokenizer) SetWhiteSpaces(ws []byte) *Tokenizer {
    	t.wSpaces = ws
    	return t
    }
    
    // StopOnUndefinedToken stops parsing if an unknown token is detected.
    func (t *Tokenizer) StopOnUndefinedToken() *Tokenizer {
    	t.flags |= fStopOnUnknown
    	return t
    }
    
    // AllowKeywordUnderscore allows the underscore symbol in keywords, like `one_two` or `_three`.
    func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
    	t.flags |= fAllowKeywordUnderscore
    	return t
    }
    
    // AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`.
    // The keyword itself must not start with a number, and there must be no spaces between letters and numbers.
    func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
    	t.flags |= fAllowNumberInKeyword
    	return t
    }
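
    // All flag setters return the tokenizer itself, so the configuration can be chained; a short sketch:
    //
    //	t := New().
    //		StopOnUndefinedToken().
    //		AllowKeywordUnderscore().
    //		AllowNumbersInKeyword()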
    
    // DefineTokens adds custom tokens.
    // Here `key` is the unique identifier of `tokens`, and `tokens` is a slice of token strings.
    // If the key already exists, its tokens will be rewritten.
    func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
    	var tks []*tokenRef
    	if key < 1 {
    		return t
    	}
    	for _, token := range tokens {
    		ref := tokenRef{
    			Key:   key,
    			Token: s2b(token),
    		}
    		head := ref.Token[0]
    		tks = append(tks, &ref)
    		if t.index[head] == nil {
    			t.index[head] = []*tokenRef{}
    		}
    		t.index[head] = append(t.index[head], &ref)
    		sort.Slice(t.index[head], func(i, j int) bool {
    			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
    		})
    	}
    	t.tokens[key] = tks
    
    	return t
    }
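
    // Sketch: several alternative spellings can share one key, and tokens sharing a first byte
    // are indexed longest-first, so ">=" can match before ">" (the key value and the tokenizer t
    // are illustrative):
    //
    //	const TokenOperator TokenKey = 10
    //	t.DefineTokens(TokenOperator, []string{">", ">=", "<", "<=", "==", "="})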
    
    // DefineStringToken defines a framed (quoted) string token.
    // For example, a piece of data surrounded by quotes: "string in quotes" or 'string in single quotes'.
    // The arguments startToken and endToken define the open and close "quotes".
    //   - t.DefineStringToken(TokenString, "`", "`") - the string "one `two three`" will be parsed as
    //     [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
    //   - t.DefineStringToken(TokenString, "//", "\n") - the string "parse // like comment\n" will be parsed as
    //     [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
    func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
    	q := &StringSettings{
    		Key:        key,
    		StartToken: s2b(startToken),
    		EndToken:   s2b(endToken),
    	}
    	if q.StartToken == nil {
    		return q
    	}
    	t.quotes = append(t.quotes, q)
    
    	return q
    }
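
    // Sketch of a quoted string with a template-style injection (all keys and the tokenizer t
    // are illustrative):
    //
    //	const (
    //		TokenQuoted   TokenKey = iota + 1
    //		TokenExprOpen
    //		TokenExprClose
    //	)
    //	t.DefineTokens(TokenExprOpen, []string{"{{"})
    //	t.DefineTokens(TokenExprClose, []string{"}}"})
    //	t.DefineStringToken(TokenQuoted, `"`, `"`).
    //		SetEscapeSymbol(BackSlash).
    //		AddInjection(TokenExprOpen, TokenExprClose)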
    
    // allocToken takes a reusable Token from the pool or allocates a new one.
    func (t *Tokenizer) allocToken() *Token {
    	return t.pool.Get().(*Token)
    }
    
    // freeToken resets the token and returns it to the pool for reuse.
    func (t *Tokenizer) freeToken(token *Token) {
    	token.next = nil
    	token.prev = nil
    	token.value = nil
    	token.indent = nil
    	token.offset = 0
    	token.line = 0
    	token.id = 0
    	token.key = 0
    	token.string = nil
    	t.pool.Put(token)
    }
    
    // ParseString parses the string into tokens.
    func (t *Tokenizer) ParseString(str string) *Stream {
    	return t.ParseBytes(s2b(str))
    }
    
    // ParseBytes parses the byte slice into tokens.
    func (t *Tokenizer) ParseBytes(str []byte) *Stream {
    	p := newParser(t, str)
    	p.parse()
    	return NewStream(p)
    }
    
    // ParseStream parses data from the reader into tokens.
    func (t *Tokenizer) ParseStream(r io.Reader, bufferSize uint) *Stream {
    	p := newInfParser(t, r, bufferSize)
    	p.preload()
    	p.parse()
    	return NewInfStream(p)
    }
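
    // Sketch of streaming parsing from an io.Reader (the file name and buffer size are
    // illustrative; how the returned Stream is consumed and released is defined elsewhere):
    //
    //	f, err := os.Open("input.txt")
    //	if err != nil {
    //		// handle the error
    //	}
    //	defer f.Close()
    //	stream := t.ParseStream(f, 4096)
    //	_ = stream // walk the tokens via the Stream accessors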