package tokenizer
import (
	"io"
	"sort"
	"sync"
)

const newLine = '\n'

// TokenKey is the token type identifier.
type TokenKey int

const (
	// TokenUnknown means that this token is neither a built-in token nor a user-defined one.
	TokenUnknown TokenKey = -6
	// TokenStringFragment means that this token is only a fragment of a quoted string with injections.
	// For example, in "one {{ two }} three", the pieces "one " and " three" are TokenStringFragment.
	TokenStringFragment TokenKey = -5
	// TokenString means that this token is a quoted string.
	// For example, "one two"
	TokenString TokenKey = -4
	// TokenFloat means that this token is a float number with a point and/or an exponent.
	// For example, 1.2, 1e6, 1E-6
	TokenFloat TokenKey = -3
	// TokenInteger means that this token is an integer number.
	// For example, 3, 49983
	TokenInteger TokenKey = -2
	// TokenKeyword means that this token is a word.
	// For example, one, two, три
	TokenKeyword TokenKey = -1
	// TokenUndef means that the token does not exist.
	// When the stream is out of range of the token list, any getter or checker returns the TokenUndef token.
	TokenUndef TokenKey = 0
)

const (
	fStopOnUnknown          uint16 = 0b1
	fAllowKeywordUnderscore uint16 = 0b10
	fAllowNumberUnderscore  uint16 = 0b100
	fAllowNumberInKeyword   uint16 = 0b1000
)

// BackSlash is just the backslash byte.
const BackSlash = '\\'

var defaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}

// DefaultStringEscapes is the default mapping of escaped symbols. These escapes are commonly used everywhere.
var DefaultStringEscapes = map[byte]byte{
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'\\': '\\',
}

// tokenRef describes one token.
type tokenRef struct {
	// Token type. Not unique.
	Key TokenKey
	// Token value as is. Should be unique.
	Token []byte
}

// QuoteInjectSettings describes the open and close tokens of an injection.
type QuoteInjectSettings struct {
	// Token type which opens the injection inside a quoted string.
	StartKey TokenKey
	// Token type which closes the injection inside a quoted string.
	EndKey TokenKey
}

// StringSettings describes framed (quoted) string tokens, such as quoted strings or line comments.
type StringSettings struct {
	Key          TokenKey
	StartToken   []byte
	EndToken     []byte
	EscapeSymbol byte
	SpecSymbols  map[byte]byte
	Injects      []QuoteInjectSettings
}

// AddInjection configures an injection into the framed (quoted) string.
// An injection is a parsable fragment of the framed (quoted) string.
// It is often used for parsing placeholders or template expressions inside the framed string.
func (q *StringSettings) AddInjection(startTokenKey, endTokenKey TokenKey) *StringSettings {
	q.Injects = append(q.Injects, QuoteInjectSettings{StartKey: startTokenKey, EndKey: endTokenKey})
	return q
}

// SetEscapeSymbol sets the escape symbol for the framed (quoted) string.
// The escape symbol allows ignoring the close token of the framed string.
// It also allows using special symbols in framed strings, like \n and \t.
func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
	q.EscapeSymbol = symbol
	return q
}

// SetSpecialSymbols sets the mapping of all escapable symbols for the escape symbol, like \n, \t, \r.
func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
	q.SpecSymbols = special
	return q
}
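
// exampleConfigureQuotedString is an illustrative sketch, not part of the
// original source. It shows one way the StringSettings builder methods above
// might be combined. The token keys below are hypothetical user-defined
// values, not constants of this package.
func exampleConfigureQuotedString() *Tokenizer {
	const (
		tokenQuotedString TokenKey = iota + 1
		tokenInjectOpen
		tokenInjectClose
	)
	parser := New()
	// Hypothetical tokens that open and close an injection like {{ ... }}.
	parser.DefineTokens(tokenInjectOpen, []string{"{{"})
	parser.DefineTokens(tokenInjectClose, []string{"}}"})
	// Define a double-quoted string with backslash escapes and {{ ... }} injections.
	parser.DefineStringToken(tokenQuotedString, `"`, `"`).
		SetEscapeSymbol(BackSlash).
		SetSpecialSymbols(DefaultStringEscapes).
		AddInjection(tokenInjectOpen, tokenInjectClose)
	return parser
}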

// Tokenizer stores the tokens configuration and parsing behavior.
type Tokenizer struct {
	// bit flags
	flags uint16
	// all defined custom tokens {key: [token1, token2, ...], ...}
	tokens  map[TokenKey][]*tokenRef
	index   map[byte][]*tokenRef
	quotes  []*StringSettings
	wSpaces []byte
	pool    sync.Pool
}

// New creates a new tokenizer.
func New() *Tokenizer {
	t := Tokenizer{
		flags:   0,
		tokens:  map[TokenKey][]*tokenRef{},
		index:   map[byte][]*tokenRef{},
		quotes:  []*StringSettings{},
		wSpaces: defaultWhiteSpaces,
	}
	t.pool.New = func() interface{} {
		return new(Token)
	}
	return &t
}

// SetWhiteSpaces sets custom whitespace symbols between tokens.
// By default: {' ', '\t', '\n', '\r'}
func (t *Tokenizer) SetWhiteSpaces(ws []byte) *Tokenizer {
	t.wSpaces = ws
	return t
}

// StopOnUndefinedToken stops parsing if an unknown token is detected.
func (t *Tokenizer) StopOnUndefinedToken() *Tokenizer {
	t.flags |= fStopOnUnknown
	return t
}

// AllowKeywordUnderscore allows the underscore symbol in keywords, like `one_two` or `_three`.
func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
	t.flags |= fAllowKeywordUnderscore
	return t
}

// AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`.
// The keyword itself must not start with a number, and there must be no spaces
// between the letters and the numbers.
func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
	t.flags |= fAllowNumberInKeyword
	return t
}
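
// exampleConfigureOptions is an illustrative sketch, not part of the original
// source, showing how the option methods above can be chained on a new Tokenizer.
func exampleConfigureOptions() *Tokenizer {
	return New().
		SetWhiteSpaces([]byte{' ', '\t'}).
		StopOnUndefinedToken().
		AllowKeywordUnderscore().
		AllowNumbersInKeyword()
}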

// DefineTokens adds custom tokens.
// Here `key` is the unique identifier of `tokens`, and `tokens` is a slice of token strings.
// If the key already exists, its tokens will be rewritten.
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
	var tks []*tokenRef
	if key < 1 {
		return t
	}
	for _, token := range tokens {
		ref := tokenRef{
			Key:   key,
			Token: s2b(token),
		}
		head := ref.Token[0]
		tks = append(tks, &ref)
		if t.index[head] == nil {
			t.index[head] = []*tokenRef{}
		}
		t.index[head] = append(t.index[head], &ref)
		sort.Slice(t.index[head], func(i, j int) bool {
			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
		})
	}
	t.tokens[key] = tks
	return t
}
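
// exampleDefineOperators is an illustrative sketch, not part of the original
// source. The token keys are hypothetical. DefineTokens keeps tokens sharing
// the same first byte sorted longest-first, so ">=" can be matched before ">".
func exampleDefineOperators() *Tokenizer {
	const (
		tokenEquality TokenKey = iota + 1
		tokenComparison
	)
	return New().
		DefineTokens(tokenEquality, []string{"==", "!="}).
		DefineTokens(tokenComparison, []string{">", ">=", "<", "<="})
}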

// DefineStringToken defines a framed (quoted) string token.
// For example, a piece of data surrounded by quotes: "string in quotes" or 'string in single quotes'.
// The startToken and endToken arguments define the open and close "quotes":
//   - t.DefineStringToken(key, "`", "`") parses the string "one `two three`" as
//     [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
//   - t.DefineStringToken(key, "//", "\n") parses the string "parse // like comment\n" as
//     [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
	q := &StringSettings{
		Key:        key,
		StartToken: s2b(startToken),
		EndToken:   s2b(endToken),
	}
	if q.StartToken == nil {
		return q
	}
	t.quotes = append(t.quotes, q)
	return q
}
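
// exampleDefineStrings is an illustrative sketch, not part of the original
// source, mirroring the examples from the doc comment above. The token keys
// are hypothetical.
func exampleDefineStrings() *Tokenizer {
	const (
		tokenBacktickString TokenKey = iota + 1
		tokenLineComment
	)
	parser := New()
	// Strings framed by backticks, like `two three`.
	parser.DefineStringToken(tokenBacktickString, "`", "`")
	// Line comments framed by "//" and a newline.
	parser.DefineStringToken(tokenLineComment, "//", "\n")
	return parser
}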

// allocToken takes a token from the pool.
func (t *Tokenizer) allocToken() *Token {
	return t.pool.Get().(*Token)
}

// freeToken resets the token and returns it to the pool.
func (t *Tokenizer) freeToken(token *Token) {
	token.next = nil
	token.prev = nil
	token.value = nil
	token.indent = nil
	token.offset = 0
	token.line = 0
	token.id = 0
	token.key = 0
	token.string = nil
	t.pool.Put(token)
}

// ParseString parses the string into tokens.
func (t *Tokenizer) ParseString(str string) *Stream {
	return t.ParseBytes(s2b(str))
}

// ParseBytes parses the byte slice into tokens.
func (t *Tokenizer) ParseBytes(str []byte) *Stream {
	p := newParser(t, str)
	p.parse()
	return NewStream(p)
}
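
// exampleParseString is an illustrative sketch, not part of the original
// source. It only shows how a Stream is obtained from a string; the token key
// is hypothetical.
func exampleParseString() *Stream {
	const tokenComma TokenKey = 1
	parser := New().DefineTokens(tokenComma, []string{","})
	// ParseBytes behaves the same way but takes a []byte instead of a string.
	return parser.ParseString("one, two, three")
}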

// ParseStream parses the data from an io.Reader into tokens.
func (t *Tokenizer) ParseStream(r io.Reader, bufferSize uint) *Stream {
	p := newInfParser(t, r, bufferSize)
	p.preload()
	p.parse()
	return NewInfStream(p)
}
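
// exampleParseStream is an illustrative sketch, not part of the original
// source, showing a streaming parse from any io.Reader with a 4 KiB buffer.
// The token key is hypothetical.
func exampleParseStream(r io.Reader) *Stream {
	const tokenColon TokenKey = 1
	return New().
		DefineTokens(tokenColon, []string{":"}).
		ParseStream(r, 4096)
}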