Verified commit 249ebf31 authored by Volker Schukai

chore: work on flake

parent 310cb0d6
Showing with 4 additions and 3816 deletions
flake-profile-24-link
flake-profile-27-link
/nix/store/z732hca3vl0w2mil01ljp1gaya4dm67x-bob-env
/nix/store/19hxmpsvwilgpw4nfg974zvgxry3gngr-bob-env
/nix/store/67qcqdvw00dwhx0s484z1a2c1s7rq910-bob-0.5.3-env
language: go
go:
- 1.3
- 1.4
install:
- go get github.com/andybalholm/cascadia
script:
- go test -v
notifications:
email: false
Copyright (c) 2011 Andy Balholm. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# cascadia
[![](https://travis-ci.org/andybalholm/cascadia.svg)](https://travis-ci.org/andybalholm/cascadia)
The Cascadia package implements CSS selectors for use with the parse trees produced by the html package.
To test CSS selectors without writing Go code, check out [cascadia](https://github.com/suntong/cascadia), a command-line tool that is a thin wrapper around this package.
[Refer to godoc here](https://godoc.org/github.com/andybalholm/cascadia).
## Example
The following is an example of how you can use Cascadia.
```go
package main
import (
"fmt"
"log"
"strings"
"github.com/andybalholm/cascadia"
"golang.org/x/net/html"
)
var pricingHtml string = `
<div class="card mb-4 box-shadow">
<div class="card-header">
<h4 class="my-0 font-weight-normal">Free</h4>
</div>
<div class="card-body">
<h1 class="card-title pricing-card-title">$0/mo</h1>
<ul class="list-unstyled mt-3 mb-4">
<li>10 users included</li>
<li>2 GB of storage</li>
<li><a href="https://example.com">See more</a></li>
</ul>
</div>
</div>
<div class="card mb-4 box-shadow">
<div class="card-header">
<h4 class="my-0 font-weight-normal">Pro</h4>
</div>
<div class="card-body">
<h1 class="card-title pricing-card-title">$15/mo</h1>
<ul class="list-unstyled mt-3 mb-4">
<li>20 users included</li>
<li>10 GB of storage</li>
<li><a href="https://example.com">See more</a></li>
</ul>
</div>
</div>
<div class="card mb-4 box-shadow">
<div class="card-header">
<h4 class="my-0 font-weight-normal">Enterprise</h4>
</div>
<div class="card-body">
<h1 class="card-title pricing-card-title">$29/mo</h1>
<ul class="list-unstyled mt-3 mb-4">
<li>30 users included</li>
<li>15 GB of storage</li>
<li><a>See more</a></li>
</ul>
</div>
</div>
`
func Query(n *html.Node, query string) *html.Node {
sel, err := cascadia.Parse(query)
if err != nil {
return &html.Node{}
}
return cascadia.Query(n, sel)
}
func QueryAll(n *html.Node, query string) []*html.Node {
sel, err := cascadia.Parse(query)
if err != nil {
return []*html.Node{}
}
return cascadia.QueryAll(n, sel)
}
func AttrOr(n *html.Node, attrName, or string) string {
for _, a := range n.Attr {
if a.Key == attrName {
return a.Val
}
}
return or
}
func main() {
doc, err := html.Parse(strings.NewReader(pricingHtml))
if err != nil {
log.Fatal(err)
}
fmt.Printf("List of pricing plans:\n\n")
for i, p := range QueryAll(doc, "div.card.mb-4.box-shadow") {
planName := Query(p, "h4").FirstChild.Data
price := Query(p, ".pricing-card-title").FirstChild.Data
usersIncluded := Query(p, "li:first-child").FirstChild.Data
storage := Query(p, "li:nth-child(2)").FirstChild.Data
detailsUrl := AttrOr(Query(p, "li:last-child a"), "href", "(No link available)")
fmt.Printf(
"Plan #%d\nName: %s\nPrice: %s\nUsers: %s\nStorage: %s\nDetails: %s\n\n",
i+1,
planName,
price,
usersIncluded,
storage,
detailsUrl,
)
}
}
```
The output is:
```
List of pricing plans:
Plan #1
Name: Free
Price: $0/mo
Users: 10 users included
Storage: 2 GB of storage
Details: https://example.com
Plan #2
Name: Pro
Price: $15/mo
Users: 20 users included
Storage: 10 GB of storage
Details: https://example.com
Plan #3
Name: Enterprise
Price: $29/mo
Users: 30 users included
Storage: 15 GB of storage
Details: (No link available)
```
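If the selector is fixed at compile time, the package's `Compile` and `MustCompile` helpers (see the source below) offer a shorter path. A minimal sketch, assuming the `doc` parsed in the example above:
```go
// MustCompile panics on an invalid selector, which is acceptable
// for selectors that are fixed at compile time.
sel := cascadia.MustCompile("div.card h4")
for _, h := range sel.MatchAll(doc) {
	fmt.Println(h.FirstChild.Data) // Free, Pro, Enterprise
}
```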
package cascadia
import (
"bytes"
"fmt"
"regexp"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// This file implements the pseudo-class selectors,
// which share the implementation of PseudoElement() and Specificity()
type abstractPseudoClass struct{}
func (s abstractPseudoClass) Specificity() Specificity {
return Specificity{0, 1, 0}
}
func (c abstractPseudoClass) PseudoElement() string {
return ""
}
type relativePseudoClassSelector struct {
name string // one of "not", "has", "haschild"
match SelectorGroup
}
func (s relativePseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
switch s.name {
case "not":
// matches elements that do not match a.
return !s.match.Match(n)
case "has":
// matches elements with any descendant that matches a.
return hasDescendantMatch(n, s.match)
case "haschild":
// matches elements with a child that matches a.
return hasChildMatch(n, s.match)
default:
panic(fmt.Sprintf("unsupported relative pseudo class selector : %s", s.name))
}
}
// hasChildMatch returns whether n has any child that matches a.
func hasChildMatch(n *html.Node, a Matcher) bool {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if a.Match(c) {
return true
}
}
return false
}
// hasDescendantMatch performs a depth-first search of n's descendants,
// testing whether any of them match a. It returns true as soon as a match is
// found, or false if no match is found.
func hasDescendantMatch(n *html.Node, a Matcher) bool {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if a.Match(c) || (c.Type == html.ElementNode && hasDescendantMatch(c, a)) {
return true
}
}
return false
}
// Specificity returns the specificity of the most specific selectors
// in the pseudo-class arguments.
// See https://www.w3.org/TR/selectors/#specificity-rules
func (s relativePseudoClassSelector) Specificity() Specificity {
var max Specificity
for _, sel := range s.match {
newSpe := sel.Specificity()
if max.Less(newSpe) {
max = newSpe
}
}
return max
}
func (c relativePseudoClassSelector) PseudoElement() string {
return ""
}
type containsPseudoClassSelector struct {
abstractPseudoClass
value string
own bool
}
func (s containsPseudoClassSelector) Match(n *html.Node) bool {
var text string
if s.own {
// matches nodes that directly contain the given text
text = strings.ToLower(nodeOwnText(n))
} else {
// matches nodes that contain the given text.
text = strings.ToLower(nodeText(n))
}
return strings.Contains(text, s.value)
}
type regexpPseudoClassSelector struct {
abstractPseudoClass
regexp *regexp.Regexp
own bool
}
func (s regexpPseudoClassSelector) Match(n *html.Node) bool {
var text string
if s.own {
// matches nodes whose text directly matches the specified regular expression
text = nodeOwnText(n)
} else {
// matches nodes whose text matches the specified regular expression
text = nodeText(n)
}
return s.regexp.MatchString(text)
}
// writeNodeText writes the text contained in n and its descendants to b.
func writeNodeText(n *html.Node, b *bytes.Buffer) {
switch n.Type {
case html.TextNode:
b.WriteString(n.Data)
case html.ElementNode:
for c := n.FirstChild; c != nil; c = c.NextSibling {
writeNodeText(c, b)
}
}
}
// nodeText returns the text contained in n and its descendants.
func nodeText(n *html.Node) string {
var b bytes.Buffer
writeNodeText(n, &b)
return b.String()
}
// nodeOwnText returns the contents of the text nodes that are direct
// children of n.
func nodeOwnText(n *html.Node) string {
var b bytes.Buffer
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.TextNode {
b.WriteString(c.Data)
}
}
return b.String()
}
type nthPseudoClassSelector struct {
abstractPseudoClass
a, b int
last, ofType bool
}
func (s nthPseudoClassSelector) Match(n *html.Node) bool {
if s.a == 0 {
if s.last {
return simpleNthLastChildMatch(s.b, s.ofType, n)
} else {
return simpleNthChildMatch(s.b, s.ofType, n)
}
}
return nthChildMatch(s.a, s.b, s.last, s.ofType, n)
}
// nthChildMatch implements :nth-child(an+b).
// If last is true, implements :nth-last-child instead.
// If ofType is true, implements :nth-of-type instead.
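// For example, a=2, b=1 implements :nth-child(2n+1) (i.e. :nth-child(odd)),
// matching the 1st, 3rd, 5th, ... element among its siblings.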
func nthChildMatch(a, b int, last, ofType bool, n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
parent := n.Parent
if parent == nil {
return false
}
i := -1
count := 0
for c := parent.FirstChild; c != nil; c = c.NextSibling {
if (c.Type != html.ElementNode) || (ofType && c.Data != n.Data) {
continue
}
count++
if c == n {
i = count
if !last {
break
}
}
}
if i == -1 {
// This shouldn't happen, since n should always be one of its parent's children.
return false
}
if last {
i = count - i + 1
}
i -= b
if a == 0 {
return i == 0
}
return i%a == 0 && i/a >= 0
}
// simpleNthChildMatch implements :nth-child(b).
// If ofType is true, implements :nth-of-type instead.
func simpleNthChildMatch(b int, ofType bool, n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
parent := n.Parent
if parent == nil {
return false
}
count := 0
for c := parent.FirstChild; c != nil; c = c.NextSibling {
if c.Type != html.ElementNode || (ofType && c.Data != n.Data) {
continue
}
count++
if c == n {
return count == b
}
if count >= b {
return false
}
}
return false
}
// simpleNthLastChildMatch implements :nth-last-child(b).
// If ofType is true, implements :nth-last-of-type instead.
func simpleNthLastChildMatch(b int, ofType bool, n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
parent := n.Parent
if parent == nil {
return false
}
count := 0
for c := parent.LastChild; c != nil; c = c.PrevSibling {
if c.Type != html.ElementNode || (ofType && c.Data != n.Data) {
continue
}
count++
if c == n {
return count == b
}
if count >= b {
return false
}
}
return false
}
type onlyChildPseudoClassSelector struct {
abstractPseudoClass
ofType bool
}
// Match implements :only-child.
// If `ofType` is true, it implements :only-of-type instead.
func (s onlyChildPseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
parent := n.Parent
if parent == nil {
return false
}
count := 0
for c := parent.FirstChild; c != nil; c = c.NextSibling {
if (c.Type != html.ElementNode) || (s.ofType && c.Data != n.Data) {
continue
}
count++
if count > 1 {
return false
}
}
return count == 1
}
type inputPseudoClassSelector struct {
abstractPseudoClass
}
// Matches input, select, textarea and button elements.
func (s inputPseudoClassSelector) Match(n *html.Node) bool {
return n.Type == html.ElementNode && (n.Data == "input" || n.Data == "select" || n.Data == "textarea" || n.Data == "button")
}
type emptyElementPseudoClassSelector struct {
abstractPseudoClass
}
// Matches empty elements.
func (s emptyElementPseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
switch c.Type {
case html.ElementNode:
return false
case html.TextNode:
if strings.TrimSpace(nodeText(c)) == "" {
continue
} else {
return false
}
}
}
return true
}
type rootPseudoClassSelector struct {
abstractPseudoClass
}
// Match implements :root
func (s rootPseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
if n.Parent == nil {
return false
}
return n.Parent.Type == html.DocumentNode
}
func hasAttr(n *html.Node, attr string) bool {
return matchAttribute(n, attr, func(string) bool { return true })
}
type linkPseudoClassSelector struct {
abstractPseudoClass
}
// Match implements :link
func (s linkPseudoClassSelector) Match(n *html.Node) bool {
return (n.DataAtom == atom.A || n.DataAtom == atom.Area || n.DataAtom == atom.Link) && hasAttr(n, "href")
}
type langPseudoClassSelector struct {
abstractPseudoClass
lang string
}
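// Match implements :lang(). For example, :lang(en) matches both
// lang="en" and lang="en-US", whether the attribute is set on the
// element itself or inherited from an ancestor.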
func (s langPseudoClassSelector) Match(n *html.Node) bool {
own := matchAttribute(n, "lang", func(val string) bool {
return val == s.lang || strings.HasPrefix(val, s.lang+"-")
})
if n.Parent == nil {
return own
}
return own || s.Match(n.Parent)
}
type enabledPseudoClassSelector struct {
abstractPseudoClass
}
func (s enabledPseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
switch n.DataAtom {
case atom.A, atom.Area, atom.Link:
return hasAttr(n, "href")
case atom.Optgroup, atom.Menuitem, atom.Fieldset:
return !hasAttr(n, "disabled")
case atom.Button, atom.Input, atom.Select, atom.Textarea, atom.Option:
return !hasAttr(n, "disabled") && !inDisabledFieldset(n)
}
return false
}
type disabledPseudoClassSelector struct {
abstractPseudoClass
}
func (s disabledPseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
switch n.DataAtom {
case atom.Optgroup, atom.Menuitem, atom.Fieldset:
return hasAttr(n, "disabled")
case atom.Button, atom.Input, atom.Select, atom.Textarea, atom.Option:
return hasAttr(n, "disabled") || inDisabledFieldset(n)
}
return false
}
func hasLegendInPreviousSiblings(n *html.Node) bool {
for s := n.PrevSibling; s != nil; s = s.PrevSibling {
if s.DataAtom == atom.Legend {
return true
}
}
return false
}
func inDisabledFieldset(n *html.Node) bool {
if n.Parent == nil {
return false
}
if n.Parent.DataAtom == atom.Fieldset && hasAttr(n.Parent, "disabled") &&
(n.DataAtom != atom.Legend || hasLegendInPreviousSiblings(n)) {
return true
}
return inDisabledFieldset(n.Parent)
}
type checkedPseudoClassSelector struct {
abstractPseudoClass
}
func (s checkedPseudoClassSelector) Match(n *html.Node) bool {
if n.Type != html.ElementNode {
return false
}
switch n.DataAtom {
case atom.Input, atom.Menuitem:
return hasAttr(n, "checked") && matchAttribute(n, "type", func(val string) bool {
t := toLowerASCII(val)
return t == "checkbox" || t == "radio"
})
case atom.Option:
return hasAttr(n, "selected")
}
return false
}
package cascadia
import (
"fmt"
"regexp"
"strings"
"golang.org/x/net/html"
)
// Matcher is the interface for basic selector functionality.
// Match returns whether a selector matches n.
type Matcher interface {
Match(n *html.Node) bool
}
// Sel is the interface for all the functionality provided by selectors.
type Sel interface {
Matcher
Specificity() Specificity
// Returns a CSS input compiling to this selector.
String() string
// Returns a pseudo-element, or an empty string.
PseudoElement() string
}
// Parse parses a selector. Use `ParseWithPseudoElement`
// if you need support for pseudo-elements.
func Parse(sel string) (Sel, error) {
p := &parser{s: sel}
compiled, err := p.parseSelector()
if err != nil {
return nil, err
}
if p.i < len(sel) {
return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i)
}
return compiled, nil
}
// ParseWithPseudoElement parses a single selector,
// with support for pseudo-elements.
func ParseWithPseudoElement(sel string) (Sel, error) {
p := &parser{s: sel, acceptPseudoElements: true}
compiled, err := p.parseSelector()
if err != nil {
return nil, err
}
if p.i < len(sel) {
return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i)
}
return compiled, nil
}
// ParseGroup parses a selector, or a group of selectors separated by commas.
// Use `ParseGroupWithPseudoElements`
// if you need support for pseudo-elements.
func ParseGroup(sel string) (SelectorGroup, error) {
p := &parser{s: sel}
compiled, err := p.parseSelectorGroup()
if err != nil {
return nil, err
}
if p.i < len(sel) {
return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i)
}
return compiled, nil
}
// ParseGroupWithPseudoElements parses a selector, or a group of selectors separated by commas.
// It supports pseudo-elements.
func ParseGroupWithPseudoElements(sel string) (SelectorGroup, error) {
p := &parser{s: sel, acceptPseudoElements: true}
compiled, err := p.parseSelectorGroup()
if err != nil {
return nil, err
}
if p.i < len(sel) {
return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i)
}
return compiled, nil
}
// A Selector is a function which tells whether a node matches or not.
//
// This type is maintained for compatibility; I recommend using the newer and
// more idiomatic interfaces Sel and Matcher.
type Selector func(*html.Node) bool
// Compile parses a selector and returns, if successful, a Selector object
// that can be used to match against html.Node objects.
func Compile(sel string) (Selector, error) {
compiled, err := ParseGroup(sel)
if err != nil {
return nil, err
}
return Selector(compiled.Match), nil
}
// MustCompile is like Compile, but panics instead of returning an error.
func MustCompile(sel string) Selector {
compiled, err := Compile(sel)
if err != nil {
panic(err)
}
return compiled
}
// MatchAll returns a slice of the nodes that match the selector,
// from n and its children.
func (s Selector) MatchAll(n *html.Node) []*html.Node {
return s.matchAllInto(n, nil)
}
func (s Selector) matchAllInto(n *html.Node, storage []*html.Node) []*html.Node {
if s(n) {
storage = append(storage, n)
}
for child := n.FirstChild; child != nil; child = child.NextSibling {
storage = s.matchAllInto(child, storage)
}
return storage
}
func queryInto(n *html.Node, m Matcher, storage []*html.Node) []*html.Node {
for child := n.FirstChild; child != nil; child = child.NextSibling {
if m.Match(child) {
storage = append(storage, child)
}
storage = queryInto(child, m, storage)
}
return storage
}
// QueryAll returns a slice of all the nodes that match m, from the descendants
// of n.
func QueryAll(n *html.Node, m Matcher) []*html.Node {
return queryInto(n, m, nil)
}
// Match returns true if the node matches the selector.
func (s Selector) Match(n *html.Node) bool {
return s(n)
}
// MatchFirst returns the first node that matches s, from n and its children.
func (s Selector) MatchFirst(n *html.Node) *html.Node {
if s.Match(n) {
return n
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
m := s.MatchFirst(c)
if m != nil {
return m
}
}
return nil
}
// Query returns the first node that matches m, from the descendants of n.
// If none matches, it returns nil.
func Query(n *html.Node, m Matcher) *html.Node {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if m.Match(c) {
return c
}
if matched := Query(c, m); matched != nil {
return matched
}
}
return nil
}
// Filter returns the nodes in nodes that match the selector.
func (s Selector) Filter(nodes []*html.Node) (result []*html.Node) {
for _, n := range nodes {
if s(n) {
result = append(result, n)
}
}
return result
}
// Filter returns the nodes that match m.
func Filter(nodes []*html.Node, m Matcher) (result []*html.Node) {
for _, n := range nodes {
if m.Match(n) {
result = append(result, n)
}
}
return result
}
type tagSelector struct {
tag string
}
// Matches elements with a given tag name.
func (t tagSelector) Match(n *html.Node) bool {
return n.Type == html.ElementNode && n.Data == t.tag
}
func (c tagSelector) Specificity() Specificity {
return Specificity{0, 0, 1}
}
func (c tagSelector) PseudoElement() string {
return ""
}
type classSelector struct {
class string
}
// Matches elements by class attribute.
func (t classSelector) Match(n *html.Node) bool {
return matchAttribute(n, "class", func(s string) bool {
return matchInclude(t.class, s, false)
})
}
func (c classSelector) Specificity() Specificity {
return Specificity{0, 1, 0}
}
func (c classSelector) PseudoElement() string {
return ""
}
type idSelector struct {
id string
}
// Matches elements by id attribute.
func (t idSelector) Match(n *html.Node) bool {
return matchAttribute(n, "id", func(s string) bool {
return s == t.id
})
}
func (c idSelector) Specificity() Specificity {
return Specificity{1, 0, 0}
}
func (c idSelector) PseudoElement() string {
return ""
}
type attrSelector struct {
key, val, operation string
regexp *regexp.Regexp
insensitive bool
}
// Matches elements by attribute value.
func (t attrSelector) Match(n *html.Node) bool {
switch t.operation {
case "":
return matchAttribute(n, t.key, func(string) bool { return true })
case "=":
return matchAttribute(n, t.key, func(s string) bool { return matchInsensitiveValue(s, t.val, t.insensitive) })
case "!=":
return attributeNotEqualMatch(t.key, t.val, n, t.insensitive)
case "~=":
// matches elements where the attribute named key is a whitespace-separated list that includes val.
return matchAttribute(n, t.key, func(s string) bool { return matchInclude(t.val, s, t.insensitive) })
case "|=":
return attributeDashMatch(t.key, t.val, n, t.insensitive)
case "^=":
return attributePrefixMatch(t.key, t.val, n, t.insensitive)
case "$=":
return attributeSuffixMatch(t.key, t.val, n, t.insensitive)
case "*=":
return attributeSubstringMatch(t.key, t.val, n, t.insensitive)
case "#=":
return attributeRegexMatch(t.key, t.regexp, n)
default:
panic(fmt.Sprintf("unsuported operation : %s", t.operation))
}
}
// matchInsensitiveValue compares two attribute values, optionally ignoring case.
// userAttr is the value the user wants to match; realAttr is the attribute
// value found in the parsed document.
func matchInsensitiveValue(userAttr string, realAttr string, ignoreCase bool) bool {
if ignoreCase {
return strings.EqualFold(userAttr, realAttr)
}
return userAttr == realAttr
}
// matches elements where the attribute named key satisfies the function f.
func matchAttribute(n *html.Node, key string, f func(string) bool) bool {
if n.Type != html.ElementNode {
return false
}
for _, a := range n.Attr {
if a.Key == key && f(a.Val) {
return true
}
}
return false
}
// attributeNotEqualMatch matches elements where
// the attribute named key does not have the value val.
func attributeNotEqualMatch(key, val string, n *html.Node, ignoreCase bool) bool {
if n.Type != html.ElementNode {
return false
}
for _, a := range n.Attr {
if a.Key == key && matchInsensitiveValue(a.Val, val, ignoreCase) {
return false
}
}
return true
}
// returns true if s is a whitespace-separated list that includes val.
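// For example, matchInclude("foo", "bar foo baz", false) returns true.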
func matchInclude(val string, s string, ignoreCase bool) bool {
for s != "" {
i := strings.IndexAny(s, " \t\r\n\f")
if i == -1 {
return matchInsensitiveValue(s, val, ignoreCase)
}
if matchInsensitiveValue(s[:i], val, ignoreCase) {
return true
}
s = s[i+1:]
}
return false
}
// matches elements where the attribute named key equals val or starts with val plus a hyphen.
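// For example, [lang|=en] matches lang="en" and lang="en-US".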
func attributeDashMatch(key, val string, n *html.Node, ignoreCase bool) bool {
return matchAttribute(n, key,
func(s string) bool {
if matchInsensitiveValue(s, val, ignoreCase) {
return true
}
if len(s) <= len(val) {
return false
}
if matchInsensitiveValue(s[:len(val)], val, ignoreCase) && s[len(val)] == '-' {
return true
}
return false
})
}
// attributePrefixMatch matches elements where
// the attribute named key starts with val.
func attributePrefixMatch(key, val string, n *html.Node, ignoreCase bool) bool {
return matchAttribute(n, key,
func(s string) bool {
if strings.TrimSpace(s) == "" {
return false
}
if ignoreCase {
return strings.HasPrefix(strings.ToLower(s), strings.ToLower(val))
}
return strings.HasPrefix(s, val)
})
}
// attributeSuffixMatch matches elements where
// the attribute named key ends with val.
func attributeSuffixMatch(key, val string, n *html.Node, ignoreCase bool) bool {
return matchAttribute(n, key,
func(s string) bool {
if strings.TrimSpace(s) == "" {
return false
}
if ignoreCase {
return strings.HasSuffix(strings.ToLower(s), strings.ToLower(val))
}
return strings.HasSuffix(s, val)
})
}
// attributeSubstringMatch matches nodes where
// the attribute named key contains val.
func attributeSubstringMatch(key, val string, n *html.Node, ignoreCase bool) bool {
return matchAttribute(n, key,
func(s string) bool {
if strings.TrimSpace(s) == "" {
return false
}
if ignoreCase {
return strings.Contains(strings.ToLower(s), strings.ToLower(val))
}
return strings.Contains(s, val)
})
}
// attributeRegexMatch matches nodes where
// the attribute named key matches the regular expression rx
func attributeRegexMatch(key string, rx *regexp.Regexp, n *html.Node) bool {
return matchAttribute(n, key,
func(s string) bool {
return rx.MatchString(s)
})
}
func (c attrSelector) Specificity() Specificity {
return Specificity{0, 1, 0}
}
func (c attrSelector) PseudoElement() string {
return ""
}
// See pseudo_classes.go for the pseudo-class selectors.
// In a static context, some selectors can never match anything.
type neverMatchSelector struct {
value string
}
func (s neverMatchSelector) Match(n *html.Node) bool {
return false
}
func (s neverMatchSelector) Specificity() Specificity {
return Specificity{0, 0, 0}
}
func (c neverMatchSelector) PseudoElement() string {
return ""
}
type compoundSelector struct {
selectors []Sel
pseudoElement string
}
// Matches elements if each sub-selector matches.
func (t compoundSelector) Match(n *html.Node) bool {
if len(t.selectors) == 0 {
return n.Type == html.ElementNode
}
for _, sel := range t.selectors {
if !sel.Match(n) {
return false
}
}
return true
}
func (s compoundSelector) Specificity() Specificity {
var out Specificity
for _, sel := range s.selectors {
out = out.Add(sel.Specificity())
}
if s.pseudoElement != "" {
// https://drafts.csswg.org/selectors-3/#specificity
out = out.Add(Specificity{0, 0, 1})
}
return out
}
func (c compoundSelector) PseudoElement() string {
return c.pseudoElement
}
type combinedSelector struct {
first Sel
combinator byte
second Sel
}
func (t combinedSelector) Match(n *html.Node) bool {
if t.first == nil {
return false // maybe we should panic
}
switch t.combinator {
case 0:
return t.first.Match(n)
case ' ':
return descendantMatch(t.first, t.second, n)
case '>':
return childMatch(t.first, t.second, n)
case '+':
return siblingMatch(t.first, t.second, true, n)
case '~':
return siblingMatch(t.first, t.second, false, n)
default:
panic("unknown combinator")
}
}
// matches an element if it matches d and has an ancestor that matches a.
func descendantMatch(a, d Matcher, n *html.Node) bool {
if !d.Match(n) {
return false
}
for p := n.Parent; p != nil; p = p.Parent {
if a.Match(p) {
return true
}
}
return false
}
// matches an element if it matches d and its parent matches a.
func childMatch(a, d Matcher, n *html.Node) bool {
return d.Match(n) && n.Parent != nil && a.Match(n.Parent)
}
// matches an element if it matches s2 and is preceded by an element that matches s1.
// If adjacent is true, the sibling must be immediately before the element.
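// For example, with s1 matching "h1" and s2 matching "p", adjacent=true
// implements "h1 + p" and adjacent=false implements "h1 ~ p".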
func siblingMatch(s1, s2 Matcher, adjacent bool, n *html.Node) bool {
if !s2.Match(n) {
return false
}
if adjacent {
for n = n.PrevSibling; n != nil; n = n.PrevSibling {
if n.Type == html.TextNode || n.Type == html.CommentNode {
continue
}
return s1.Match(n)
}
return false
}
// Walk backwards looking for an element that matches s1.
for c := n.PrevSibling; c != nil; c = c.PrevSibling {
if s1.Match(c) {
return true
}
}
return false
}
func (s combinedSelector) Specificity() Specificity {
spec := s.first.Specificity()
if s.second != nil {
spec = spec.Add(s.second.Specificity())
}
return spec
}
// On a combinedSelector, a pseudo-element only makes sense on the last
// selector, although the others still contribute to specificity.
func (c combinedSelector) PseudoElement() string {
if c.second == nil {
return ""
}
return c.second.PseudoElement()
}
// A SelectorGroup is a list of selectors, which matches if any of the
// individual selectors matches.
type SelectorGroup []Sel
// Match returns true if the node matches one of the single selectors.
func (s SelectorGroup) Match(n *html.Node) bool {
for _, sel := range s {
if sel.Match(n) {
return true
}
}
return false
}
package cascadia
import (
"fmt"
"strconv"
"strings"
)
// implements the reverse operation Sel -> string
var specialCharReplacer *strings.Replacer
func init() {
var pairs []string
for _, s := range ",!\"#$%&'()*+ -./:;<=>?@[\\]^`{|}~" {
pairs = append(pairs, string(s), "\\"+string(s))
}
specialCharReplacer = strings.NewReplacer(pairs...)
}
// escape special CSS characters
func escape(s string) string { return specialCharReplacer.Replace(s) }
func (c tagSelector) String() string {
return c.tag
}
func (c idSelector) String() string {
return "#" + escape(c.id)
}
func (c classSelector) String() string {
return "." + escape(c.class)
}
func (c attrSelector) String() string {
val := c.val
if c.operation == "#=" {
val = c.regexp.String()
} else if c.operation != "" {
val = fmt.Sprintf(`"%s"`, val)
}
ignoreCase := ""
if c.insensitive {
ignoreCase = " i"
}
return fmt.Sprintf(`[%s%s%s%s]`, c.key, c.operation, val, ignoreCase)
}
func (c relativePseudoClassSelector) String() string {
return fmt.Sprintf(":%s(%s)", c.name, c.match.String())
}
func (c containsPseudoClassSelector) String() string {
s := "contains"
if c.own {
s += "Own"
}
return fmt.Sprintf(`:%s("%s")`, s, c.value)
}
func (c regexpPseudoClassSelector) String() string {
s := "matches"
if c.own {
s += "Own"
}
return fmt.Sprintf(":%s(%s)", s, c.regexp.String())
}
func (c nthPseudoClassSelector) String() string {
if c.a == 0 && c.b == 1 { // special cases
s := ":first-"
if c.last {
s = ":last-"
}
if c.ofType {
s += "of-type"
} else {
s += "child"
}
return s
}
var name string
switch [2]bool{c.last, c.ofType} {
case [2]bool{true, true}:
name = "nth-last-of-type"
case [2]bool{true, false}:
name = "nth-last-child"
case [2]bool{false, true}:
name = "nth-of-type"
case [2]bool{false, false}:
name = "nth-child"
}
s := fmt.Sprintf("+%d", c.b)
if c.b < 0 { // avoid +-8 invalid syntax
s = strconv.Itoa(c.b)
}
return fmt.Sprintf(":%s(%dn%s)", name, c.a, s)
}
func (c onlyChildPseudoClassSelector) String() string {
if c.ofType {
return ":only-of-type"
}
return ":only-child"
}
func (c inputPseudoClassSelector) String() string {
return ":input"
}
func (c emptyElementPseudoClassSelector) String() string {
return ":empty"
}
func (c rootPseudoClassSelector) String() string {
return ":root"
}
func (c linkPseudoClassSelector) String() string {
return ":link"
}
func (c langPseudoClassSelector) String() string {
return fmt.Sprintf(":lang(%s)", c.lang)
}
func (c neverMatchSelector) String() string {
return c.value
}
func (c enabledPseudoClassSelector) String() string {
return ":enabled"
}
func (c disabledPseudoClassSelector) String() string {
return ":disabled"
}
func (c checkedPseudoClassSelector) String() string {
return ":checked"
}
func (c compoundSelector) String() string {
if len(c.selectors) == 0 && c.pseudoElement == "" {
return "*"
}
chunks := make([]string, len(c.selectors))
for i, sel := range c.selectors {
chunks[i] = sel.String()
}
s := strings.Join(chunks, "")
if c.pseudoElement != "" {
s += "::" + c.pseudoElement
}
return s
}
func (c combinedSelector) String() string {
start := c.first.String()
if c.second != nil {
start += fmt.Sprintf(" %s %s", string(c.combinator), c.second.String())
}
return start
}
func (c SelectorGroup) String() string {
ck := make([]string, len(c))
for i, s := range c {
ck[i] = s.String()
}
return strings.Join(ck, ", ")
}
package cascadia
// Specificity is the CSS specificity as defined in
// https://www.w3.org/TR/selectors/#specificity-rules
// with the convention Specificity = [A,B,C].
type Specificity [3]int
// Less returns true if s < other (strictly), false otherwise.
func (s Specificity) Less(other Specificity) bool {
for i := range s {
if s[i] < other[i] {
return true
}
if s[i] > other[i] {
return false
}
}
return false
}
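// Add returns the element-wise sum of the two specificities. For example,
// Specificity{0, 1, 0}.Add(Specificity{0, 0, 1}) == Specificity{0, 1, 1},
// and an id selector still outweighs any number of classes, since
// Specificity{0, 9, 0}.Less(Specificity{1, 0, 0}) is true.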
func (s Specificity) Add(other Specificity) Specificity {
for i, sp := range other {
s[i] += sp
}
return s
}
coverage:
status:
project:
default:
informational: true
patch:
default:
informational: true
.idea
MIT License
Copyright (c) 2021 Ivan Shalganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package tokenizer
import (
"reflect"
"runtime"
"unsafe"
)
// b2s converts byte slice to a string without memory allocation.
// See https://groups.google.com/forum/#!msg/Golang-Nuts/ENgbUzYvCuU/90yGx7GUAgAJ .
func b2s(b []byte) string {
return *(*string)(unsafe.Pointer(&b))
}
// s2b converts string to a byte slice without memory allocation.
//
// Note: it may break if the string and/or slice header changes
// in future Go versions.
func s2b(s string) (b []byte) {
sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
bh.Data = sh.Data
bh.Cap = sh.Len
bh.Len = sh.Len
runtime.KeepAlive(&s)
return b
}
func isNumberByte(b byte) bool {
return '0' <= b && b <= '9'
}
func bytesStarts(prefix []byte, b []byte) bool {
if len(prefix) > len(b) {
return false
}
return b2s(prefix) == b2s(b[0:len(prefix)])
}
func bytesEnds(suffix []byte, b []byte) bool {
if len(suffix) > len(b) {
return false
}
return b2s(suffix) == b2s(b[len(b)-len(suffix):])
}
package tokenizer
import (
"io"
"unicode"
"unicode/utf8"
)
// DefaultChunkSize is the default chunk size for the reader.
const DefaultChunkSize = 4096
// parsing is the main parser.
type parsing struct {
t *Tokenizer
curr byte
pos int
line int
str []byte
err error
reader io.Reader
token *Token
head *Token
ptr *Token
tail []byte
stopKeys []*tokenRef
n int // tokens id generator
chunkSize int // chunks size for infinite buffer
offset int
resume bool
parsed int
}
// newParser creates a new parser for a string.
func newParser(t *Tokenizer, str []byte) *parsing {
tok := t.allocToken()
tok.line = 1
return &parsing{
t: t,
str: str,
line: 1,
token: tok,
}
}
func newInfParser(t *Tokenizer, reader io.Reader, bufferSize uint) *parsing {
if bufferSize == 0 {
bufferSize = DefaultChunkSize
}
buffer := make([]byte, bufferSize)
tok := t.allocToken()
tok.line = 1
return &parsing{
t: t,
str: buffer,
reader: reader,
line: 1,
chunkSize: int(bufferSize),
token: tok,
}
}
func (p *parsing) prev() {
if p.pos > 0 {
p.pos--
p.curr = p.str[p.pos]
}
}
func (p *parsing) ensureBytes(n int) bool {
if p.pos+n >= len(p.str) {
if p.reader != nil {
p.loadChunk()
if p.pos+n < len(p.str) {
return true
}
}
return false
}
return true
}
func (p *parsing) next() {
p.pos++
if p.pos >= len(p.str) {
if p.reader == nil || p.loadChunk() == 0 {
p.curr = 0
return
}
}
p.curr = p.str[p.pos]
}
func (p *parsing) nextByte() byte {
if p.ensureBytes(1) {
return p.str[p.pos+1]
}
return 0
}
func (p *parsing) slice(from, to int) []byte {
if to < len(p.str) {
return p.str[from:to]
}
return p.str[from:]
}
func (p *parsing) preload() {
n, err := p.reader.Read(p.str)
if n < p.chunkSize {
p.str = p.str[:n]
p.reader = nil
}
if err != nil {
p.reader = nil
if err != io.EOF {
p.err = err
}
}
}
func (p *parsing) loadChunk() int {
// chunk size = new chunk size + size of tail of prev chunk
chunk := make([]byte, len(p.str)+p.chunkSize)
copy(chunk, p.str)
n, err := p.reader.Read(chunk[len(p.str):])
if n < p.chunkSize {
p.str = chunk[:len(p.str)+n]
p.reader = nil
} else {
p.str = chunk
}
if err != nil {
p.reader = nil
if err != io.EOF {
p.err = err
}
}
p.resume = false
return n
}
// checkPoint resets internal values for the next chunk of data.
func (p *parsing) checkPoint() bool {
if p.pos > 0 {
p.parsed += p.pos
p.str = p.str[p.pos:]
p.offset += p.pos
p.pos = 0
if len(p.str) == 0 {
p.curr = 0
}
}
return p.resume
}
// parse parses bytes (p.str) into tokens and appends them to the end of the token stream.
func (p *parsing) parse() {
if len(p.str) == 0 {
if p.reader == nil || p.loadChunk() == 0 { // if it's not infinite stream or this is the end of stream
return
}
}
p.curr = p.str[p.pos]
p.resume = true
for p.checkPoint() {
if p.stopKeys != nil {
for _, t := range p.stopKeys {
if p.ptr.key == t.Key {
return
}
}
}
p.parseWhitespace()
if p.curr == 0 {
break
}
if p.parseToken() {
continue
}
if p.curr == 0 {
break
}
if p.parseKeyword() {
continue
}
if p.curr == 0 {
break
}
if p.parseNumber() {
continue
}
if p.curr == 0 {
break
}
if p.parseQuote() {
continue
}
if p.curr == 0 {
break
}
if p.t.flags&fStopOnUnknown != 0 {
break
}
p.token.key = TokenUnknown
p.token.value = p.str[p.pos : p.pos+1]
p.token.offset = p.offset + p.pos
p.next()
p.emmitToken()
if p.curr == 0 {
break
}
}
if len(p.token.indent) > 0 {
p.tail = p.token.indent
}
}
func (p *parsing) parseWhitespace() bool {
var start = -1
for p.curr != 0 {
var matched = false
for _, ws := range p.t.wSpaces {
if p.curr == ws {
if start == -1 {
start = p.pos
}
matched = true
break
}
}
if !matched {
break
}
if p.curr == newLine {
p.line++
}
p.next()
}
if start != -1 {
p.token.line = p.line
p.token.indent = p.str[start:p.pos]
return true
}
return false
}
func (p *parsing) parseKeyword() bool {
var start = -1
for p.curr != 0 {
var r rune
var size int
p.ensureBytes(4)
r, size = utf8.DecodeRune(p.slice(p.pos, p.pos+4))
if unicode.IsLetter(r) ||
(p.t.flags&fAllowKeywordUnderscore != 0 && p.curr == '_') ||
(p.t.flags&fAllowNumberInKeyword != 0 && start != -1 && isNumberByte(p.curr)) {
if start == -1 {
start = p.pos
}
p.pos += size - 1 // rune may be more than 1 byte
} else {
break
}
p.next()
}
if start != -1 {
p.token.key = TokenKeyword
p.token.value = p.str[start:p.pos]
p.token.offset = p.offset + start
p.emmitToken()
return true
}
return false
}
const (
stageCoefficient = iota + 1
stageMantissa
stagePower
)
func (p *parsing) parseNumber() bool {
var start = -1
var needNumber = true
var stage uint8 = 0
for p.curr != 0 {
if isNumberByte(p.curr) {
needNumber = false
if start == -1 {
if stage == 0 {
stage = stageCoefficient
start = p.pos
}
}
} else if p.t.flags&fAllowNumberUnderscore != 0 && p.curr == '_' {
if stage != stageCoefficient {
break
}
// TODO: check for double underscores
} else if !needNumber && p.curr == '.' {
if stage != stageCoefficient {
break
}
stage = stageMantissa
needNumber = true
} else if !needNumber && (p.curr == 'e' || p.curr == 'E') {
if stage != stageMantissa && stage != stageCoefficient {
break
}
ePowSign := false
switch p.nextByte() {
case '-', '+':
ePowSign = true
p.next()
}
needNumber = true
if isNumberByte(p.nextByte()) {
stage = stagePower
} else {
if ePowSign { // rollback sign position
p.prev()
}
break
}
} else {
break
}
p.next()
}
if stage == 0 {
return false
}
p.token.value = p.str[start:p.pos]
if stage == stageCoefficient {
p.token.key = TokenInteger
p.token.offset = p.offset + start
} else {
p.token.key = TokenFloat
p.token.offset = p.offset + start
}
p.emmitToken()
return true
}
// match compares the next bytes of data with `r`.
func (p *parsing) match(r []byte, seek bool) bool {
if r[0] == p.curr {
if len(r) > 1 {
if p.ensureBytes(len(r) - 1) {
var i = 1
for ; i < len(r); i++ {
if r[i] != p.str[p.pos+i] {
return false
}
}
if seek {
p.pos += i - 1
p.next()
}
return true
}
return false
}
if seek {
p.next()
}
return true
}
return false
}
// parseQuote parses quoted string.
func (p *parsing) parseQuote() bool {
var quote *StringSettings
var start = p.pos
for _, q := range p.t.quotes {
if p.match(q.StartToken, true) {
quote = q
break
}
}
if quote == nil {
return false
}
p.token.key = TokenString
p.token.offset = p.offset + start
p.token.string = quote
escapes := false
for p.curr != 0 {
if escapes {
escapes = false
} else if p.curr == quote.EscapeSymbol {
escapes = true
} else if p.match(quote.EndToken, true) {
break
} else if quote.Injects != nil {
loop := true
for _, inject := range quote.Injects {
for _, token := range p.t.tokens[inject.StartKey] {
if p.match(token.Token, true) {
p.token.key = TokenStringFragment
p.token.value = p.str[start : p.pos-len(token.Token)]
p.emmitToken()
p.token.key = token.Key
p.token.value = token.Token
p.token.offset = p.offset + p.pos - len(token.Token)
p.emmitToken()
stopKeys := p.stopKeys // may be recursive quotes
p.stopKeys = p.t.tokens[inject.EndKey]
p.parse()
p.stopKeys = stopKeys
p.token.key = TokenStringFragment
p.token.offset = p.offset + p.pos
p.token.string = quote
start = p.pos
loop = false
break
}
}
if !loop {
break
}
}
}
if p.curr == newLine {
p.line++
}
p.next()
}
p.token.value = p.str[start:p.pos]
p.emmitToken()
return true
}
// parseToken searches for any defined token sequence at the current position.
func (p *parsing) parseToken() bool {
if p.curr != 0 {
toks := p.t.index[p.curr]
if toks != nil {
start := p.pos
for _, t := range toks {
if p.match(t.Token, true) {
p.token.key = t.Key
p.token.offset = p.offset + start
p.token.value = t.Token
p.emmitToken()
return true
}
}
}
}
return false
}
// emmitToken adds the new p.token to the stream.
func (p *parsing) emmitToken() {
if p.ptr == nil {
p.ptr = p.token
p.head = p.ptr
} else {
p.ptr.addNext(p.token)
p.ptr = p.token
}
p.n++
p.token = p.t.allocToken()
p.token.id = p.n
p.token.line = p.line
}
# Tokenizer
[![Build Status](https://github.com/bzick/tokenizer/actions/workflows/tokenizer.yml/badge.svg)](https://github.com/bzick/tokenizer/actions/workflows/tokenizer.yml)
[![codecov](https://codecov.io/gh/bzick/tokenizer/branch/master/graph/badge.svg?token=MFY5NWATGC)](https://codecov.io/gh/bzick/tokenizer)
[![Go Report Card](https://goreportcard.com/badge/github.com/bzick/tokenizer?rnd=2)](https://goreportcard.com/report/github.com/bzick/tokenizer)
[![GoDoc](https://godoc.org/github.com/bzick/tokenizer?status.svg)](https://godoc.org/github.com/bzick/tokenizer)
Tokenizer parses any string, slice or infinite buffer into tokens.
Main features:
* High performance.
* No regexp.
* Provides [simple API](https://pkg.go.dev/github.com/bzick/tokenizer).
* Supports [integer](#integer-number) and [float](#float-number) numbers.
* Supports [quoted string or other "framed"](#framed-string) strings.
* Supports [injection](#injection-in-framed-string) in quoted or "framed" strings.
* Supports unicode.
* [Customization of tokens](#user-defined-tokens).
* Autodetect white space symbols.
* Parse any data syntax (xml, [json](https://github.com/bzick/tokenizer/blob/master/example_test.go), yaml), any programming language.
* Single pass through the data.
* Parses [infinite incoming data](#parse-buffer) and doesn't panic.
Use cases:
- Parsing html, xml, [json](./example_test.go), yaml and other text formats.
- Parsing huge or infinite texts.
- Parsing any programming languages.
- Parsing templates.
- Parsing formulas.
For example, parsing SQL `WHERE` condition `user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`:
```go
// define custom tokens keys
const (
TEquality = 1
TDot = 2
TMath = 3
TString = 4
)
// configure tokenizer
parser := tokenizer.New()
parser.DefineTokens(TEquality, []string{"<", "<=", "==", ">=", ">", "!="})
parser.DefineTokens(TDot, []string{"."})
parser.DefineTokens(TMath, []string{"+", "-", "/", "*", "%"})
parser.DefineStringToken(`"`, `"`).SetEscapeSymbol(tokenizer.BackSlash)
// create tokens stream
stream := parser.ParseString(`user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`)
defer stream.Close()
// iterate over each token
for stream.IsValid() {
if stream.CurrentToken().Is(tokenizer.TokenKeyword) {
field := stream.CurrentToken().ValueString()
// ...
}
stream.GoNext()
}
```
The resulting token stream:
```
string: user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34
tokens: |user_id| =| 119| and| modified| >| "2020-01-01 00:00:00"| or| amount| >=| 122.34|
| 0 | 1| 2 | 3 | 4 | 5| 6 | 7 | 8 | 9 | 10 |
0: {key: TokenKeyword, value: "user_id"} token.Value() == "user_id"
1: {key: TEquality, value: "="} token.Value() == "="
2: {key: TokenInteger, value: "119"} token.ValueInt() == 119
3: {key: TokenKeyword, value: "and"} token.Value() == "and"
4: {key: TokenKeyword, value: "modified"} token.Value() == "modified"
5: {key: TEquality, value: ">"} token.Value() == ">"
6: {key: TokenString, value: "\"2020-01-01 00:00:00\""} token.ValueUnescaped() == "2020-01-01 00:00:00"
7: {key: TokenKeyword, value: "or"} token.Value() == "or"
8: {key: TokenKeyword, value: "amount"} token.Value() == "amount"
9: {key: TEquality, value: ">="} token.Value() == ">="
10: {key: TokenFloat, value: "122.34"} token.ValueFloat() == 122.34
```
More examples:
- [JSON parser](./example_test.go)
## Begin
### Create and parse
```go
import (
"github.com/bzick/tokenizer"
)
parser := tokenizer.New()
parser.AllowKeywordUnderscore() // ... and other configuration code
```
There are two ways to **parse a string or slice**:
- `parser.ParseString(str)`
- `parser.ParseBytes(slice)`
The package also allows you to **parse an endless stream** of data into tokens.
To do so, pass an `io.Reader` from which data will be read chunk-by-chunk:
```go
fp, err := os.Open("data.json") // huge JSON file
// check err, configure tokenizer ...
stream := parser.ParseStream(fp, 4096).SetHistorySize(10)
defer stream.Close()
for stream.IsValid() {
// ...
stream.GoNext()
}
```
## Embedded tokens
- `tokenizer.TokenUnknown` — unspecified token key.
- `tokenizer.TokenKeyword` — keyword, any combination of letters, including unicode letters.
- `tokenizer.TokenInteger` — integer value
- `tokenizer.TokenFloat` — float/double value
- `tokenizer.TokenString` — quoted string
- `tokenizer.TokenStringFragment` — a fragment of a framed (quoted) string
### Unknown token — `tokenizer.TokenUnknown`
A token is marked as `TokenUnknown` if the parser detects an unknown sequence:
```go
parser.ParseString(`one!`)
```
```
{
{
Key: tokenizer.TokenKeyword
Value: "One"
},
{
Key: tokenizer.TokenUnknown
Value: "!"
}
}
```
By default, `TokenUnknown` tokens are added to the stream.
To exclude them from the stream, use the `tokenizer.StopOnUndefinedToken()` method:
```
{
{
Key: tokenizer.TokenKeyword
Value: "one"
}
}
```
Please note that if the `tokenizer.StopOnUndefinedToken` setting is enabled, the string may not be fully parsed.
To detect this, compare the parsed length reported by `stream.GetParsedLength()`
with the length of the original string, as sketched below.
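A minimal sketch of that check (assuming a configured `parser` as above; `src` is a hypothetical input string):
```go
parser.StopOnUndefinedToken()
stream := parser.ParseString(src) // src is a hypothetical input string
defer stream.Close()
// ... consume tokens ...
if stream.GetParsedLength() < len(src) {
	// the parser stopped at an undefined token before the end of src
}
```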
### Keywords
Any word that is not a custom token is stored in a single token as `tokenizer.TokenKeyword`.
The word can contain unicode characters, numbers (see `tokenizer.AllowNumbersInKeyword()`) and underscores (see `tokenizer.AllowKeywordUnderscore()`).
```go
parser.ParseString(`one two четыре`)
```
```
tokens: {
{
Key: tokenizer.TokenKeyword
Value: "one"
},
{
Key: tokenizer.TokenKeyword
Value: "two"
},
{
Key: tokenizer.TokenKeyword
Value: "четыре"
}
}
```
### Integer number
Any integer is stored as one token with key `tokenizer.TokenInteger`.
```go
parser.ParseString(`223 999`)
```
```
tokens: {
{
Key: tokenizer.TokenInteger
Value: "223"
},
{
Key: tokenizer.TokenInteger
Value: "999"
},
}
```
To get an int64 from the token value use `token.ValueInt()`:
```go
stream := parser.ParseString("123")
fmt.Printf("Token is %d", stream.CurrentToken().ValueInt()) // Token is 123
```
### Float number
Any float number is stored as one token with key `tokenizer.TokenFloat`. Float number may
- have point, for example `1.2`
- have exponent, for example `1e6`
- have lower `e` or upper `E` letter in the exponent, for example `1E6`, `1e6`
- have sign in the exponent, for example `1e-6`, `1e6`, `1e+6`
```
tokenizer.ParseString(`1.3e-8`):
{
{
Key: tokenizer.TokenFloat
Value: "1.3e-8"
},
}
```
To get a float64 from the token value use `token.ValueFloat()`:
```go
stream := parser.ParseString("1.3e2")
fmt.Printf("Token is %v", stream.CurrentToken().ValueFloat()) // Token is 130
```
### Framed string
Strings that are framed with tokens are called framed strings. An obvious example is a quoted string like `"one two"`;
the quotes are the edge tokens.
You can create and customize framed strings through `DefineStringToken()`:
```go
const TokenDoubleQuotedString = 10
parser.DefineStringToken(TokenDoubleQuotedString, `"`, `"`).SetEscapeSymbol('\\')
stream := parser.ParseString(`"two \"three"`)
```
```
{
{
Key: tokenizer.TokenString
Value: "\"two \\"three\""
},
}
```
To get a framed string without edge tokens and special characters, use the `token.ValueUnescaped()` method:
```go
v := stream.CurrentToken().ValueUnescaped() // result: two "three
```
The method `token.StringKey()` returns the token string key defined in `DefineStringToken`:
```go
stream.CurrentToken().StringKey() == TokenDoubleQuotedString // true
```
### Injection in framed string
Strings can contain expression substitutions that can be parsed into tokens. For example `"one {{two}} three"`.
Fragments of strings before, between and after substitutions will be stored in tokens as `tokenizer.TokenStringFragment`.
```go
const (
TokenOpenInjection = 1
TokenCloseInjection = 2
TokenQuotedString = 3
)
parser := tokenizer.New()
parser.DefineTokens(TokenOpenInjection, []string{"{{"})
parser.DefineTokens(TokenCloseInjection, []string{"}}"})
parser.DefineStringToken(TokenQuotedString, `"`, `"`).AddInjection(TokenOpenInjection, TokenCloseInjection)
parser.ParseString(`"one {{ two }} three"`)
```
Tokens:
```
{
{
Key: tokenizer.TokenStringFragment,
Value: "one"
},
{
Key: TokenOpenInjection,
Value: "{{"
},
{
Key: tokenizer.TokenKeyword,
Value: "two"
},
{
Key: TokenCloseInjection,
Value: "}}"
},
{
Key: tokenizer.TokenStringFragment,
Value: "three"
},
}
```
Use cases:
- parse templates
- parse placeholders
## User defined tokens
The new token can be defined via the `DefineTokens` method:
```go
const (
TokenCurlyOpen = 1
TokenCurlyClose = 2
TokenSquareOpen = 3
TokenSquareClose = 4
TokenColon = 5
TokenComma = 6
TokenDoubleQuoted = 7
)
// json parser
parser := tokenizer.New()
parser.
DefineTokens(TokenCurlyOpen, []string{"{"}).
DefineTokens(TokenCurlyClose, []string{"}"}).
DefineTokens(TokenSquareOpen, []string{"["}).
DefineTokens(TokenSquareClose, []string{"]"}).
DefineTokens(TokenColon, []string{":"}).
DefineTokens(TokenComma, []string{","}).
DefineStringToken(TokenDoubleQuoted, `"`, `"`).SetSpecialSymbols(tokenizer.DefaultStringEscapes)
stream := parser.ParseString(`{"key": [1]}`)
```
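The resulting stream can then be walked with the iterator API from `stream.go` (shown below); a minimal sketch:
```go
for stream.IsValid() {
	tok := stream.CurrentToken()
	fmt.Printf("%d: %s\n", tok.ID(), tok.ValueString())
	stream.GoNext()
}
stream.Close()
```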
## Known issues
* the zero byte `\0` is ignored in the source string.
## Benchmark
Parse string/bytes
```
pkg: tokenizer
cpu: Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
BenchmarkParseBytes
stream_test.go:251: Speed: 70 bytes string with 19.689µs: 3555284 byte/sec
stream_test.go:251: Speed: 7000 bytes string with 848.163µs: 8253130 byte/sec
stream_test.go:251: Speed: 700000 bytes string with 75.685945ms: 9248744 byte/sec
stream_test.go:251: Speed: 11093670 bytes string with 1.16611538s: 9513355 byte/sec
BenchmarkParseBytes-8 158481 7358 ns/op
```
Parse infinite stream
```
pkg: tokenizer
cpu: Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
BenchmarkParseInfStream
stream_test.go:226: Speed: 70 bytes at 33.826µs: 2069414 byte/sec
stream_test.go:226: Speed: 7000 bytes at 627.357µs: 11157921 byte/sec
stream_test.go:226: Speed: 700000 bytes at 27.675799ms: 25292856 byte/sec
stream_test.go:226: Speed: 30316440 bytes at 1.18061702s: 25678471 byte/sec
BenchmarkParseInfStream-8 433092 2726 ns/op
PASS
```
package tokenizer
import (
"strconv"
"strings"
)
// Stream is an iterator over parsed tokens.
// If the data comes from an infinite buffer, the iterator reads it from the reader chunk-by-chunk.
type Stream struct {
t *Tokenizer
// count of tokens in the stream
len int
// pointer to the current node of the doubly linked list of tokens
current *Token
// pointer to the last valid token when current has moved past the end of the list
prev *Token
// pointer to the first valid token when current has moved before the beginning of the list
next *Token
// pointer to head of list
head *Token
// trailing whitespace before the end of the source
wsTail []byte
// count of parsed bytes
parsed int
p *parsing
historySize int
}
// NewStream creates a new stream of parsed tokens.
func NewStream(p *parsing) *Stream {
return &Stream{
t: p.t,
head: p.head,
current: p.head,
len: p.n,
wsTail: p.tail,
parsed: p.parsed + p.pos,
}
}
// NewInfStream creates a new stream with an active parser.
func NewInfStream(p *parsing) *Stream {
return &Stream{
t: p.t,
p: p,
len: p.n,
head: p.head,
current: p.head,
}
}
// SetHistorySize sets the number of tokens that should be kept behind the current token.
func (s *Stream) SetHistorySize(size int) *Stream {
s.historySize = size
return s
}
// Close releases all token objects back to the pool.
func (s *Stream) Close() {
for ptr := s.head; ptr != nil; {
p := ptr.next
s.t.freeToken(ptr)
ptr = p
}
s.next = nil
s.prev = nil
s.head = undefToken
s.current = undefToken
s.len = 0
}
func (s *Stream) String() string {
items := make([]string, 0, s.len)
ptr := s.head
for ptr != nil {
items = append(items, strconv.Itoa(ptr.id)+": "+ptr.String())
ptr = ptr.next
}
return strings.Join(items, "\n")
}
// GetParsedLength returns the number of bytes parsed so far.
func (s *Stream) GetParsedLength() int {
if s.p == nil {
return s.parsed
} else {
return s.p.parsed + s.p.pos
}
}
// GoNext moves stream pointer to the next token.
// If there is no token, it initiates the parsing of the next chunk of data.
// If there is no data, the pointer will point to the TokenUndef token.
func (s *Stream) GoNext() *Stream {
if s.current.next != nil {
s.current = s.current.next
if s.current.next == nil && s.p != nil { // lazy load and parse next data-chunk
n := s.p.n
s.p.parse()
s.len += s.p.n - n
}
if s.historySize != 0 && s.current.id-s.head.id > s.historySize {
t := s.head
s.head = s.head.unlink()
s.t.freeToken(t)
s.len--
}
} else if s.current == undefToken {
s.current = s.prev
s.prev = nil
} else {
s.prev = s.current
s.current = undefToken
}
return s
}
// GoPrev moves the stream pointer to the previous token.
// The number of possible calls is limited if you specified SetHistorySize.
// If the beginning of the stream or the end of the history is reached, the pointer will point to the TokenUndef token.
func (s *Stream) GoPrev() *Stream {
if s.current.prev != nil {
s.current = s.current.prev
} else if s.current == undefToken {
s.current = s.next
s.prev = nil
} else {
s.next = s.current
s.current = undefToken
}
return s
}
// GoTo moves the stream pointer to a specific token.
// The search is done by token ID.
func (s *Stream) GoTo(id int) *Stream {
if id > s.current.id {
for s.current != nil && id != s.current.id {
s.GoNext()
}
} else if id < s.current.id {
for s.current != nil && id != s.current.id {
s.GoPrev()
}
}
return s
}
// IsValid checks if stream is valid.
// This means that the pointer has not reached the end of the stream.
func (s *Stream) IsValid() bool {
return s.current != undefToken
}
// IsNextSequence checks whether the next tokens appear in exactly the specified sequence.
func (s *Stream) IsNextSequence(keys ...TokenKey) bool {
var (
result = true
hSize = 0
id = s.CurrentToken().ID()
)
if s.historySize > 0 && s.historySize < len(keys) {
hSize = s.historySize
s.historySize = len(keys)
}
for _, key := range keys {
if !s.GoNext().CurrentToken().Is(key) {
result = false
break
}
}
s.GoTo(id)
if hSize != 0 {
s.SetHistorySize(hSize)
}
return result
}
// IsAnyNextSequence checks that each of the next tokens matches at least one key from the corresponding group.
func (s *Stream) IsAnyNextSequence(keys ...[]TokenKey) bool {
var (
result = true
hSize = 0
id = s.CurrentToken().ID()
)
if s.historySize > 0 && s.historySize < len(keys) {
hSize = s.historySize
s.historySize = len(keys)
}
for _, key := range keys {
found := false
for _, k := range key {
if s.GoNext().CurrentToken().Is(k) {
found = true
break
}
}
if !found {
result = false
break
}
}
s.GoTo(id)
if hSize != 0 {
s.SetHistorySize(hSize)
}
return result
}
// HeadToken returns a pointer to the head token.
// The head token may change if a history size is set.
func (s *Stream) HeadToken() *Token {
return s.head
}
// CurrentToken always returns a token.
// If the pointer is not valid (see IsValid), CurrentToken returns the TokenUndef token.
// Do not store the result (Token) in variables: the current token may change at any time.
func (s *Stream) CurrentToken() *Token {
return s.current
}
// PrevToken returns the previous token from the stream.
// If there is no previous token, the method returns the TokenUndef token.
// Do not store the result (Token) in variables: the previous token may change at any time.
func (s *Stream) PrevToken() *Token {
if s.current.prev != nil {
return s.current.prev
}
return undefToken
}
// NextToken returns the next token from the stream.
// If there is no next token, the method returns the TokenUndef token.
// Do not store the result (Token) in variables: the next token may change at any time.
func (s *Stream) NextToken() *Token {
if s.current.next != nil {
return s.current.next
}
return undefToken
}
// GoNextIfNextIs moves the stream pointer to the next token if that token has one of the given keys.
// If a key matches, the pointer is advanced and the method returns true; otherwise it returns false.
func (s *Stream) GoNextIfNextIs(key TokenKey, otherKeys ...TokenKey) bool {
if s.NextToken().Is(key, otherKeys...) {
s.GoNext()
return true
}
return false
}
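// Usage sketch (illustrative): consume an optional token in a single step.
//
//	if s.GoNextIfNextIs(TokenKeyword) {
//		// the pointer now sits on the keyword token
//	}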
// GetSnippet returns a slice of tokens around the current position,
// including up to `before` tokens before and `after` tokens after the current token.
func (s *Stream) GetSnippet(before, after int) []Token {
var segment []Token
if s.current == undefToken {
if s.prev != nil && before > s.prev.id-s.head.id {
before = s.prev.id - s.head.id
} else {
before = 0
}
} else if before > s.current.id-s.head.id {
before = s.current.id - s.head.id
}
if after > s.len-before-1 {
after = s.len - before - 1
}
segment = make([]Token, before+after+1)
var ptr *Token
if s.next != nil {
ptr = s.next
} else if s.prev != nil {
ptr = s.prev
} else {
ptr = s.current
}
for p, i := ptr, before; p != nil; p, i = p.prev, i-1 { // walk backwards, filling slots before..0
segment[i] = Token{
id: p.id,
key: p.key,
value: p.value,
line: p.line,
offset: p.offset,
indent: p.indent,
string: p.string,
}
if i <= 0 { // reached the start of the snippet
break
}
}
for p, i := ptr.next, 1; p != nil; p, i = p.next, i+1 {
segment[before+i] = Token{
id: p.id,
key: p.key,
value: p.value,
line: p.line,
offset: p.offset,
indent: p.indent,
string: p.string,
}
if i >= after {
break
}
}
return segment
}
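// Usage sketch (illustrative): snippets suit error messages that show the
// tokens around the point of failure (see GetSnippetAsString below).
//
//	// up to 2 tokens on each side, each token string capped at 30 bytes
//	context := s.GetSnippetAsString(2, 2, 30)
//	err := fmt.Errorf("unexpected token near: %s", context)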
// GetSnippetAsString returns the tokens before and after the current token as a string.
// `maxStringLength` specifies the maximum length of each token string; zero means unlimited.
// If a token string is longer than maxStringLength, runes are removed from the middle.
func (s *Stream) GetSnippetAsString(before, after, maxStringLength int) string {
segments := s.GetSnippet(before, after)
str := make([]string, len(segments))
for i, token := range segments {
v := token.ValueString()
if maxStringLength > 4 && len(v) > maxStringLength {
str[i] = v[:maxStringLength/2] + "..." + v[len(v)-maxStringLength/2:] // drop runes from the middle
} else {
str[i] = v
}
}
return strings.Join(str, "")
}
package tokenizer
import (
"fmt"
"strconv"
)
var undefToken = &Token{
id: -1,
}
// Token describes one token.
type Token struct {
id int
key TokenKey
value []byte
line int
offset int
indent []byte
string *StringSettings
prev *Token
next *Token
}
// addNext adds a new token as the next node of the doubly linked list.
func (t *Token) addNext(next *Token) {
next.prev = t
t.next = next
}
// unlink removes the token from the doubly linked list and clears its links.
// Method returns the next token, or nil if there is none.
func (t *Token) unlink() *Token {
next := t.next
if next != nil { // guard against unlinking the only token
next.prev = nil
}
t.next = nil
t.prev = nil
return next
}
// ID returns the id of the token. The id is the token's sequence number in the stream.
func (t *Token) ID() int {
return t.id
}
// String returns a multiline string with the token's information.
func (t Token) String() string {
return fmt.Sprintf("{\n\tId: %d\n\tKey: %d\n\tValue: %s\n\tPosition: %d\n\tIndent: %d bytes\n\tLine: %d\n}",
t.id, t.key, t.value, t.offset, len(t.indent), t.line)
}
// IsValid checks if this token is valid — the key is not TokenUndef.
func (t *Token) IsValid() bool {
return t.key != TokenUndef
}
// IsKeyword checks if this token is a keyword (the key is TokenKeyword).
func (t Token) IsKeyword() bool {
return t.key == TokenKeyword
}
// IsNumber checks if this token is integer or float — the key is TokenInteger or TokenFloat.
func (t Token) IsNumber() bool {
return t.key == TokenInteger || t.key == TokenFloat
}
// IsFloat checks if this token is float — the key is TokenFloat.
func (t Token) IsFloat() bool {
return t.key == TokenFloat
}
// IsInteger checks if this token is integer — the key is TokenInteger.
func (t Token) IsInteger() bool {
return t.key == TokenInteger
}
// ValueInt returns the value as int64.
// If the token is a float, the result is truncated toward zero.
// If the token is not TokenInteger or TokenFloat, zero is returned.
// Method doesn't use cache. Each call starts the number parser.
func (t Token) ValueInt() int64 {
if t.key == TokenInteger {
num, _ := strconv.ParseInt(b2s(t.value), 10, 64)
return num
} else if t.key == TokenFloat {
num, _ := strconv.ParseFloat(b2s(t.value), 64)
return int64(num)
}
return 0
}
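// Behavior sketch (illustrative): for a token with key TokenFloat and value
// "3.9", the conversions behave as follows.
//
//	n := tok.ValueInt()   // n == 3 (truncated toward zero)
//	f := tok.ValueFloat() // f == 3.9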
// ValueFloat returns the value as float64.
// If the token is not TokenInteger or TokenFloat, zero is returned.
// Method doesn't use cache. Each call starts the number parser.
func (t *Token) ValueFloat() float64 {
if t.key == TokenFloat {
num, _ := strconv.ParseFloat(b2s(t.value), 64)
return num
} else if t.key == TokenInteger {
num, _ := strconv.ParseInt(b2s(t.value), 10, 64)
return float64(num)
}
return 0.0
}
// Indent returns the whitespace bytes preceding the token.
func (t *Token) Indent() []byte {
return t.indent
}
// Key returns the key of the token.
// If the token is not valid (see IsValid), TokenUndef is returned.
func (t *Token) Key() TokenKey {
return t.key
}
// Value returns the value of the token as a slice of bytes from the source.
// If the token is invalid, nil is returned.
//
// Do not change the bytes in the slice; copy the slice before modifying it.
func (t *Token) Value() []byte {
return t.value
}
// ValueString returns the value of the token as a string.
// If the token is TokenUndef, an empty string is returned.
func (t *Token) ValueString() string {
if t.value == nil {
return ""
}
return b2s(t.value)
}
// Line returns the line number in the input string.
// Line numbers start from 1.
func (t *Token) Line() int {
return t.line
}
// Offset returns the byte position in input string (from start).
func (t *Token) Offset() int {
return t.offset
}
// StringSettings returns the StringSettings structure if the token is a framed string.
func (t *Token) StringSettings() *StringSettings {
return t.string
}
// StringKey returns the key of the string.
// If no key is defined for the string, TokenString is returned.
func (t *Token) StringKey() TokenKey {
if t.string != nil {
return t.string.Key
}
return TokenString
}
// IsString checks if current token is a quoted string.
// Token key may be TokenString or TokenStringFragment.
func (t Token) IsString() bool {
return t.key == TokenString || t.key == TokenStringFragment
}
// ValueUnescaped returns the clear (unquoted) string:
// - without the edge tokens (quotes)
// - with escape sequences resolved
//
// For example, the quoted string
// "one \"two\"\t three"
// transforms to
// one "two" three
// Method doesn't use cache. Each call starts the string parser.
func (t *Token) ValueUnescaped() []byte {
if t.string != nil {
from := 0
to := len(t.value)
if bytesStarts(t.string.StartToken, t.value) {
from = len(t.string.StartToken)
}
if bytesEnds(t.string.EndToken, t.value) {
to = len(t.value) - len(t.string.EndToken)
}
str := t.value[from:to]
result := make([]byte, 0, len(str))
escaping := false
start := 0
for i := 0; i < len(str); i++ {
if escaping {
if v, ok := t.string.SpecSymbols[str[i]]; ok {
result = append(result, str[start:i-1]...) // copy up to, but not including, the escape symbol
result = append(result, v)
start = i + 1
}
escaping = false
} else if t.string.EscapeSymbol != 0 && str[i] == t.string.EscapeSymbol {
escaping = true
}
}
if start == 0 { // no escape sequences found
return str
}
return append(result, str[start:]...) // append the tail after the last escape sequence
}
return t.value
}
// ValueUnescapedString is like ValueUnescaped but returns a string.
func (t *Token) ValueUnescapedString() string {
if s := t.ValueUnescaped(); s != nil {
return b2s(s)
}
return ""
}
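// Usage sketch (illustrative): raw versus decoded values of a string token.
//
//	if tok.IsString() {
//		raw := tok.ValueString()              // includes quotes and escape sequences
//		decoded := tok.ValueUnescapedString() // quotes stripped, escapes resolved
//		_, _ = raw, decoded
//	}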
// Is checks if the token has any of these keys.
func (t *Token) Is(key TokenKey, keys ...TokenKey) bool {
if t.key == key {
return true
}
for _, k := range keys {
if t.key == k {
return true
}
}
return false
}
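// Usage sketch (illustrative): Is accepts one or more keys, which keeps parser
// conditions compact.
//
//	if s.CurrentToken().Is(TokenInteger, TokenFloat) {
//		// numeric literal
//	}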