package html // import "github.com/tdewolff/parse/html"
import (
"bytes"
"fmt"
"io"
"testing"
"github.com/tdewolff/parse"
"github.com/tdewolff/test"
)
// TTs is a shorthand for a sequence of expected token types in the
// table-driven lexer tests below.
type TTs []TokenType
func TestTokens(t *testing.T) {
var tokenTests = []struct {
html string
expected []TokenType
}{
{"", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
{"", TTs{StartTagToken, StartTagVoidToken}},
{"", TTs{CommentToken}},
{"", TTs{CommentToken}},
{"
text
", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
{"", TTs{StartTagToken, AttributeToken, StartTagVoidToken}},
{"", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagVoidToken}},
{"", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, AttributeToken, StartTagVoidToken}},
{"", TTs{DoctypeToken}},
{"", TTs{DoctypeToken}},
{"", TTs{CommentToken}},
{"0bogus>", TTs{CommentToken}},
{"", TTs{CommentToken}},
{"< ", TTs{TextToken}},
{"", TTs{TextToken}},
// raw tags
{"", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
{"", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
{"", TTs{StartTagToken, StartTagCloseToken, TextToken}},
{"", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
{"';", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}},
{"';-->", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}},
{"';-->", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
{"", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
{"", TTs{TextToken}},
{"", TTs{SvgToken}},
{"", TTs{MathToken}},
{``, TTs{SvgToken}},
{"", TTs{StartTagToken, StartTagCloseToken, SvgToken, EndTagToken}},
// early endings
{"", TTs{StartTagToken, StartTagCloseToken, TextToken}},
// NULL
{"foo\x00bar", TTs{TextToken}},
{"<\x00foo>", TTs{TextToken}},
{"", TTs{StartTagToken, StartTagCloseToken}},
{"\x00bogus>", TTs{CommentToken}},
{"", TTs{EndTagToken}},
{"\x00", TTs{StartTagToken, StartTagCloseToken, TextToken}},
{"", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
{"", TTs{CommentToken}},
{"", TTs{TextToken}},
{"", TTs{DoctypeToken}},
{"", TTs{CommentToken}},
{"", TTs{CommentToken}},
// go-fuzz
{">", TTs{TextToken}},
}
for _, tt := range tokenTests {
t.Run(tt.html, func(t *testing.T) {
l := NewLexer(bytes.NewBufferString(tt.html))
i := 0
for {
token, _ := l.Next()
if token == ErrorToken {
test.T(t, l.Err(), io.EOF)
test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
break
}
test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected))
if i < len(tt.expected) {
test.T(t, token, tt.expected[i], "token types must match")
}
i++
}
})
}
test.T(t, TokenType(100).String(), "Invalid(100)")
}
func TestTags(t *testing.T) {
var tagTests = []struct {
html string
expected string
}{
{"", "foo:bar.qux-norf"},
{"", "foo?bar/qux"},
{"", " note SYSTEM \"Note.dtd\""},
{"", "foo"},
// early endings
{"", []string{"a", "\"b\""}},
{"\"' />", []string{"checked", "", "value", "'=/>\"'"}},
{"", []string{"bar", "\" a \n\t\r b \""}},
{"", []string{"a", ""}},
{"", []string{"/", "/"}},
// early endings
{"", []string{"\x00", ""}},
{"", []string{"\x00", "\x00"}},
{"", []string{"\x00", "'\x00'"}},
}
for _, tt := range attributeTests {
t.Run(tt.attr, func(t *testing.T) {
l := NewLexer(bytes.NewBufferString(tt.attr))
i := 0
for {
token, _ := l.Next()
if token == ErrorToken {
test.T(t, l.Err(), io.EOF)
test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
break
} else if token == AttributeToken {
test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected))
if i+1 < len(tt.expected) {
test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match")
test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match")
i += 2
}
}
}
})
}
}
// TestErrors lexes malformed input until the error token appears and
// verifies the reported error: a parse.Error whose column matches the
// fixture, or plain io.EOF when the fixture's col is zero.
func TestErrors(t *testing.T) {
	errorTests := []struct {
		html string
		col  int
	}{
		{"", 6},
		{"", 11},
	}
	for _, tc := range errorTests {
		t.Run(tc.html, func(t *testing.T) {
			lexer := NewLexer(bytes.NewBufferString(tc.html))
			for {
				token, _ := lexer.Next()
				if token != ErrorToken {
					continue
				}
				switch {
				case tc.col == 0:
					// No positional error expected: lexing ran to EOF.
					test.T(t, lexer.Err(), io.EOF)
				default:
					perr, ok := lexer.Err().(*parse.Error)
					if !ok {
						test.Fail(t, "bad error:", lexer.Err())
						break
					}
					_, col, _ := perr.Position()
					test.T(t, col, tc.col)
				}
				break
			}
		})
	}
}
////////////////////////////////////////////////////////////////
// J is a package-level sink that the whitespace benchmarks add into, so
// the compiler cannot dead-code-eliminate their loop bodies.
var J int

// ss holds the sample inputs for the whitespace-skipping benchmarks.
// Every slice ends with a non-whitespace byte, so the unbounded scan
// loops below always terminate before running past the end.
var ss = [][]byte{
	[]byte(" style"),
	[]byte("style"),
	[]byte(" \r\n\tstyle"),
	[]byte(" style"),
	[]byte(" x"),
	[]byte("x"),
}
// BenchmarkWhitespace1 measures skipping leading whitespace with an
// if/else and an explicit break. It is one of three deliberately
// different-shaped loops (see BenchmarkWhitespace2/3) comparing codegen
// for the same scan, so its structure must not be "cleaned up".
func BenchmarkWhitespace1(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for _, s := range ss {
			j := 0
			for {
				if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
					j++
				} else {
					break
				}
			}
			// Accumulate into the package sink to defeat dead-code elimination.
			J += j
		}
	}
}
// BenchmarkWhitespace2 is the same whitespace scan as
// BenchmarkWhitespace1, but shaped as continue-then-break instead of
// if/else; kept intentionally distinct for codegen comparison.
func BenchmarkWhitespace2(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for _, s := range ss {
			j := 0
			for {
				if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
					j++
					continue
				}
				break
			}
			// Accumulate into the package sink to defeat dead-code elimination.
			J += j
		}
	}
}
// BenchmarkWhitespace3 is the same whitespace scan with the condition
// negated (break on non-whitespace); the third loop shape under
// comparison alongside BenchmarkWhitespace1/2.
func BenchmarkWhitespace3(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for _, s := range ss {
			j := 0
			for {
				if c := s[j]; c != ' ' && c != '\t' && c != '\n' && c != '\r' && c != '\f' {
					break
				}
				j++
			}
			// Accumulate into the package sink to defeat dead-code elimination.
			J += j
		}
	}
}
////////////////////////////////////////////////////////////////
// ExampleNewLexer demonstrates draining the lexer: every token's raw
// bytes are collected until ErrorToken, then printed.
func ExampleNewLexer() {
	lexer := NewLexer(bytes.NewBufferString("John Doe"))
	var out bytes.Buffer
	for {
		tokenType, data := lexer.Next()
		if tokenType == ErrorToken {
			break
		}
		out.Write(data)
	}
	fmt.Println(out.String())
	// Output: John Doe
}