miniflux/vendor/github.com/tdewolff/parse/css/lex.go

711 lines
15 KiB
Go
Raw Normal View History

2017-11-20 06:10:04 +01:00
// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
package css // import "github.com/tdewolff/parse/css"
// TODO: \uFFFD replacement character for NULL bytes in strings for example, or atleast don't end the string early
import (
"bytes"
"io"
"strconv"
"github.com/tdewolff/parse"
"github.com/tdewolff/parse/buffer"
)
// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32
// TokenType values.
const (
ErrorToken TokenType = iota // extra token when errors occur
IdentToken
FunctionToken // rgb( rgba( ...
AtKeywordToken // @abc
HashToken // #abc
StringToken
BadStringToken
URLToken
BadURLToken
DelimToken // any unmatched character
NumberToken // 5
PercentageToken // 5%
DimensionToken // 5em
UnicodeRangeToken // U+554A
IncludeMatchToken // ~=
DashMatchToken // |=
PrefixMatchToken // ^=
SuffixMatchToken // $=
SubstringMatchToken // *=
ColumnToken // ||
WhitespaceToken // space \t \r \n \f
CDOToken // <!--
CDCToken // -->
ColonToken // :
SemicolonToken // ;
CommaToken // ,
LeftBracketToken // [
RightBracketToken // ]
LeftParenthesisToken // (
RightParenthesisToken // )
LeftBraceToken // {
RightBraceToken // }
CommentToken // extra token for comments
EmptyToken
CustomPropertyNameToken
CustomPropertyValueToken
)
// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
switch tt {
case ErrorToken:
return "Error"
case IdentToken:
return "Ident"
case FunctionToken:
return "Function"
case AtKeywordToken:
return "AtKeyword"
case HashToken:
return "Hash"
case StringToken:
return "String"
case BadStringToken:
return "BadString"
case URLToken:
return "URL"
case BadURLToken:
return "BadURL"
case DelimToken:
return "Delim"
case NumberToken:
return "Number"
case PercentageToken:
return "Percentage"
case DimensionToken:
return "Dimension"
case UnicodeRangeToken:
return "UnicodeRange"
case IncludeMatchToken:
return "IncludeMatch"
case DashMatchToken:
return "DashMatch"
case PrefixMatchToken:
return "PrefixMatch"
case SuffixMatchToken:
return "SuffixMatch"
case SubstringMatchToken:
return "SubstringMatch"
case ColumnToken:
return "Column"
case WhitespaceToken:
return "Whitespace"
case CDOToken:
return "CDO"
case CDCToken:
return "CDC"
case ColonToken:
return "Colon"
case SemicolonToken:
return "Semicolon"
case CommaToken:
return "Comma"
case LeftBracketToken:
return "LeftBracket"
case RightBracketToken:
return "RightBracket"
case LeftParenthesisToken:
return "LeftParenthesis"
case RightParenthesisToken:
return "RightParenthesis"
case LeftBraceToken:
return "LeftBrace"
case RightBraceToken:
return "RightBrace"
case CommentToken:
return "Comment"
case EmptyToken:
return "Empty"
case CustomPropertyNameToken:
return "CustomPropertyName"
case CustomPropertyValueToken:
return "CustomPropertyValue"
}
return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}
////////////////////////////////////////////////////////////////
// Lexer is the state for the lexer.
type Lexer struct {
r *buffer.Lexer
}
// NewLexer returns a new Lexer for a given io.Reader.
func NewLexer(r io.Reader) *Lexer {
return &Lexer{
buffer.NewLexer(r),
}
}
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
return l.r.Err()
}
// Restore restores the NULL byte at the end of the buffer.
func (l *Lexer) Restore() {
l.r.Restore()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
switch l.r.Peek(0) {
case ' ', '\t', '\n', '\r', '\f':
l.r.Move(1)
for l.consumeWhitespace() {
}
return WhitespaceToken, l.r.Shift()
case ':':
l.r.Move(1)
return ColonToken, l.r.Shift()
case ';':
l.r.Move(1)
return SemicolonToken, l.r.Shift()
case ',':
l.r.Move(1)
return CommaToken, l.r.Shift()
case '(', ')', '[', ']', '{', '}':
if t := l.consumeBracket(); t != ErrorToken {
return t, l.r.Shift()
}
case '#':
if l.consumeHashToken() {
return HashToken, l.r.Shift()
}
case '"', '\'':
if t := l.consumeString(); t != ErrorToken {
return t, l.r.Shift()
}
case '.', '+':
if t := l.consumeNumeric(); t != ErrorToken {
return t, l.r.Shift()
}
case '-':
if t := l.consumeNumeric(); t != ErrorToken {
return t, l.r.Shift()
} else if t := l.consumeIdentlike(); t != ErrorToken {
return t, l.r.Shift()
} else if l.consumeCDCToken() {
return CDCToken, l.r.Shift()
} else if l.consumeCustomVariableToken() {
return CustomPropertyNameToken, l.r.Shift()
}
case '@':
if l.consumeAtKeywordToken() {
return AtKeywordToken, l.r.Shift()
}
case '$', '*', '^', '~':
if t := l.consumeMatch(); t != ErrorToken {
return t, l.r.Shift()
}
case '/':
if l.consumeComment() {
return CommentToken, l.r.Shift()
}
case '<':
if l.consumeCDOToken() {
return CDOToken, l.r.Shift()
}
case '\\':
if t := l.consumeIdentlike(); t != ErrorToken {
return t, l.r.Shift()
}
case 'u', 'U':
if l.consumeUnicodeRangeToken() {
return UnicodeRangeToken, l.r.Shift()
} else if t := l.consumeIdentlike(); t != ErrorToken {
return t, l.r.Shift()
}
case '|':
if t := l.consumeMatch(); t != ErrorToken {
return t, l.r.Shift()
} else if l.consumeColumnToken() {
return ColumnToken, l.r.Shift()
}
case 0:
if l.Err() != nil {
return ErrorToken, nil
}
default:
if t := l.consumeNumeric(); t != ErrorToken {
return t, l.r.Shift()
} else if t := l.consumeIdentlike(); t != ErrorToken {
return t, l.r.Shift()
}
}
// can't be rune because consumeIdentlike consumes that as an identifier
l.r.Move(1)
return DelimToken, l.r.Shift()
}
////////////////////////////////////////////////////////////////
/*
The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
*/
func (l *Lexer) consumeByte(c byte) bool {
if l.r.Peek(0) == c {
l.r.Move(1)
return true
}
return false
}
func (l *Lexer) consumeComment() bool {
if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
return false
}
l.r.Move(2)
for {
c := l.r.Peek(0)
if c == 0 && l.Err() != nil {
break
} else if c == '*' && l.r.Peek(1) == '/' {
l.r.Move(2)
return true
}
l.r.Move(1)
}
return true
}
func (l *Lexer) consumeNewline() bool {
c := l.r.Peek(0)
if c == '\n' || c == '\f' {
l.r.Move(1)
return true
} else if c == '\r' {
if l.r.Peek(1) == '\n' {
l.r.Move(2)
} else {
l.r.Move(1)
}
return true
}
return false
}
func (l *Lexer) consumeWhitespace() bool {
c := l.r.Peek(0)
if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
l.r.Move(1)
return true
}
return false
}
func (l *Lexer) consumeDigit() bool {
c := l.r.Peek(0)
if c >= '0' && c <= '9' {
l.r.Move(1)
return true
}
return false
}
func (l *Lexer) consumeHexDigit() bool {
c := l.r.Peek(0)
if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
l.r.Move(1)
return true
}
return false
}
func (l *Lexer) consumeEscape() bool {
if l.r.Peek(0) != '\\' {
return false
}
mark := l.r.Pos()
l.r.Move(1)
if l.consumeNewline() {
l.r.Rewind(mark)
return false
} else if l.consumeHexDigit() {
for k := 1; k < 6; k++ {
if !l.consumeHexDigit() {
break
}
}
l.consumeWhitespace()
return true
} else {
c := l.r.Peek(0)
if c >= 0xC0 {
_, n := l.r.PeekRune(0)
l.r.Move(n)
return true
} else if c == 0 && l.r.Err() != nil {
return true
}
}
l.r.Move(1)
return true
}
func (l *Lexer) consumeIdentToken() bool {
mark := l.r.Pos()
if l.r.Peek(0) == '-' {
l.r.Move(1)
}
c := l.r.Peek(0)
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
if c != '\\' || !l.consumeEscape() {
l.r.Rewind(mark)
return false
}
} else {
l.r.Move(1)
}
for {
c := l.r.Peek(0)
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
if c != '\\' || !l.consumeEscape() {
break
}
} else {
l.r.Move(1)
}
}
return true
}
// support custom variables, https://www.w3.org/TR/css-variables-1/
func (l *Lexer) consumeCustomVariableToken() bool {
// expect to be on a '-'
l.r.Move(1)
if l.r.Peek(0) != '-' {
l.r.Move(-1)
return false
}
if !l.consumeIdentToken() {
l.r.Move(-1)
return false
}
return true
}
func (l *Lexer) consumeAtKeywordToken() bool {
// expect to be on an '@'
l.r.Move(1)
if !l.consumeIdentToken() {
l.r.Move(-1)
return false
}
return true
}
func (l *Lexer) consumeHashToken() bool {
// expect to be on a '#'
mark := l.r.Pos()
l.r.Move(1)
c := l.r.Peek(0)
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
if c != '\\' || !l.consumeEscape() {
l.r.Rewind(mark)
return false
}
} else {
l.r.Move(1)
}
for {
c := l.r.Peek(0)
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
if c != '\\' || !l.consumeEscape() {
break
}
} else {
l.r.Move(1)
}
}
return true
}
func (l *Lexer) consumeNumberToken() bool {
mark := l.r.Pos()
c := l.r.Peek(0)
if c == '+' || c == '-' {
l.r.Move(1)
}
firstDigit := l.consumeDigit()
if firstDigit {
for l.consumeDigit() {
}
}
if l.r.Peek(0) == '.' {
l.r.Move(1)
if l.consumeDigit() {
for l.consumeDigit() {
}
} else if firstDigit {
// . could belong to the next token
l.r.Move(-1)
return true
} else {
l.r.Rewind(mark)
return false
}
} else if !firstDigit {
l.r.Rewind(mark)
return false
}
mark = l.r.Pos()
c = l.r.Peek(0)
if c == 'e' || c == 'E' {
l.r.Move(1)
c = l.r.Peek(0)
if c == '+' || c == '-' {
l.r.Move(1)
}
if !l.consumeDigit() {
// e could belong to next token
l.r.Rewind(mark)
return true
}
for l.consumeDigit() {
}
}
return true
}
func (l *Lexer) consumeUnicodeRangeToken() bool {
c := l.r.Peek(0)
if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
return false
}
mark := l.r.Pos()
l.r.Move(2)
if l.consumeHexDigit() {
// consume up to 6 hexDigits
k := 1
for ; k < 6; k++ {
if !l.consumeHexDigit() {
break
}
}
// either a minus or a question mark or the end is expected
if l.consumeByte('-') {
// consume another up to 6 hexDigits
if l.consumeHexDigit() {
for k := 1; k < 6; k++ {
if !l.consumeHexDigit() {
break
}
}
} else {
l.r.Rewind(mark)
return false
}
} else {
// could be filled up to 6 characters with question marks or else regular hexDigits
if l.consumeByte('?') {
k++
for ; k < 6; k++ {
if !l.consumeByte('?') {
l.r.Rewind(mark)
return false
}
}
}
}
} else {
// consume 6 question marks
for k := 0; k < 6; k++ {
if !l.consumeByte('?') {
l.r.Rewind(mark)
return false
}
}
}
return true
}
func (l *Lexer) consumeColumnToken() bool {
if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
l.r.Move(2)
return true
}
return false
}
func (l *Lexer) consumeCDOToken() bool {
if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
l.r.Move(4)
return true
}
return false
}
func (l *Lexer) consumeCDCToken() bool {
if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
l.r.Move(3)
return true
}
return false
}
////////////////////////////////////////////////////////////////
// consumeMatch consumes any MatchToken.
func (l *Lexer) consumeMatch() TokenType {
if l.r.Peek(1) == '=' {
switch l.r.Peek(0) {
case '~':
l.r.Move(2)
return IncludeMatchToken
case '|':
l.r.Move(2)
return DashMatchToken
case '^':
l.r.Move(2)
return PrefixMatchToken
case '$':
l.r.Move(2)
return SuffixMatchToken
case '*':
l.r.Move(2)
return SubstringMatchToken
}
}
return ErrorToken
}
// consumeBracket consumes any bracket token.
func (l *Lexer) consumeBracket() TokenType {
switch l.r.Peek(0) {
case '(':
l.r.Move(1)
return LeftParenthesisToken
case ')':
l.r.Move(1)
return RightParenthesisToken
case '[':
l.r.Move(1)
return LeftBracketToken
case ']':
l.r.Move(1)
return RightBracketToken
case '{':
l.r.Move(1)
return LeftBraceToken
case '}':
l.r.Move(1)
return RightBraceToken
}
return ErrorToken
}
// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
func (l *Lexer) consumeNumeric() TokenType {
if l.consumeNumberToken() {
if l.consumeByte('%') {
return PercentageToken
} else if l.consumeIdentToken() {
return DimensionToken
}
return NumberToken
}
return ErrorToken
}
// consumeString consumes a string and may return BadStringToken when a newline is encountered.
func (l *Lexer) consumeString() TokenType {
// assume to be on " or '
delim := l.r.Peek(0)
l.r.Move(1)
for {
c := l.r.Peek(0)
if c == 0 && l.Err() != nil {
break
} else if c == '\n' || c == '\r' || c == '\f' {
l.r.Move(1)
return BadStringToken
} else if c == delim {
l.r.Move(1)
break
} else if c == '\\' {
if !l.consumeEscape() {
l.r.Move(1)
l.consumeNewline()
}
} else {
l.r.Move(1)
}
}
return StringToken
}
func (l *Lexer) consumeUnquotedURL() bool {
for {
c := l.r.Peek(0)
if c == 0 && l.Err() != nil || c == ')' {
break
} else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
if c != '\\' || !l.consumeEscape() {
return false
}
} else {
l.r.Move(1)
}
}
return true
}
// consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue.
func (l *Lexer) consumeRemnantsBadURL() {
for {
if l.consumeByte(')') || l.Err() != nil {
break
} else if !l.consumeEscape() {
l.r.Move(1)
}
}
}
// consumeIdentlike consumes IdentToken, FunctionToken or UrlToken.
func (l *Lexer) consumeIdentlike() TokenType {
if l.consumeIdentToken() {
if l.r.Peek(0) != '(' {
return IdentToken
} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
l.r.Move(1)
return FunctionToken
}
l.r.Move(1)
// consume url
for l.consumeWhitespace() {
}
if c := l.r.Peek(0); c == '"' || c == '\'' {
if l.consumeString() == BadStringToken {
l.consumeRemnantsBadURL()
return BadURLToken
}
} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() {
l.consumeRemnantsBadURL()
return BadURLToken
}
for l.consumeWhitespace() {
}
if !l.consumeByte(')') && l.Err() != io.EOF {
l.consumeRemnantsBadURL()
return BadURLToken
}
return URLToken
}
return ErrorToken
}