miniflux/vendor/github.com/tdewolff/parse/js/lex.go

// Package js is an ECMAScript5.1 lexer following the specifications at http://www.ecma-international.org/ecma-262/5.1/.
package js // import "github.com/tdewolff/parse/js"

import (
	"io"
	"strconv"
	"unicode"

	"github.com/tdewolff/parse/buffer"
)

// Unicode range tables used to classify non-ASCII identifier characters.
var (
	// identifierStart holds the categories a rune must belong to in order to
	// begin an identifier.
	identifierStart = []*unicode.RangeTable{
		unicode.Lu,
		unicode.Ll,
		unicode.Lt,
		unicode.Lm,
		unicode.Lo,
		unicode.Nl,
		unicode.Other_ID_Start,
	}

	// identifierContinue holds the categories a rune must belong to in order
	// to appear after the first character of an identifier.
	identifierContinue = []*unicode.RangeTable{
		unicode.Lu,
		unicode.Ll,
		unicode.Lt,
		unicode.Lm,
		unicode.Lo,
		unicode.Nl,
		unicode.Mn,
		unicode.Mc,
		unicode.Nd,
		unicode.Pc,
		unicode.Other_ID_Continue,
	}
)

////////////////////////////////////////////////////////////////

// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken          TokenType = iota // extra token when errors occur
	UnknownToken                         // extra token when no token can be matched
	WhitespaceToken                      // space \t \v \f
	LineTerminatorToken                  // \r \n \r\n
	// CommentToken covers // and /* */ comments as well as the HTML-style
	// <!-- and --> single line comments.
	CommentToken
	// IdentifierToken covers identifiers and keywords alike.
	IdentifierToken
	PunctuatorToken /* { } ( ) [ ] . ; , < > <= >= == != === !==  + - * % ++ -- << >>
	   >>> & | ^ ! ~ && || ? : = += -= *= %= <<= >>= >>>= &= |= ^= / /= => */
	// NumericToken covers decimal literals and 0x, 0b and 0o prefixed literals.
	NumericToken
	// StringToken covers single and double quoted string literals.
	StringToken
	// RegexpToken covers a regular expression literal including its flags.
	RegexpToken
	// TemplateToken covers one piece of a template literal: it starts at a
	// backtick or at the } closing a substitution, and ends at a backtick or ${.
	TemplateToken
)

// TokenState determines a state in which next token should be read.
// It records what kind of token was read last, which the lexer uses to decide
// between a regular expression literal and a division punctuator.
type TokenState uint32

// TokenState values
const (
	// ExprState is set after most punctuators and keywords; a '/' read in
	// this state may start a regular expression literal.
	ExprState TokenState = iota
	// StmtParensState is set after the keywords if, while, for and with; a
	// '(' read in this state opens a StmtParensContext.
	StmtParensState
	// SubscriptState is set after value-like tokens (numbers, strings,
	// regexps, plain identifiers, ')' and ']'); a '/' read in this state is
	// a punctuator.
	SubscriptState
	// PropNameState is set after a '.' punctuator; the identifier that
	// follows is not checked against the keyword list.
	PropNameState
)

// ParsingContext determines the context in which following token should be parsed.
// This affects parsing regular expressions and template literals.
// Contexts are kept on the lexer's stack: opening tokens push one and the
// matching closing tokens pop it again.
type ParsingContext uint32

// ParsingContext values
const (
	// GlobalContext is reported when the context stack is empty.
	GlobalContext ParsingContext = iota
	// StmtParensContext is pushed by a '(' read in StmtParensState; its
	// closing ')' puts the lexer back into ExprState.
	StmtParensContext
	// ExprParensContext is pushed by any other '('; its closing ')' puts the
	// lexer into SubscriptState.
	ExprParensContext
	// BracesContext is pushed by '{'.
	BracesContext
	// TemplateContext is pushed by a ${ inside a template literal; the '}'
	// that pops it resumes lexing of the template.
	TemplateContext
)

// String returns the human readable name of a TokenType. Values outside the
// declared constants are rendered as "Invalid(n)" with n the numeric value.
func (tt TokenType) String() string {
	// Name table indexed by token type; the constants are contiguous from zero.
	names := [...]string{
		ErrorToken:          "Error",
		UnknownToken:        "Unknown",
		WhitespaceToken:     "Whitespace",
		LineTerminatorToken: "LineTerminator",
		CommentToken:        "Comment",
		IdentifierToken:     "Identifier",
		PunctuatorToken:     "Punctuator",
		NumericToken:        "Numeric",
		StringToken:         "String",
		RegexpToken:         "Regexp",
		TemplateToken:       "Template",
	}
	if tt < TokenType(len(names)) {
		return names[tt]
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	// r is the buffered input the tokens are read from.
	r         *buffer.Lexer
	// stack holds the currently open parsing contexts, innermost last.
	stack     []ParsingContext
	// state is the state in which the next token is read.
	state     TokenState
	// emptyLine is true while the current line has produced nothing but
	// whitespace and comments; only then is --> read as a comment.
	emptyLine bool
}

// NewLexer returns a new Lexer for a given io.Reader. The lexer starts in
// ExprState with an empty context stack, positioned on an empty line.
func NewLexer(r io.Reader) *Lexer {
	l := new(Lexer)
	l.r = buffer.NewLexer(r)
	l.stack = make([]ParsingContext, 0, 16)
	l.state = ExprState
	l.emptyLine = true
	return l
}

// enterContext pushes context onto the stack of open parsing contexts.
func (l *Lexer) enterContext(context ParsingContext) {
	l.stack = append(l.stack, context)
}

// leaveContext pops the innermost parsing context off the stack and returns
// it. When no context is open it returns GlobalContext and leaves the stack
// untouched.
func (l *Lexer) leaveContext() ParsingContext {
	depth := len(l.stack)
	if depth == 0 {
		return GlobalContext
	}
	top := l.stack[depth-1]
	l.stack = l.stack[:depth-1]
	return top
}

// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
// It reports whatever the underlying buffer lexer reports.
func (l *Lexer) Err() error {
	return l.r.Err()
}

// Restore restores the NULL byte at the end of the buffer.
// It delegates to the underlying buffer lexer.
func (l *Lexer) Restore() {
	l.r.Restore()
}

// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
//
// The returned bytes are the token's text as shifted out of the underlying
// buffer; for ErrorToken they are nil. When nothing matches, a single rune is
// consumed and returned as UnknownToken. Besides producing the token, Next
// updates the lexer state and context stack so that a following '/', '(' or
// '}' is interpreted correctly.
func (l *Lexer) Next() (TokenType, []byte) {
	tt := UnknownToken
	c := l.r.Peek(0)
	switch c {
	case '(':
		// remember whether these parentheses belong to if/while/for/with
		if l.state == StmtParensState {
			l.enterContext(StmtParensContext)
		} else {
			l.enterContext(ExprParensContext)
		}
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case ')':
		// after a statement's parentheses an expression follows, after any
		// other parentheses a value has just been completed
		if l.leaveContext() == StmtParensContext {
			l.state = ExprState
		} else {
			l.state = SubscriptState
		}
		l.r.Move(1)
		tt = PunctuatorToken
	case '{':
		l.enterContext(BracesContext)
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case '}':
		// a brace closing a template substitution resumes the template
		if l.leaveContext() == TemplateContext && l.consumeTemplateToken() {
			tt = TemplateToken
		} else {
			// will work incorrectly for objects or functions divided by something,
			// but that's an extremely rare case
			l.state = ExprState
			l.r.Move(1)
			tt = PunctuatorToken
		}
	case ']':
		l.state = SubscriptState
		l.r.Move(1)
		tt = PunctuatorToken
	case '[', ';', ',', '~', '?', ':':
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^':
		// '<' may open an HTML-style comment; '-' may open its closing form,
		// but only while the line holds no meaningful tokens yet
		if (c == '<' || (l.emptyLine && c == '-')) && l.consumeCommentToken() {
			return CommentToken, l.r.Shift()
		} else if l.consumeLongPunctuatorToken() {
			l.state = ExprState
			tt = PunctuatorToken
		}
	case '/':
		// comment first, then a regexp if an expression is expected,
		// otherwise a division punctuator
		if l.consumeCommentToken() {
			return CommentToken, l.r.Shift()
		} else if l.state == ExprState && l.consumeRegexpToken() {
			l.state = SubscriptState
			tt = RegexpToken
		} else if l.consumeLongPunctuatorToken() {
			l.state = ExprState
			tt = PunctuatorToken
		}
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
		if l.consumeNumericToken() {
			tt = NumericToken
			l.state = SubscriptState
		} else if c == '.' {
			// a lone dot is a member access; a property name follows
			l.state = PropNameState
			l.r.Move(1)
			tt = PunctuatorToken
		}
	case '\'', '"':
		if l.consumeStringToken() {
			l.state = SubscriptState
			tt = StringToken
		}
	case ' ', '\t', '\v', '\f':
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		// early return: whitespace leaves emptyLine as it is
		return WhitespaceToken, l.r.Shift()
	case '\n', '\r':
		l.r.Move(1)
		for l.consumeLineTerminator() {
		}
		tt = LineTerminatorToken
	case '`':
		if l.consumeTemplateToken() {
			tt = TemplateToken
		}
	default:
		if l.consumeIdentifierToken() {
			tt = IdentifierToken
			if l.state != PropNameState {
				switch hash := ToHash(l.r.Lexeme()); hash {
				case 0, This, False, True, Null:
					l.state = SubscriptState
				case If, While, For, With:
					l.state = StmtParensState
				default:
					// This will include keywords that can't be followed by a regexp, but only
					// by a specified char (like `switch` or `try`), but we don't check for syntax
					// errors as we don't attempt to parse a full JS grammar when streaming
					l.state = ExprState
				}
			} else {
				// a property name is a value, even when spelled like a keyword
				l.state = SubscriptState
			}
		} else if c >= 0xC0 {
			// non-ASCII whitespace or line terminator
			if l.consumeWhitespace() {
				for l.consumeWhitespace() {
				}
				return WhitespaceToken, l.r.Shift()
			} else if l.consumeLineTerminator() {
				for l.consumeLineTerminator() {
				}
				tt = LineTerminatorToken
			}
		} else if l.Err() != nil {
			return ErrorToken, nil
		}
	}

	// any token reaching this point other than a line terminator makes the
	// current line non-empty
	l.emptyLine = tt == LineTerminatorToken

	if tt == UnknownToken {
		// nothing matched: skip one rune so that lexing always advances
		_, n := l.r.PeekRune(0)
		l.r.Move(n)
	}
	return tt, l.r.Shift()
}

////////////////////////////////////////////////////////////////

/*
The following functions follow the specifications at http://www.ecma-international.org/ecma-262/5.1/
*/

// consumeWhitespace consumes a single whitespace character and reports
// whether it did so. Besides the ASCII space, tab, vertical tab and form feed
// it accepts U+00A0, U+FEFF and any rune in the Unicode category Zs.
func (l *Lexer) consumeWhitespace() bool {
	switch c := l.r.Peek(0); {
	case c == ' ', c == '\t', c == '\v', c == '\f':
		l.r.Move(1)
		return true
	case c >= 0xC0:
		// lead byte of a multi-byte sequence: decode the whole rune
		r, size := l.r.PeekRune(0)
		if r == '\u00A0' || r == '\uFEFF' || unicode.Is(unicode.Zs, r) {
			l.r.Move(size)
			return true
		}
	}
	return false
}

// consumeLineTerminator consumes a single line terminator and reports whether
// it did so. It accepts \n, \r, the pair \r\n (consumed as one terminator),
// U+2028 and U+2029.
func (l *Lexer) consumeLineTerminator() bool {
	switch c := l.r.Peek(0); {
	case c == '\n':
		l.r.Move(1)
	case c == '\r':
		width := 1
		if l.r.Peek(1) == '\n' {
			width = 2
		}
		l.r.Move(width)
	case c >= 0xC0:
		r, size := l.r.PeekRune(0)
		if r != '\u2028' && r != '\u2029' {
			return false
		}
		l.r.Move(size)
	default:
		return false
	}
	return true
}

// consumeDigit consumes one decimal digit (0-9) and reports whether it did so.
func (l *Lexer) consumeDigit() bool {
	c := l.r.Peek(0)
	if c < '0' || c > '9' {
		return false
	}
	l.r.Move(1)
	return true
}

// consumeHexDigit consumes one hexadecimal digit (0-9, a-f, A-F) and reports
// whether it did so.
func (l *Lexer) consumeHexDigit() bool {
	switch c := l.r.Peek(0); {
	case '0' <= c && c <= '9', 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
		l.r.Move(1)
		return true
	}
	return false
}

// consumeBinaryDigit consumes one binary digit (0 or 1) and reports whether it
// did so.
func (l *Lexer) consumeBinaryDigit() bool {
	switch l.r.Peek(0) {
	case '0', '1':
		l.r.Move(1)
		return true
	}
	return false
}

// consumeOctalDigit consumes one octal digit (0-7) and reports whether it did
// so.
func (l *Lexer) consumeOctalDigit() bool {
	c := l.r.Peek(0)
	if c < '0' || c > '7' {
		return false
	}
	l.r.Move(1)
	return true
}

// consumeUnicodeEscape consumes a Unicode escape sequence and reports whether
// it did so. Two forms are accepted: \u followed by exactly four hex digits,
// and \u{...} with one or more hex digits between the braces. On failure the
// position is restored and nothing is consumed.
func (l *Lexer) consumeUnicodeEscape() bool {
	if l.r.Peek(0) != '\\' || l.r.Peek(1) != 'u' {
		return false
	}
	start := l.r.Pos()
	l.r.Move(2)

	ok := false
	if l.r.Peek(0) == '{' {
		// braced form: at least one hex digit, then the closing brace
		l.r.Move(1)
		digits := 0
		for l.consumeHexDigit() {
			digits++
		}
		if digits > 0 && l.r.Peek(0) == '}' {
			l.r.Move(1)
			ok = true
		}
	} else {
		// fixed form: exactly four hex digits
		ok = true
		for i := 0; i < 4; i++ {
			if !l.consumeHexDigit() {
				ok = false
				break
			}
		}
	}
	if !ok {
		l.r.Rewind(start)
	}
	return ok
}

// consumeSingleLineComment advances to the end of the current line. It stops
// in front of \r, \n, U+2028, U+2029 or a NUL byte, without consuming it.
func (l *Lexer) consumeSingleLineComment() {
	for {
		switch c := l.r.Peek(0); {
		case c == '\r', c == '\n', c == 0:
			return
		case c >= 0xC0:
			if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
				return
			}
		}
		l.r.Move(1)
	}
}

////////////////////////////////////////////////////////////////

// consumeCommentToken consumes a comment at the current position and reports
// whether it did so. It recognizes // and /* */ comments and the HTML-style
// <!-- and --> single line comments. A multi line comment that is not closed
// before a NUL byte is consumed up to that byte and still counts as a comment.
// Every line terminator inside a multi line comment sets emptyLine.
func (l *Lexer) consumeCommentToken() bool {
	switch l.r.Peek(0) {
	case '/':
		switch l.r.Peek(1) {
		case '/':
			// single line
			l.r.Move(2)
			l.consumeSingleLineComment()
			return true
		case '*':
			// multi line
			l.r.Move(2)
			for {
				c := l.r.Peek(0)
				switch {
				case c == '*' && l.r.Peek(1) == '/':
					l.r.Move(2)
					return true
				case c == 0:
					return true
				case l.consumeLineTerminator():
					l.emptyLine = true
				default:
					l.r.Move(1)
				}
			}
		}
		return false
	case '<':
		// opening HTML-style single line comment
		if l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
			l.r.Move(4)
			l.consumeSingleLineComment()
			return true
		}
	case '-':
		// closing HTML-style single line comment
		// (only if current line didn't contain any meaningful tokens)
		if l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
			l.r.Move(3)
			l.consumeSingleLineComment()
			return true
		}
	}
	return false
}

// consumeLongPunctuatorToken consumes the longest punctuator that starts at
// the current position and always returns true. The first byte is consumed
// unconditionally; the caller must make sure it is one of
// < > = ! + - * / % & | ^.
func (l *Lexer) consumeLongPunctuatorToken() bool {
	first := l.r.Peek(0)
	l.r.Move(1)
	second := l.r.Peek(0)
	switch first {
	case '!', '=', '+', '-', '*', '/', '%', '&', '|', '^':
		switch {
		case second == '=':
			// != == += -= *= /= %= &= |= ^=, possibly extended to !== or ===
			l.r.Move(1)
			if (first == '!' || first == '=') && l.r.Peek(0) == '=' {
				l.r.Move(1)
			}
		case second == first && (first == '+' || first == '-' || first == '&' || first == '|'):
			// ++ -- && ||
			l.r.Move(1)
		case first == '=' && second == '>':
			// =>
			l.r.Move(1)
		}
	default: // first == '<' || first == '>'
		if second == first {
			// << >>, possibly extended to >>>
			l.r.Move(1)
			if first == '>' && l.r.Peek(0) == '>' {
				l.r.Move(1)
			}
		}
		// optional assignment suffix: <= >= <<= >>= >>>=
		if l.r.Peek(0) == '=' {
			l.r.Move(1)
		}
	}
	return true
}

// consumeIdentifierToken consumes an identifier (or keyword) and reports
// whether it did so. Nothing is consumed when it returns false.
//
// The first character must be an ASCII letter, $, _, a rune from
// identifierStart, or a Unicode escape sequence. Following characters may in
// addition be ASCII digits, U+200C, U+200D, or runes from identifierContinue.
// Unicode escape sequences are accepted at any position, not only at the
// start, so that a name such as a\u0062 is read as one identifier.
func (l *Lexer) consumeIdentifierToken() bool {
	c := l.r.Peek(0)
	if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '$' || c == '_' {
		l.r.Move(1)
	} else if c >= 0xC0 {
		if r, n := l.r.PeekRune(0); unicode.IsOneOf(identifierStart, r) {
			l.r.Move(n)
		} else {
			return false
		}
	} else if !l.consumeUnicodeEscape() {
		return false
	}
	for {
		c := l.r.Peek(0)
		if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' {
			l.r.Move(1)
		} else if c >= 0xC0 {
			if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) {
				l.r.Move(n)
			} else {
				break
			}
		} else if !l.consumeUnicodeEscape() {
			// neither an identifier character nor an escape sequence
			break
		}
	}
	return true
}

// consumeNumericToken consumes a numeric literal and reports whether it did
// so. The caller must make sure the current byte is a decimal digit or a dot.
//
// Recognized are 0x, 0b and 0o prefixed integers (either letter case) and
// decimal literals with optional fraction and exponent. A prefix, dot or
// exponent marker that is not followed by a digit is left for the next token.
// The only failing input is a dot that is not followed by a digit, in which
// case nothing is consumed.
func (l *Lexer) consumeNumericToken() bool {
	// assume to be on 0 1 2 3 4 5 6 7 8 9 .
	start := l.r.Pos()
	first := l.r.Peek(0)
	switch first {
	case '0':
		l.r.Move(1)
		switch l.r.Peek(0) {
		case 'x', 'X':
			l.r.Move(1)
			if !l.consumeHexDigit() {
				l.r.Move(-1) // return just the zero
				return true
			}
			for l.consumeHexDigit() {
			}
			return true
		case 'b', 'B':
			l.r.Move(1)
			if !l.consumeBinaryDigit() {
				l.r.Move(-1) // return just the zero
				return true
			}
			for l.consumeBinaryDigit() {
			}
			return true
		case 'o', 'O':
			l.r.Move(1)
			if !l.consumeOctalDigit() {
				l.r.Move(-1) // return just the zero
				return true
			}
			for l.consumeOctalDigit() {
			}
			return true
		}
	case '.':
		// no integer part; the fraction is handled below
	default:
		for l.consumeDigit() {
		}
	}

	// fraction
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if !l.consumeDigit() {
			if first == '.' {
				// a lone dot is not a number
				l.r.Rewind(start)
				return false
			}
			// . could belong to the next token
			l.r.Move(-1)
			return true
		}
		for l.consumeDigit() {
		}
	}

	// exponent
	beforeExp := l.r.Pos()
	if e := l.r.Peek(0); e == 'e' || e == 'E' {
		l.r.Move(1)
		if sign := l.r.Peek(0); sign == '+' || sign == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to the next token
			l.r.Rewind(beforeExp)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}

// consumeStringToken consumes a quoted string literal and reports whether it
// did so. The caller must make sure the current byte is ' or ".
//
// A backslash may escape the quote, another backslash, or a line terminator
// (line continuation). An unescaped line terminator makes the literal invalid:
// the position is restored and false is returned. A NUL byte ends the token
// early and the unterminated string is still accepted.
func (l *Lexer) consumeStringToken() bool {
	// assume to be on ' or "
	start := l.r.Pos()
	quote := l.r.Peek(0)
	l.r.Move(1)
	for {
		switch c := l.r.Peek(0); {
		case c == quote:
			l.r.Move(1)
			return true
		case c == 0:
			return true
		case c == '\\':
			l.r.Move(1)
			if !l.consumeLineTerminator() {
				if esc := l.r.Peek(0); esc == quote || esc == '\\' {
					l.r.Move(1)
				}
			}
			continue
		case c == '\n', c == '\r':
			l.r.Rewind(start)
			return false
		case c >= 0xC0:
			if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
				l.r.Rewind(start)
				return false
			}
		}
		l.r.Move(1)
	}
}

// consumeRegexpToken consumes a regular expression literal including its
// flags and reports whether it did so. The caller must make sure the current
// byte is / and that it does not start a comment.
//
// A / inside a character class ([...]) does not end the literal, and a
// backslash escapes the byte that follows it. A line terminator, escaped or
// not, makes the literal invalid: the position is restored and false is
// returned. A NUL byte ends the token early and the unterminated literal is
// still accepted, without flags.
func (l *Lexer) consumeRegexpToken() bool {
	// assume to be on / and not /*
	mark := l.r.Pos()
	l.r.Move(1)
	inClass := false
	for {
		c := l.r.Peek(0)
		if !inClass && c == '/' {
			l.r.Move(1)
			break
		} else if c == '[' {
			inClass = true
		} else if c == ']' {
			inClass = false
		} else if c == '\\' {
			l.r.Move(1)
			if l.consumeLineTerminator() {
				l.r.Rewind(mark)
				return false
			}
			if l.r.Peek(0) == 0 {
				// Never step over a NUL: it marks the end of the input, and
				// skipping it as an escaped byte would move the position
				// beyond the buffer. Let the NUL branch below end the token.
				continue
			}
		} else if l.consumeLineTerminator() {
			l.r.Rewind(mark)
			return false
		} else if c == 0 {
			return true
		}
		l.r.Move(1)
	}
	// flags
	for {
		c := l.r.Peek(0)
		if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' {
			l.r.Move(1)
		} else if c >= 0xC0 {
			if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) {
				l.r.Move(n)
			} else {
				break
			}
		} else {
			break
		}
	}
	return true
}

// consumeTemplateToken consumes one piece of a template literal and reports
// whether it did so. The caller must make sure the current byte is a backtick
// or, when resuming after a substitution, the closing }.
//
// The piece ends at the closing backtick, which sets the state to
// SubscriptState, or at ${, which pushes a TemplateContext and sets the state
// to ExprState so that the substitution is lexed as an expression. A backslash
// escapes the byte that follows it, so an escaped backtick or \${ does not end
// the piece. When a NUL byte is reached first, the position is restored and
// false is returned.
func (l *Lexer) consumeTemplateToken() bool {
	// assume to be on ` or } when already within template
	mark := l.r.Pos()
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == '`' {
			l.state = SubscriptState
			l.r.Move(1)
			return true
		} else if c == '$' && l.r.Peek(1) == '{' {
			l.enterContext(TemplateContext)
			l.state = ExprState
			l.r.Move(2)
			return true
		} else if c == '\\' {
			// skip the escaped byte, but never a NUL: that is left for the
			// branch below to reject the unterminated template
			l.r.Move(1)
			if l.r.Peek(0) != 0 {
				l.r.Move(1)
			}
			continue
		} else if c == 0 {
			l.r.Rewind(mark)
			return false
		}
		l.r.Move(1)
	}
}