// Package json is a JSON parser following the specifications at http://json.org/.
|
||
|
package json // import "github.com/tdewolff/parse/json"
|
||
|
|
||
|
import (
|
||
|
"io"
|
||
|
"strconv"
|
||
|
|
||
|
"github.com/tdewolff/parse"
|
||
|
"github.com/tdewolff/parse/buffer"
|
||
|
)
|
||
|
|
||
|
// GrammarType identifies the kind of grammar element returned by Parser.Next.
type GrammarType uint32

// GrammarType values.
const (
	// ErrorGrammar is returned when an error occurs; it is also the zero value.
	ErrorGrammar GrammarType = iota
	// WhitespaceGrammar denotes whitespace.
	WhitespaceGrammar
	// LiteralGrammar denotes one of the literals true, false or null.
	LiteralGrammar
	// NumberGrammar denotes a number.
	NumberGrammar
	// StringGrammar denotes a quoted string.
	StringGrammar
	// StartObjectGrammar denotes the opening brace of an object: {
	StartObjectGrammar
	// EndObjectGrammar denotes the closing brace of an object: }
	EndObjectGrammar
	// StartArrayGrammar denotes the opening bracket of an array: [
	StartArrayGrammar
	// EndArrayGrammar denotes the closing bracket of an array: ]
	EndArrayGrammar
)

// String returns the string representation of a GrammarType. Values outside
// the declared constants are rendered as "Invalid(n)" with n in decimal.
func (gt GrammarType) String() string {
	// Indexed by constant so the table stays aligned with the const block.
	names := [...]string{
		ErrorGrammar:       "Error",
		WhitespaceGrammar:  "Whitespace",
		LiteralGrammar:     "Literal",
		NumberGrammar:      "Number",
		StringGrammar:      "String",
		StartObjectGrammar: "StartObject",
		EndObjectGrammar:   "EndObject",
		StartArrayGrammar:  "StartArray",
		EndArrayGrammar:    "EndArray",
	}
	if gt < GrammarType(len(names)) {
		return names[gt]
	}
	return "Invalid(" + strconv.Itoa(int(gt)) + ")"
}
|
||
|
|
||
|
////////////////////////////////////////////////////////////////
|
||
|
|
||
|
// State determines the current state the parser is in, i.e. which kind of
// token is expected next.
type State uint32

// State values.
const (
	// ValueState expects a value; it is the state a new Parser starts in.
	ValueState State = iota
	// ObjectKeyState is inside an object, expecting a key or the closing brace.
	ObjectKeyState
	// ObjectValueState is inside an object after a key and colon, expecting the value.
	ObjectValueState
	// ArrayState is inside an array, expecting a value or the closing bracket.
	ArrayState
)

// String returns the string representation of a State. Values outside the
// declared constants are rendered as "Invalid(n)" with n in decimal.
func (state State) String() string {
	switch state {
	case ValueState:
		return "Value"
	case ObjectKeyState:
		return "ObjectKey"
	case ObjectValueState:
		return "ObjectValue"
	case ArrayState:
		return "Array"
	}
	return "Invalid(" + strconv.Itoa(int(state)) + ")"
}
|
||
|
|
||
|
////////////////////////////////////////////////////////////////
|
||
|
|
||
|
// Parser is the state for the lexer.
|
||
|
type Parser struct {
|
||
|
r *buffer.Lexer
|
||
|
state []State
|
||
|
err error
|
||
|
|
||
|
needComma bool
|
||
|
}
|
||
|
|
||
|
// NewParser returns a new Parser for a given io.Reader.
|
||
|
func NewParser(r io.Reader) *Parser {
|
||
|
return &Parser{
|
||
|
r: buffer.NewLexer(r),
|
||
|
state: []State{ValueState},
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Err returns the error encountered during tokenization, this is often io.EOF but also other errors can be returned.
|
||
|
func (p *Parser) Err() error {
|
||
|
if err := p.r.Err(); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
return p.err
|
||
|
}
|
||
|
|
||
|
// Restore restores the NULL byte at the end of the buffer.
|
||
|
func (p *Parser) Restore() {
|
||
|
p.r.Restore()
|
||
|
}
|
||
|
|
||
|
// Next returns the next Grammar. It returns ErrorGrammar when an error was encountered. Using Err() one can retrieve the error message.
|
||
|
func (p *Parser) Next() (GrammarType, []byte) {
|
||
|
p.moveWhitespace()
|
||
|
c := p.r.Peek(0)
|
||
|
state := p.state[len(p.state)-1]
|
||
|
if c == ',' {
|
||
|
if state != ArrayState && state != ObjectKeyState {
|
||
|
p.err = parse.NewErrorLexer("unexpected comma character outside an array or object", p.r)
|
||
|
return ErrorGrammar, nil
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
p.moveWhitespace()
|
||
|
p.needComma = false
|
||
|
c = p.r.Peek(0)
|
||
|
}
|
||
|
p.r.Skip()
|
||
|
|
||
|
if p.needComma && c != '}' && c != ']' && c != 0 {
|
||
|
p.err = parse.NewErrorLexer("expected comma character or an array or object ending", p.r)
|
||
|
return ErrorGrammar, nil
|
||
|
} else if c == '{' {
|
||
|
p.state = append(p.state, ObjectKeyState)
|
||
|
p.r.Move(1)
|
||
|
return StartObjectGrammar, p.r.Shift()
|
||
|
} else if c == '}' {
|
||
|
if state != ObjectKeyState {
|
||
|
p.err = parse.NewErrorLexer("unexpected right brace character", p.r)
|
||
|
return ErrorGrammar, nil
|
||
|
}
|
||
|
p.needComma = true
|
||
|
p.state = p.state[:len(p.state)-1]
|
||
|
if p.state[len(p.state)-1] == ObjectValueState {
|
||
|
p.state[len(p.state)-1] = ObjectKeyState
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
return EndObjectGrammar, p.r.Shift()
|
||
|
} else if c == '[' {
|
||
|
p.state = append(p.state, ArrayState)
|
||
|
p.r.Move(1)
|
||
|
return StartArrayGrammar, p.r.Shift()
|
||
|
} else if c == ']' {
|
||
|
p.needComma = true
|
||
|
if state != ArrayState {
|
||
|
p.err = parse.NewErrorLexer("unexpected right bracket character", p.r)
|
||
|
return ErrorGrammar, nil
|
||
|
}
|
||
|
p.state = p.state[:len(p.state)-1]
|
||
|
if p.state[len(p.state)-1] == ObjectValueState {
|
||
|
p.state[len(p.state)-1] = ObjectKeyState
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
return EndArrayGrammar, p.r.Shift()
|
||
|
} else if state == ObjectKeyState {
|
||
|
if c != '"' || !p.consumeStringToken() {
|
||
|
p.err = parse.NewErrorLexer("expected object key to be a quoted string", p.r)
|
||
|
return ErrorGrammar, nil
|
||
|
}
|
||
|
n := p.r.Pos()
|
||
|
p.moveWhitespace()
|
||
|
if c := p.r.Peek(0); c != ':' {
|
||
|
p.err = parse.NewErrorLexer("expected colon character after object key", p.r)
|
||
|
return ErrorGrammar, nil
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
p.state[len(p.state)-1] = ObjectValueState
|
||
|
return StringGrammar, p.r.Shift()[:n]
|
||
|
} else {
|
||
|
p.needComma = true
|
||
|
if state == ObjectValueState {
|
||
|
p.state[len(p.state)-1] = ObjectKeyState
|
||
|
}
|
||
|
if c == '"' && p.consumeStringToken() {
|
||
|
return StringGrammar, p.r.Shift()
|
||
|
} else if p.consumeNumberToken() {
|
||
|
return NumberGrammar, p.r.Shift()
|
||
|
} else if p.consumeLiteralToken() {
|
||
|
return LiteralGrammar, p.r.Shift()
|
||
|
}
|
||
|
}
|
||
|
return ErrorGrammar, nil
|
||
|
}
|
||
|
|
||
|
// State returns the state the parser is currently in (ie. which token is expected).
|
||
|
func (p *Parser) State() State {
|
||
|
return p.state[len(p.state)-1]
|
||
|
}
|
||
|
|
||
|
////////////////////////////////////////////////////////////////
|
||
|
|
||
|
/*
The following functions follow the specifications at http://json.org/
*/
|
||
|
|
||
|
func (p *Parser) moveWhitespace() {
|
||
|
for {
|
||
|
if c := p.r.Peek(0); c != ' ' && c != '\n' && c != '\r' && c != '\t' {
|
||
|
break
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (p *Parser) consumeLiteralToken() bool {
|
||
|
c := p.r.Peek(0)
|
||
|
if c == 't' && p.r.Peek(1) == 'r' && p.r.Peek(2) == 'u' && p.r.Peek(3) == 'e' {
|
||
|
p.r.Move(4)
|
||
|
return true
|
||
|
} else if c == 'f' && p.r.Peek(1) == 'a' && p.r.Peek(2) == 'l' && p.r.Peek(3) == 's' && p.r.Peek(4) == 'e' {
|
||
|
p.r.Move(5)
|
||
|
return true
|
||
|
} else if c == 'n' && p.r.Peek(1) == 'u' && p.r.Peek(2) == 'l' && p.r.Peek(3) == 'l' {
|
||
|
p.r.Move(4)
|
||
|
return true
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (p *Parser) consumeNumberToken() bool {
|
||
|
mark := p.r.Pos()
|
||
|
if p.r.Peek(0) == '-' {
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
c := p.r.Peek(0)
|
||
|
if c >= '1' && c <= '9' {
|
||
|
p.r.Move(1)
|
||
|
for {
|
||
|
if c := p.r.Peek(0); c < '0' || c > '9' {
|
||
|
break
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
} else if c != '0' {
|
||
|
p.r.Rewind(mark)
|
||
|
return false
|
||
|
} else {
|
||
|
p.r.Move(1) // 0
|
||
|
}
|
||
|
if c := p.r.Peek(0); c == '.' {
|
||
|
p.r.Move(1)
|
||
|
if c := p.r.Peek(0); c < '0' || c > '9' {
|
||
|
p.r.Move(-1)
|
||
|
return true
|
||
|
}
|
||
|
for {
|
||
|
if c := p.r.Peek(0); c < '0' || c > '9' {
|
||
|
break
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
}
|
||
|
mark = p.r.Pos()
|
||
|
if c := p.r.Peek(0); c == 'e' || c == 'E' {
|
||
|
p.r.Move(1)
|
||
|
if c := p.r.Peek(0); c == '+' || c == '-' {
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
if c := p.r.Peek(0); c < '0' || c > '9' {
|
||
|
p.r.Rewind(mark)
|
||
|
return true
|
||
|
}
|
||
|
for {
|
||
|
if c := p.r.Peek(0); c < '0' || c > '9' {
|
||
|
break
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func (p *Parser) consumeStringToken() bool {
|
||
|
// assume to be on "
|
||
|
p.r.Move(1)
|
||
|
for {
|
||
|
c := p.r.Peek(0)
|
||
|
if c == '"' {
|
||
|
escaped := false
|
||
|
for i := p.r.Pos() - 1; i >= 0; i-- {
|
||
|
if p.r.Lexeme()[i] == '\\' {
|
||
|
escaped = !escaped
|
||
|
} else {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
if !escaped {
|
||
|
p.r.Move(1)
|
||
|
break
|
||
|
}
|
||
|
} else if c == 0 {
|
||
|
return false
|
||
|
}
|
||
|
p.r.Move(1)
|
||
|
}
|
||
|
return true
|
||
|
}
|