miniflux/vendor/golang.org/x/text/internal/colltab/table.go

// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package colltab

import (
	"unicode/utf8"

	"golang.org/x/text/unicode/norm"
)

// Table holds all collation data for a given collation ordering.
type Table struct {
	Index Trie // main trie

	// expansion info
	ExpandElem []uint32

	// contraction info
	ContractTries  ContractTrieSet
	ContractElem   []uint32
	MaxContractLen int
	VariableTop    uint32
}

func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
	return t.appendNext(w, source{bytes: b})
}

func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
	return t.appendNext(w, source{str: s})
}

func (t *Table) Start(p int, b []byte) int {
	// TODO: implement
	panic("not implemented")
}

func (t *Table) StartString(p int, s string) int {
	// TODO: implement
	panic("not implemented")
}

func (t *Table) Domain() []string {
	// TODO: implement
	panic("not implemented")
}

func (t *Table) Top() uint32 {
	return t.VariableTop
}

type source struct {
	str   string
	bytes []byte
}

func (src *source) lookup(t *Table) (ce Elem, sz int) {
	if src.bytes == nil {
		return t.Index.lookupString(src.str)
	}
	return t.Index.lookup(src.bytes)
}

func (src *source) tail(sz int) {
	if src.bytes == nil {
		src.str = src.str[sz:]
	} else {
		src.bytes = src.bytes[sz:]
	}
}

func (src *source) nfd(buf []byte, end int) []byte {
	if src.bytes == nil {
		return norm.NFD.AppendString(buf[:0], src.str[:end])
	}
	return norm.NFD.Append(buf[:0], src.bytes[:end]...)
}

func (src *source) rune() (r rune, sz int) {
	if src.bytes == nil {
		return utf8.DecodeRuneInString(src.str)
	}
	return utf8.DecodeRune(src.bytes)
}

func (src *source) properties(f norm.Form) norm.Properties {
	if src.bytes == nil {
		return f.PropertiesString(src.str)
	}
	return f.Properties(src.bytes)
}

// appendNext appends the weights corresponding to the next rune or
// contraction in s.  If a contraction is matched to a discontinuous
// sequence of runes, the weights for the interstitial runes are
// appended as well.  It returns a new slice that includes the appended
// weights and the number of bytes consumed from s.
func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
	ce, sz := src.lookup(t)
	tp := ce.ctype()
	if tp == ceNormal {
		if ce == 0 {
			r, _ := src.rune()
			const (
				hangulSize  = 3
				firstHangul = 0xAC00
				lastHangul  = 0xD7A3
			)
			if r >= firstHangul && r <= lastHangul {
				// TODO: performance can be considerably improved here.
				n = sz
				var buf [16]byte // Used for decomposing Hangul.
				for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
					ce, sz = t.Index.lookup(b)
					w = append(w, ce)
				}
				return w, n
			}
			ce = makeImplicitCE(implicitPrimary(r))
		}
		w = append(w, ce)
	} else if tp == ceExpansionIndex {
		w = t.appendExpansion(w, ce)
	} else if tp == ceContractionIndex {
		n := 0
		src.tail(sz)
		if src.bytes == nil {
			w, n = t.matchContractionString(w, ce, src.str)
		} else {
			w, n = t.matchContraction(w, ce, src.bytes)
		}
		sz += n
	} else if tp == ceDecompose {
		// Decompose using NFKD and replace tertiary weights.
		t1, t2 := splitDecompose(ce)
		i := len(w)
		nfkd := src.properties(norm.NFKD).Decomposition()
		for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
			w, p = t.appendNext(w, source{bytes: nfkd})
		}
		w[i] = w[i].updateTertiary(t1)
		if i++; i < len(w) {
			w[i] = w[i].updateTertiary(t2)
			for i++; i < len(w); i++ {
				w[i] = w[i].updateTertiary(maxTertiary)
			}
		}
	}
	return w, sz
}

func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
	i := splitExpandIndex(ce)
	n := int(t.ExpandElem[i])
	i++
	for _, ce := range t.ExpandElem[i : i+n] {
		w = append(w, Elem(ce))
	}
	return w
}

func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
	index, n, offset := splitContractIndex(ce)

	scan := t.ContractTries.scanner(index, n, suffix)
	buf := [norm.MaxSegmentSize]byte{}
	bufp := 0
	p := scan.scan(0)

	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
		// By now we should have filtered most cases.
		p0 := p
		bufn := 0
		rune := norm.NFD.Properties(suffix[p:])
		p += rune.Size()
		if rune.LeadCCC() != 0 {
			prevCC := rune.TrailCCC()
			// A gap may only occur in the last normalization segment.
			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
			if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
				scan.s = suffix[:p+end]
			}
			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
				rune = norm.NFD.Properties(suffix[p:])
				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
					break
				}
				prevCC = rune.TrailCCC()
				if pp := scan.scan(p); pp != p {
					// Copy the interstitial runes for later processing.
					bufn += copy(buf[bufn:], suffix[p0:p])
					if scan.pindex == pp {
						bufp = bufn
					}
					p, p0 = pp, pp
				} else {
					p += rune.Size()
				}
			}
		}
	}
	// Append weights for the matched contraction, which may be an expansion.
	i, n := scan.result()
	ce = Elem(t.ContractElem[i+offset])
	if ce.ctype() == ceNormal {
		w = append(w, ce)
	} else {
		w = t.appendExpansion(w, ce)
	}
	// Append weights for the runes in the segment not part of the contraction.
	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
		w, p = t.appendNext(w, source{bytes: b})
	}
	return w, n
}

// TODO: unify the two implementations. This is best done after first simplifying
// the algorithm taking into account the inclusion of both NFC and NFD forms
// in the table.
func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
	index, n, offset := splitContractIndex(ce)

	scan := t.ContractTries.scannerString(index, n, suffix)
	buf := [norm.MaxSegmentSize]byte{}
	bufp := 0
	p := scan.scan(0)

	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
		// By now we should have filtered most cases.
		p0 := p
		bufn := 0
		rune := norm.NFD.PropertiesString(suffix[p:])
		p += rune.Size()
		if rune.LeadCCC() != 0 {
			prevCC := rune.TrailCCC()
			// A gap may only occur in the last normalization segment.
			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
			if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
				scan.s = suffix[:p+end]
			}
			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
				rune = norm.NFD.PropertiesString(suffix[p:])
				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
					break
				}
				prevCC = rune.TrailCCC()
				if pp := scan.scan(p); pp != p {
					// Copy the interstitial runes for later processing.
					bufn += copy(buf[bufn:], suffix[p0:p])
					if scan.pindex == pp {
						bufp = bufn
					}
					p, p0 = pp, pp
				} else {
					p += rune.Size()
				}
			}
		}
	}
	// Append weights for the matched contraction, which may be an expansion.
	i, n := scan.result()
	ce = Elem(t.ContractElem[i+offset])
	if ce.ctype() == ceNormal {
		w = append(w, ce)
	} else {
		w = t.appendExpansion(w, ce)
	}
	// Append weights for the runes in the segment not part of the contraction.
	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
		w, p = t.appendNext(w, source{bytes: b})
	}
	return w, n
}
First commit 2017-11-20 06:10:04 +01:00			`// Copyright 2012 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`package colltab`

			`import (`
			`"unicode/utf8"`

			`"golang.org/x/text/unicode/norm"`
			`)`

			`// Table holds all collation data for a given collation ordering.`
			`type Table struct {`
			`Index Trie // main trie`

			`// expansion info`
			`ExpandElem []uint32`

			`// contraction info`
			`ContractTries ContractTrieSet`
			`ContractElem []uint32`
			`MaxContractLen int`
			`VariableTop uint32`
			`}`

			`func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {`
			`return t.appendNext(w, source{bytes: b})`
			`}`

			`func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {`
			`return t.appendNext(w, source{str: s})`
			`}`

			`func (t *Table) Start(p int, b []byte) int {`
			`// TODO: implement`
			`panic("not implemented")`
			`}`

			`func (t *Table) StartString(p int, s string) int {`
			`// TODO: implement`
			`panic("not implemented")`
			`}`

			`func (t *Table) Domain() []string {`
			`// TODO: implement`
			`panic("not implemented")`
			`}`

			`func (t *Table) Top() uint32 {`
			`return t.VariableTop`
			`}`

			`type source struct {`
			`str string`
			`bytes []byte`
			`}`

			`func (src source) lookup(t Table) (ce Elem, sz int) {`
			`if src.bytes == nil {`
			`return t.Index.lookupString(src.str)`
			`}`
			`return t.Index.lookup(src.bytes)`
			`}`

			`func (src *source) tail(sz int) {`
			`if src.bytes == nil {`
			`src.str = src.str[sz:]`
			`} else {`
			`src.bytes = src.bytes[sz:]`
			`}`
			`}`

			`func (src *source) nfd(buf []byte, end int) []byte {`
			`if src.bytes == nil {`
			`return norm.NFD.AppendString(buf[:0], src.str[:end])`
			`}`
			`return norm.NFD.Append(buf[:0], src.bytes[:end]...)`
			`}`

			`func (src *source) rune() (r rune, sz int) {`
			`if src.bytes == nil {`
			`return utf8.DecodeRuneInString(src.str)`
			`}`
			`return utf8.DecodeRune(src.bytes)`
			`}`

			`func (src *source) properties(f norm.Form) norm.Properties {`
			`if src.bytes == nil {`
			`return f.PropertiesString(src.str)`
			`}`
			`return f.Properties(src.bytes)`
			`}`

			`// appendNext appends the weights corresponding to the next rune or`
			`// contraction in s. If a contraction is matched to a discontinuous`
			`// sequence of runes, the weights for the interstitial runes are`
			`// appended as well. It returns a new slice that includes the appended`
			`// weights and the number of bytes consumed from s.`
			`func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {`
			`ce, sz := src.lookup(t)`
			`tp := ce.ctype()`
			`if tp == ceNormal {`
			`if ce == 0 {`
			`r, _ := src.rune()`
			`const (`
			`hangulSize = 3`
			`firstHangul = 0xAC00`
			`lastHangul = 0xD7A3`
			`)`
			`if r >= firstHangul && r <= lastHangul {`
			`// TODO: performance can be considerably improved here.`
			`n = sz`
			`var buf [16]byte // Used for decomposing Hangul.`
			`for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {`
			`ce, sz = t.Index.lookup(b)`
			`w = append(w, ce)`
			`}`
			`return w, n`
			`}`
			`ce = makeImplicitCE(implicitPrimary(r))`
			`}`
			`w = append(w, ce)`
			`} else if tp == ceExpansionIndex {`
			`w = t.appendExpansion(w, ce)`
			`} else if tp == ceContractionIndex {`
			`n := 0`
			`src.tail(sz)`
			`if src.bytes == nil {`
			`w, n = t.matchContractionString(w, ce, src.str)`
			`} else {`
			`w, n = t.matchContraction(w, ce, src.bytes)`
			`}`
			`sz += n`
			`} else if tp == ceDecompose {`
			`// Decompose using NFKD and replace tertiary weights.`
			`t1, t2 := splitDecompose(ce)`
			`i := len(w)`
			`nfkd := src.properties(norm.NFKD).Decomposition()`
			`for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {`
			`w, p = t.appendNext(w, source{bytes: nfkd})`
			`}`
			`w[i] = w[i].updateTertiary(t1)`
			`if i++; i < len(w) {`
			`w[i] = w[i].updateTertiary(t2)`
			`for i++; i < len(w); i++ {`
			`w[i] = w[i].updateTertiary(maxTertiary)`
			`}`
			`}`
			`}`
			`return w, sz`
			`}`

			`func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {`
			`i := splitExpandIndex(ce)`
			`n := int(t.ExpandElem[i])`
			`i++`
			`for _, ce := range t.ExpandElem[i : i+n] {`
			`w = append(w, Elem(ce))`
			`}`
			`return w`
			`}`

			`func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {`
			`index, n, offset := splitContractIndex(ce)`

			`scan := t.ContractTries.scanner(index, n, suffix)`
			`buf := [norm.MaxSegmentSize]byte{}`
			`bufp := 0`
			`p := scan.scan(0)`

			`if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {`
			`// By now we should have filtered most cases.`
			`p0 := p`
			`bufn := 0`
			`rune := norm.NFD.Properties(suffix[p:])`
			`p += rune.Size()`
			`if rune.LeadCCC() != 0 {`
			`prevCC := rune.TrailCCC()`
			`// A gap may only occur in the last normalization segment.`
			`// This also ensures that len(scan.s) < norm.MaxSegmentSize.`
			`if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {`
			`scan.s = suffix[:p+end]`
			`}`
			`for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {`
			`rune = norm.NFD.Properties(suffix[p:])`
			`if ccc := rune.LeadCCC(); ccc == 0 \|\| prevCC >= ccc {`
			`break`
			`}`
			`prevCC = rune.TrailCCC()`
			`if pp := scan.scan(p); pp != p {`
			`// Copy the interstitial runes for later processing.`
			`bufn += copy(buf[bufn:], suffix[p0:p])`
			`if scan.pindex == pp {`
			`bufp = bufn`
			`}`
			`p, p0 = pp, pp`
			`} else {`
			`p += rune.Size()`
			`}`
			`}`
			`}`
			`}`
			`// Append weights for the matched contraction, which may be an expansion.`
			`i, n := scan.result()`
			`ce = Elem(t.ContractElem[i+offset])`
			`if ce.ctype() == ceNormal {`
			`w = append(w, ce)`
			`} else {`
			`w = t.appendExpansion(w, ce)`
			`}`
			`// Append weights for the runes in the segment not part of the contraction.`
			`for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {`
			`w, p = t.appendNext(w, source{bytes: b})`
			`}`
			`return w, n`
			`}`

			`// TODO: unify the two implementations. This is best done after first simplifying`
			`// the algorithm taking into account the inclusion of both NFC and NFD forms`
			`// in the table.`
			`func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {`
			`index, n, offset := splitContractIndex(ce)`

			`scan := t.ContractTries.scannerString(index, n, suffix)`
			`buf := [norm.MaxSegmentSize]byte{}`
			`bufp := 0`
			`p := scan.scan(0)`

			`if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {`
			`// By now we should have filtered most cases.`
			`p0 := p`
			`bufn := 0`
			`rune := norm.NFD.PropertiesString(suffix[p:])`
			`p += rune.Size()`
			`if rune.LeadCCC() != 0 {`
			`prevCC := rune.TrailCCC()`
			`// A gap may only occur in the last normalization segment.`
			`// This also ensures that len(scan.s) < norm.MaxSegmentSize.`
			`if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {`
			`scan.s = suffix[:p+end]`
			`}`
			`for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {`
			`rune = norm.NFD.PropertiesString(suffix[p:])`
			`if ccc := rune.LeadCCC(); ccc == 0 \|\| prevCC >= ccc {`
			`break`
			`}`
			`prevCC = rune.TrailCCC()`
			`if pp := scan.scan(p); pp != p {`
			`// Copy the interstitial runes for later processing.`
			`bufn += copy(buf[bufn:], suffix[p0:p])`
			`if scan.pindex == pp {`
			`bufp = bufn`
			`}`
			`p, p0 = pp, pp`
			`} else {`
			`p += rune.Size()`
			`}`
			`}`
			`}`
			`}`
			`// Append weights for the matched contraction, which may be an expansion.`
			`i, n := scan.result()`
			`ce = Elem(t.ContractElem[i+offset])`
			`if ce.ctype() == ceNormal {`
			`w = append(w, ce)`
			`} else {`
			`w = t.appendExpansion(w, ce)`
			`}`
			`// Append weights for the runes in the segment not part of the contraction.`
			`for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {`
			`w, p = t.appendNext(w, source{bytes: b})`
			`}`
			`return w, n`
			`}`