// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
|
|
package colltab
|
|
|
|
import "unicode/utf8"
|
|
|
|
// For a description of ContractTrieSet, see text/collate/build/contract.go.
|
|
|
|
type ContractTrieSet []struct{ L, H, N, I uint8 }
|
|
|
|
// ctScanner is used to match a trie to an input sequence.
|
|
// A contraction may match a non-contiguous sequence of bytes in an input string.
|
|
// For example, if there is a contraction for <a, combining_ring>, it should match
|
|
// the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
|
|
// not block combining_ring.
|
|
// ctScanner does not automatically skip over non-blocking non-starters, but rather
|
|
// retains the state of the last match and leaves it up to the user to continue
|
|
// the match at the appropriate points.
|
|
type ctScanner struct {
|
|
states ContractTrieSet
|
|
s []byte
|
|
n int
|
|
index int
|
|
pindex int
|
|
done bool
|
|
}
|
|
|
|
type ctScannerString struct {
|
|
states ContractTrieSet
|
|
s string
|
|
n int
|
|
index int
|
|
pindex int
|
|
done bool
|
|
}
|
|
|
|
func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner {
|
|
return ctScanner{s: b, states: t[index:], n: n}
|
|
}
|
|
|
|
func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString {
|
|
return ctScannerString{s: str, states: t[index:], n: n}
|
|
}
|
|
|
|
// result returns the offset i and bytes consumed p so far. If no suffix
|
|
// matched, i and p will be 0.
|
|
func (s *ctScanner) result() (i, p int) {
|
|
return s.index, s.pindex
|
|
}
|
|
|
|
func (s *ctScannerString) result() (i, p int) {
|
|
return s.index, s.pindex
|
|
}
|
|
|
|
const (
|
|
final = 0
|
|
noIndex = 0xFF
|
|
)
|
|
|
|
// scan matches the longest suffix at the current location in the input
|
|
// and returns the number of bytes consumed.
|
|
func (s *ctScanner) scan(p int) int {
|
|
pr := p // the p at the rune start
|
|
str := s.s
|
|
states, n := s.states, s.n
|
|
for i := 0; i < n && p < len(str); {
|
|
e := states[i]
|
|
c := str[p]
|
|
// TODO: a significant number of contractions are of a form that
|
|
// cannot match discontiguous UTF-8 in a normalized string. We could let
|
|
// a negative value of e.n mean that we can set s.done = true and avoid
|
|
// the need for additional matches.
|
|
if c >= e.L {
|
|
if e.L == c {
|
|
p++
|
|
if e.I != noIndex {
|
|
s.index = int(e.I)
|
|
s.pindex = p
|
|
}
|
|
if e.N != final {
|
|
i, states, n = 0, states[int(e.H)+n:], int(e.N)
|
|
if p >= len(str) || utf8.RuneStart(str[p]) {
|
|
s.states, s.n, pr = states, n, p
|
|
}
|
|
} else {
|
|
s.done = true
|
|
return p
|
|
}
|
|
continue
|
|
} else if e.N == final && c <= e.H {
|
|
p++
|
|
s.done = true
|
|
s.index = int(c-e.L) + int(e.I)
|
|
s.pindex = p
|
|
return p
|
|
}
|
|
}
|
|
i++
|
|
}
|
|
return pr
|
|
}
|
|
|
|
// scan is a verbatim copy of ctScanner.scan.
|
|
func (s *ctScannerString) scan(p int) int {
|
|
pr := p // the p at the rune start
|
|
str := s.s
|
|
states, n := s.states, s.n
|
|
for i := 0; i < n && p < len(str); {
|
|
e := states[i]
|
|
c := str[p]
|
|
// TODO: a significant number of contractions are of a form that
|
|
// cannot match discontiguous UTF-8 in a normalized string. We could let
|
|
// a negative value of e.n mean that we can set s.done = true and avoid
|
|
// the need for additional matches.
|
|
if c >= e.L {
|
|
if e.L == c {
|
|
p++
|
|
if e.I != noIndex {
|
|
s.index = int(e.I)
|
|
s.pindex = p
|
|
}
|
|
if e.N != final {
|
|
i, states, n = 0, states[int(e.H)+n:], int(e.N)
|
|
if p >= len(str) || utf8.RuneStart(str[p]) {
|
|
s.states, s.n, pr = states, n, p
|
|
}
|
|
} else {
|
|
s.done = true
|
|
return p
|
|
}
|
|
continue
|
|
} else if e.N == final && c <= e.H {
|
|
p++
|
|
s.done = true
|
|
s.index = int(c-e.L) + int(e.I)
|
|
s.pindex = p
|
|
return p
|
|
}
|
|
}
|
|
i++
|
|
}
|
|
return pr
|
|
}
|