230 lines
6.1 KiB
Go
230 lines
6.1 KiB
Go
// Copyright 2012 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package collate
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bufio"
|
|
"bytes"
|
|
"flag"
|
|
"io"
|
|
"io/ioutil"
|
|
"log"
|
|
"path"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"testing"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/collate/build"
|
|
"golang.org/x/text/internal/gen"
|
|
"golang.org/x/text/language"
|
|
)
|
|
|
|
// long is the -long command-line flag. TestCollation checks it, together
// with gen.IsLocal, to decide whether the test may run.
var long = flag.Bool(
	"long",
	false,
	"run time-consuming tests, such as tests that fetch data online",
)
|
|
|
|
// This regression test runs tests for the test files in CollationTest.zip
|
|
// (taken from http://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
|
|
//
|
|
// The test files have the following form:
|
|
// # header
|
|
// 0009 0021; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 025E]
|
|
// 0009 003F; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 0263]
|
|
// 000A 0021; # ('\u000A') <LINE FEED (LF)> [| | | 0202 025E]
|
|
// 000A 003F; # ('\u000A') <LINE FEED (LF)> [| | | 0202 0263]
|
|
//
|
|
// The part before the semicolon is the hex representation of a sequence
// of runes. After the hash mark is a comment. The strings represented by
// the rune sequences appear in the file in sorted order, as defined by
// the DUCET.
|
|
|
|
// Test holds the usable entries read from one file of CollationTest.zip.
// str[i] and comment[i] describe the same entry.
type Test struct {
	name    string   // base name of the file inside the archive
	str     [][]byte // UTF-8 encoded test strings, in file order
	comment []string // text captured after "# " on the matching line
}
|
|
|
|
var (
	// versionRe captures the version string of a "# UCA Version: ..."
	// header line.
	versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)

	// testRe splits a test line into the space-separated hex code points
	// that precede the semicolon and the text that follows the last "# ".
	testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
)
|
|
|
|
func TestCollation(t *testing.T) {
|
|
if !gen.IsLocal() && !*long {
|
|
t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source")
|
|
}
|
|
t.Skip("must first update to new file format to support test")
|
|
for _, test := range loadTestData() {
|
|
doTest(t, test)
|
|
}
|
|
}
|
|
|
|
// Error terminates the program through log.Fatal when e is non-nil and
// does nothing otherwise.
func Error(e error) {
	if e == nil {
		return
	}
	log.Fatal(e)
}
|
|
|
|
// parseUCA parses a Default Unicode Collation Element Table of the format
|
|
// specified in http://www.unicode.org/reports/tr10/#File_Format.
|
|
// It returns the variable top.
|
|
func parseUCA(builder *build.Builder) {
|
|
r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
|
|
defer r.Close()
|
|
input := bufio.NewReader(r)
|
|
colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
|
|
for i := 1; true; i++ {
|
|
l, prefix, err := input.ReadLine()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
Error(err)
|
|
line := string(l)
|
|
if prefix {
|
|
log.Fatalf("%d: buffer overflow", i)
|
|
}
|
|
if len(line) == 0 || line[0] == '#' {
|
|
continue
|
|
}
|
|
if line[0] == '@' {
|
|
if strings.HasPrefix(line[1:], "version ") {
|
|
if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
|
|
log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
|
|
}
|
|
}
|
|
} else {
|
|
// parse entries
|
|
part := strings.Split(line, " ; ")
|
|
if len(part) != 2 {
|
|
log.Fatalf("%d: production rule without ';': %v", i, line)
|
|
}
|
|
lhs := []rune{}
|
|
for _, v := range strings.Split(part[0], " ") {
|
|
if v != "" {
|
|
lhs = append(lhs, rune(convHex(i, v)))
|
|
}
|
|
}
|
|
vars := []int{}
|
|
rhs := [][]int{}
|
|
for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
|
|
if m[1] == "*" {
|
|
vars = append(vars, i)
|
|
}
|
|
elem := []int{}
|
|
for _, h := range strings.Split(m[2], ".") {
|
|
elem = append(elem, convHex(i, h))
|
|
}
|
|
rhs = append(rhs, elem)
|
|
}
|
|
builder.Add(lhs, rhs, vars)
|
|
}
|
|
}
|
|
}
|
|
|
|
// convHex parses s as a base-16 number that fits in a signed 32-bit integer
// and returns it as an int. If s cannot be parsed, the program is aborted
// through log.Fatalf with a message prefixed by line.
func convHex(line int, s string) int {
	value, err := strconv.ParseInt(s, 16, 32)
	if err != nil {
		log.Fatalf("%d: %v", line, err)
	}
	return int(value)
}
|
|
|
|
func loadTestData() []Test {
|
|
f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
|
|
buffer, err := ioutil.ReadAll(f)
|
|
f.Close()
|
|
Error(err)
|
|
archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
|
|
Error(err)
|
|
tests := []Test{}
|
|
for _, f := range archive.File {
|
|
// Skip the short versions, which are simply duplicates of the long versions.
|
|
if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
|
|
continue
|
|
}
|
|
ff, err := f.Open()
|
|
Error(err)
|
|
defer ff.Close()
|
|
scanner := bufio.NewScanner(ff)
|
|
test := Test{name: path.Base(f.Name)}
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
if len(line) <= 1 || line[0] == '#' {
|
|
if m := versionRe.FindStringSubmatch(line); m != nil {
|
|
if m[1] != gen.UnicodeVersion() {
|
|
log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
m := testRe.FindStringSubmatch(line)
|
|
if m == nil || len(m) < 3 {
|
|
log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
|
|
}
|
|
str := []byte{}
|
|
// In the regression test data (unpaired) surrogates are assigned a weight
|
|
// corresponding to their code point value. However, utf8.DecodeRune,
|
|
// which is used to compute the implicit weight, assigns FFFD to surrogates.
|
|
// We therefore skip tests with surrogates. This skips about 35 entries
|
|
// per test.
|
|
valid := true
|
|
for _, split := range strings.Split(m[1], " ") {
|
|
r, err := strconv.ParseUint(split, 16, 64)
|
|
Error(err)
|
|
valid = valid && utf8.ValidRune(rune(r))
|
|
str = append(str, string(rune(r))...)
|
|
}
|
|
if valid {
|
|
test.str = append(test.str, str)
|
|
test.comment = append(test.comment, m[2])
|
|
}
|
|
}
|
|
if scanner.Err() != nil {
|
|
log.Fatal(scanner.Err())
|
|
}
|
|
tests = append(tests, test)
|
|
}
|
|
return tests
|
|
}
|
|
|
|
// errorCount is not referenced by any code visible in this file.
// NOTE(review): presumably a leftover error counter; confirm it is unused
// elsewhere in the package before removing it.
var errorCount int
|
|
|
|
// runes decodes b as UTF-8 and returns the resulting code points. It is
// used to format byte strings as code point lists in failure messages.
func runes(b []byte) []rune {
	return bytes.Runes(b)
}
|
|
|
|
// shifted is the tag doTest passes to OptionsFromTag for every test file
// whose name does not contain "NON_IGNOR".
// NOTE(review): the ka-shifted and ks-level4 keywords presumably select
// shifted alternate handling at quaternary strength; confirm against the
// collate option documentation.
var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
|
|
|
|
func doTest(t *testing.T, tc Test) {
|
|
bld := build.NewBuilder()
|
|
parseUCA(bld)
|
|
w, err := bld.Build()
|
|
Error(err)
|
|
var tag language.Tag
|
|
if !strings.Contains(tc.name, "NON_IGNOR") {
|
|
tag = shifted
|
|
}
|
|
c := NewFromTable(w, OptionsFromTag(tag))
|
|
b := &Buffer{}
|
|
prev := tc.str[0]
|
|
for i := 1; i < len(tc.str); i++ {
|
|
b.Reset()
|
|
s := tc.str[i]
|
|
ka := c.Key(b, prev)
|
|
kb := c.Key(b, s)
|
|
if r := bytes.Compare(ka, kb); r == 1 {
|
|
t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
|
|
prev = s
|
|
continue
|
|
}
|
|
if r := c.Compare(prev, s); r == 1 {
|
|
t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r)
|
|
}
|
|
if r := c.Compare(s, prev); r == -1 {
|
|
t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r)
|
|
}
|
|
prev = s
|
|
}
|
|
}
|