summary refs log tree commit diff
path: root/vendor/golang.org/x/text/cases/icu_test.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/text/cases/icu_test.go')
-rw-r--r-- vendor/golang.org/x/text/cases/icu_test.go 210
1 files changed, 210 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/cases/icu_test.go b/vendor/golang.org/x/text/cases/icu_test.go
new file mode 100644
index 0000000..3d07e25
--- /dev/null
+++ b/vendor/golang.org/x/text/cases/icu_test.go
@@ -0,0 +1,210 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build icu
+
+package cases
+
+import (
+ "path"
+ "strings"
+ "testing"
+
+ "golang.org/x/text/internal/testtext"
+ "golang.org/x/text/language"
+ "golang.org/x/text/unicode/norm"
+)
+
+func TestICUConformance(t *testing.T) { // Checks that Go's case mappings agree with ICU's.
+ // Build test set: hand-written inputs plus the strings from testCases, bufferTests, breakTest and foldTestCases.
+ input := []string{
+ "a.a a_a",
+ "a\u05d0a",
+ "\u05d0'a",
+ "a\u03084a",
+ "a\u0308a",
+ "a3\u30a3a",
+ "a\u303aa",
+ "a_\u303a_a",
+ "1_a..a",
+ "1_a.a",
+ "a..a.",
+ "a--a-",
+ "a-a-",
+ "a\u200ba",
+ "a\u200b\u200ba",
+ "a\u00ad\u00ada", // Format
+ "a\u00ada",
+ "a''a", // SingleQuote
+ "a'a",
+ "a::a", // MidLetter
+ "a:a",
+ "a..a", // MidNumLet
+ "a.a",
+ "a;;a", // MidNum
+ "a;a",
+ "a__a", // ExtendNumlet
+ "a_a",
+ "ΟΣ''a",
+ }
+ add := func(x interface{}) { // appends a string, or each element of a []string, to input; other types are ignored
+ switch v := x.(type) {
+ case string:
+ input = append(input, v)
+ case []string:
+ for _, s := range v {
+ input = append(input, s)
+ }
+ }
+ }
+ for _, tc := range testCases {
+ add(tc.src)
+ add(tc.lower)
+ add(tc.upper)
+ add(tc.title)
+ }
+ for _, tc := range bufferTests {
+ add(tc.src)
+ }
+ for _, tc := range breakTest {
+ add(strings.Replace(tc, "|", "", -1)) // test with the | characters removed
+ }
+ for _, tc := range foldTestCases {
+ add(tc)
+ }
+
+ // Compare ICU to Go for every casing mode, language tag and input, skipping the cases listed in exclude.
+ for _, c := range []string{"lower", "upper", "title", "fold"} {
+ for _, tag := range []string{
+ "und", "af", "az", "el", "lt", "nl", "tr",
+ } {
+ for _, s := range input {
+ if exclude(c, tag, s) {
+ continue
+ }
+ testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) { // subtest name is built from mode, tag and input
+ want := doICU(tag, c, s)
+ got := doGo(tag, c, s)
+ if norm.NFC.String(got) != norm.NFC.String(want) { // compare the NFC-normalized results
+ t.Errorf("\n in %[3]q (%+[3]q)\n got %[1]q (%+[1]q)\n want %[2]q (%+[2]q)", got, want, s)
+ }
+ })
+ }
+ }
+ }
+}
+
+// exclude indicates if a string should be excluded from testing. It reports true when s contains the pattern of a list entry whose cm and tags fields are empty or contain cm and tag, respectively.
+func exclude(cm, tag, s string) bool {
+ list := []struct{ cm, tags, pattern string }{ // an empty cm or tags field applies to all casing modes or languages
+ // TODO: Go does not handle certain esoteric breaks correctly. This will be
+ // fixed once we have a real word break iterator. Alternatively, it
+ // seems like we're not too far off from making it work, so we could
+ // fix these last steps. But first verify that using a separate word
+ // breaker does not hurt performance.
+ {"title", "af nl", "a''a"},
+ {"", "", "א'a"},
+
+ // All the exclusions below seem to be issues with the ICU
+ // implementation (at version 57) and thus are not marked as TODO.
+
+ // ICU does not handle leading apostrophe for Dutch and
+ // Afrikaans correctly. See http://unicode.org/cldr/trac/ticket/7078.
+ {"title", "af nl", "'n"},
+ {"title", "af nl", "'N"},
+
+ // Go terminates the final sigma check after a fixed number of
+ // ignorables have been found. This ensures that the algorithm can make
+ // progress in a streaming scenario.
+ {"lower title", "", "\u039f\u03a3...............................a"},
+ // This also applies to upper in Greek.
+ // NOTE: we could fix the following two cases by adding state to elUpper
+ // and aztrLower. However, considering a modifier to not belong to the
+ // preceding letter after the maximum modifiers count is reached is
+ // consistent with the behavior of unicode/norm.
+ {"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
+ {"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
+ {"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
+ {"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},
+
+ // ICU title case seems to erroneously remove \u0307 from an upper case
+ // I unconditionally, instead of only when lowercasing. The ICU
+ // transform algorithm transforms these cases consistently with our
+ // implementation.
+ {"title", "az tr", "\u0307"},
+
+ // The spec says to remove \u0307 after Soft-Dotted characters. ICU
+ // transforms conform but ucasemap_utf8ToUpper does not.
+ {"upper title", "lt", "i\u0307"},
+ {"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},
+
+ // Both Unicode and CLDR prescribe an extra explicit dot above after a
+ // Soft_Dotted character if there are other modifiers.
+ // ucasemap_utf8ToUpper does not do this; ICU transforms do.
+ // The issue with ucasemap_utf8ToUpper seems to be that it does not
+ // consider the modifiers that are part of composition in the evaluation
+ // of More_Above. For instance, according to the More_Above rule for lt,
+ // a dotted capital I (U+0130) becomes i\u0307\u0307 (a small i with
+ // two additional dots). This seems odd, but is correct. ICU is
+ // definitely not correct as it produces different results for different
+ // normal forms. For instance, for an İ:
+ // \u0130 (NFC) -> i\u0307 (incorrect)
+ // I\u0307 (NFD) -> i\u0307\u0307 (correct)
+ // We could argue that we should not add a \u0307 if there already is
+ // one, but this may be hard to get correct and does not conform to the
+ // standard.
+ {"lower title", "lt", "\u0130"},
+ {"lower title", "lt", "\u00cf"},
+
+ // We conform to ICU ucasemap_utf8ToUpper if we remove support for
+ // elUpper. However, this clearly does not conform to the spec. Moreover, the
+ // ICU transforms _do_ implement this transform and produce results
+ // consistent with our implementation. Note that we still prefer to use
+ // ucasemap_utf8ToUpper instead of transforms as the latter have
+ // inconsistencies in the word breaking algorithm.
+ {"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
+ {"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
+ {"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS
+
+ {"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
+ {"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
+ {"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA
+
+ {"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
+ {"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ETA WITH TONOS
+ {"upper", "el", "\u03AF"}, // GREEK SMALL LETTER IOTA WITH TONOS
+
+ {"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
+ {"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
+ {"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
+ }
+ for _, x := range list {
+ if x.cm != "" && strings.Index(x.cm, cm) == -1 { // entry is restricted to other casing modes
+ continue
+ }
+ if x.tags != "" && strings.Index(x.tags, tag) == -1 { // entry is restricted to other languages
+ continue
+ }
+ if strings.Index(s, x.pattern) != -1 { // substring match: any input containing the pattern is excluded
+ return true
+ }
+ }
+ return false
+}
+
+func doGo(tag, caser, input string) string { // doGo returns input cased by this package's Caser for the given mode and language tag.
+ var c Caser // stays the zero Caser if caser matches none of the cases below
+ t := language.MustParse(tag) // NOTE(review): MustParse is documented to panic on an unparsable tag; callers here pass fixed tags
+ switch caser {
+ case "lower":
+ c = Lower(t)
+ case "upper":
+ c = Upper(t)
+ case "title":
+ c = Title(t)
+ case "fold":
+ c = Fold() // folding is built without the language tag
+ }
+ return c.String(input)
+}