summaryrefslogtreecommitdiff
path: root/vendor/golang.org/x/text/secure/precis/gen.go
blob: 946acbaa1f19ade1a6f8cc2db71674c15a2a3d8b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Unicode table generator.
// Data read from the web.

// +build ignore

package main

import (
	"flag"
	"log"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/norm"
	"golang.org/x/text/unicode/rangetable"
)

// outputFile is the path of the generated Go file; it is controlled by the
// -output flag and defaults to tables.go.
var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")

// assigned and disallowedRunes are populated by main before writeTables reads
// them. assigned is the result of rangetable.Assigned for unicode.Version;
// disallowedRunes is built from the runes main collects out of
// DerivedCoreProperties.txt, PropList.txt and HangulSyllableType.txt.
var assigned, disallowedRunes *unicode.RangeTable

// runeCategory records the category chosen for each rune. Runes without an
// entry yield the zero category on lookup.
var runeCategory = map[rune]category{}

// overrides maps a category that is already recorded for a rune to the
// category that replaces it when setCategory is later called for the same
// rune with joiningT.
var overrides = map[category]category{
	viramaModifier: viramaJoinT,
	greek:          greekJoinT,
	hebrew:         hebrewJoinT,
}

// setCategory records cat as the category of r.
//
// A rune may normally receive only one category. The single permitted
// combination is a rune whose recorded category has an entry in overrides and
// which is now being given joiningT: in that case the combined category from
// overrides is stored instead. Any other second assignment aborts the
// generator via log.Fatalf.
func setCategory(r rune, cat category) {
	prev, seen := runeCategory[r]
	if !seen {
		// First category for this rune: store it as is.
		runeCategory[r] = cat
		return
	}

	combined, canCombine := overrides[prev]
	if cat != joiningT || !canCombine {
		log.Fatalf("%U: multiple categories for rune (%v and %v)", r, prev, cat)
	}
	runeCategory[r] = combined
}

func init() {
	if numCategories > 1<<propShift {
		log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
	}
}

// main drives the generator: it reads the UCD files to build the set of
// disallowed runes and the per-rune categories, writes the tables, and
// repackages gen_trieval.go as trieval.go for package precis.
//
// The order of the category-loading steps is significant: setCategory only
// tolerates a second category for a rune when it is joiningT, so the joining
// types are loaded last, after the exceptions have overwritten any earlier
// categories.
func main() {
	gen.Init()

	// Collect the runes that are disallowed outright.
	var banned []rune
	ban := func(p *ucd.Parser) { banned = append(banned, p.Rune(0)) }

	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Default_Ignorable_Code_Point" {
			ban(p)
		}
	})
	ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
		if p.String(1) == "Noncharacter_Code_Point" {
			ban(p)
		}
	})
	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
		if t := p.String(1); t == "L" || t == "V" || t == "T" {
			ban(p)
		}
	})

	disallowedRunes = rangetable.New(banned...)
	assigned = rangetable.Assigned(unicode.Version)

	// Load category data, starting with the one hard-coded rune.
	runeCategory['l'] = latinSmallL

	// Runes with the virama canonical combining class.
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		const cccVirama = 9
		if p.Int(ucd.CanonicalCombiningClass) != cccVirama {
			return
		}
		setCategory(p.Rune(0), viramaModifier)
	})

	// Runes belonging to the scripts that have context rules.
	scriptCategory := map[string]category{
		"Greek":    greek,
		"Hebrew":   hebrew,
		"Hiragana": japanese,
		"Katakana": japanese,
		"Han":      japanese,
	}
	ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
		if c, ok := scriptCategory[p.String(1)]; ok {
			setCategory(p.Rune(0), c)
		}
	})

	// Set the rule categories associated with exceptions. This overrides any
	// previously set categories. The original categories are manually
	// reintroduced in the categoryTransitions table.
	for r, e := range exceptions {
		if e.cat == 0 {
			continue
		}
		runeCategory[r] = e.cat
	}

	// Joining types; any other value in the file is ignored.
	joiningCategory := map[string]category{
		"L": joiningL,
		"D": joiningD,
		"T": joiningT,
		"R": joiningR,
	}
	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		if c, ok := joiningCategory[p.String(1)]; ok {
			setCategory(p.Rune(0), c)
		}
	})

	writeTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
}

// exception is the explicit classification of a single rune in the exceptions
// table; it takes precedence over the property that writeTables would
// otherwise derive for that rune.
type exception struct {
	// prop is the property stored for the rune.
	prop property
	// cat is the rule category for the rune. When it is zero, main leaves the
	// rune's entry in runeCategory untouched.
	cat category
}

// init programmatically adds the ten Arabic-Indic digits (U+0660..U+0669) and
// the ten Extended Arabic-Indic digits (U+06F0..U+06F9) to the exceptions map.
// Each is marked disallowed and tagged with its digit category; see the
// comment on ContextO|J rules in the exceptions map for why such runes are
// marked disallowed.
//
// Package-level variables are initialized before init functions run, so these
// assignments replace the entries the exceptions literal holds for the same
// code points.
func init() {
	const (
		arabicIndicZero         rune = 0x0660
		extendedArabicIndicZero rune = 0x06F0
	)
	for d := rune(0); d < 10; d++ {
		exceptions[arabicIndicZero+d] = exception{prop: disallowed, cat: arabicIndicDigit}
		exceptions[extendedArabicIndicZero+d] = exception{prop: disallowed, cat: extendedArabicIndicDigit}
	}
}

// The Exceptions class as defined in RFC 5892
// https://tools.ietf.org/html/rfc5892#section-2.6
//
// exceptions maps a rune to its explicit property and, optionally, rule
// category. writeTables consults this map before applying any derived rule.
var exceptions = map[rune]exception{
	0x00DF: {prop: pValid},
	0x03C2: {prop: pValid},
	0x06FD: {prop: pValid},
	0x06FE: {prop: pValid},
	0x0F0B: {prop: pValid},
	0x3007: {prop: pValid},

	// ContextO|J rules are marked as disallowed, taking a "guilty until proven
	// innocent" approach. The main reason for this is that the check for
	// whether a context rule should be applied can be moved to the logic for
	// handling disallowed runes, taking it off the common path. The exception
	// to this rule is for katakanaMiddleDot, as the rule logic is handled
	// without using a rule function.

	// ContextJ (Join control)
	0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
	0x200D: {prop: disallowed, cat: zeroWidthJoiner},

	// ContextO
	0x00B7: {prop: disallowed, cat: middleDot},
	0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
	0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
	0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
	0x30FB: {prop: pValid, cat: katakanaMiddleDot},

	// The Arabic-Indic and Extended Arabic-Indic digits are officially
	// ContextO.
	// NOTE: the pValid values listed here are not the effective ones. The
	// init function that fills in the digit exceptions overwrites every one
	// of these twenty entries with prop: disallowed and a digit category
	// (arabicIndicDigit / extendedArabicIndicDigit).
	0x0660: {prop: pValid},
	0x0661: {prop: pValid},
	0x0662: {prop: pValid},
	0x0663: {prop: pValid},
	0x0664: {prop: pValid},
	0x0665: {prop: pValid},
	0x0666: {prop: pValid},
	0x0667: {prop: pValid},
	0x0668: {prop: pValid},
	0x0669: {prop: pValid},
	0x06F0: {prop: pValid},
	0x06F1: {prop: pValid},
	0x06F2: {prop: pValid},
	0x06F3: {prop: pValid},
	0x06F4: {prop: pValid},
	0x06F5: {prop: pValid},
	0x06F6: {prop: pValid},
	0x06F7: {prop: pValid},
	0x06F8: {prop: pValid},
	0x06F9: {prop: pValid},

	// Runes that are disallowed without any associated context rule.
	0x0640: {prop: disallowed},
	0x07FA: {prop: disallowed},
	0x302E: {prop: disallowed},
	0x302F: {prop: disallowed},
	0x3031: {prop: disallowed},
	0x3032: {prop: disallowed},
	0x3033: {prop: disallowed},
	0x3034: {prop: disallowed},
	0x3035: {prop: disallowed},
	0x303B: {prop: disallowed},
}

// isLetterDigits reports whether r belongs to the LetterDigits class, i.e.
// its general category is one of Ll, Lu, Lo, Nd, Lm, Mn or Mc.
// See https://tools.ietf.org/html/rfc5892#section-2.1
func isLetterDigits(r rune) bool {
	letterDigits := []*unicode.RangeTable{
		// Letters
		unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo,
		// Modifiers
		unicode.Mn, unicode.Mc,
		// Digits
		unicode.Nd,
	}
	for _, table := range letterDigits {
		if unicode.Is(table, r) {
			return true
		}
	}
	return false
}

// isIdDisAndFreePVal reports whether r falls in one of the general-category
// classes OtherLetterDigits, Spaces, Symbols or Punctuation.
func isIdDisAndFreePVal(r rune) bool {
	classes := []*unicode.RangeTable{
		// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
		// r in {Lt, Nl, No, Me}
		unicode.Lt, unicode.Nl, unicode.No, unicode.Me,

		// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
		// r in {Zs}
		unicode.Zs,

		// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
		// r in {Sm, Sc, Sk, So}
		unicode.Sm, unicode.Sc, unicode.Sk, unicode.So,

		// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
		// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
		unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
		unicode.Pi, unicode.Pf, unicode.Po,
	}
	for _, table := range classes {
		if unicode.Is(table, r) {
			return true
		}
	}
	return false
}

// hasCompat reports whether r is in the HasCompat class: the one-rune string
// for r is not already in NFKC normal form.
// See https://tools.ietf.org/html/rfc7564#section-9.17
func hasCompat(r rune) bool {
	alreadyNFKC := norm.NFKC.IsNormalString(string(r))
	return !alreadyNFKC
}

// From https://tools.ietf.org/html/rfc7564#section-8:
//
// If .cp. .in. Exceptions Then Exceptions(cp);
//   Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
//   Else If .cp. .in. Unassigned Then UNASSIGNED;
//   Else If .cp. .in. ASCII7 Then PVALID;
//   Else If .cp. .in. JoinControl Then CONTEXTJ;
//   Else If .cp. .in. OldHangulJamo Then DISALLOWED;
//   Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
//   Else If .cp. .in. Controls Then DISALLOWED;
//   Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
//   Else If .cp. .in. LetterDigits Then PVALID;
//   Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
//   Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
//   Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
//   Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
//   Else DISALLOWED;

// writeTables derives the property of every valid code point, combines it
// with the rune's category, and writes the resulting "derivedProperties" trie
// to the output file named by the -output flag.
//
// The property of a rune is decided by the first matching rule in the switch
// below, so the order of the cases is significant. An entry in exceptions
// always wins. Generation is aborted with log.Fatal if the trie cannot be
// generated.
func writeTables() {
	propTrie := triegen.NewTrie("derivedProperties")
	w := gen.NewCodeWriter()
	defer w.WriteVersionedGoFile(*outputFile, "precis")
	gen.WriteUnicodeVersion(w)

	// Iterate over all the runes. The bound is inclusive: unicode.MaxRune
	// (U+10FFFF) is itself a valid code point and must be classified too.
	for r := rune(0); r <= unicode.MaxRune; r++ {
		// Skip the surrogate range; those values are not valid runes.
		if !utf8.ValidRune(r) {
			continue
		}

		// e is the zero exception when r has no entry, so e.prop and e.cat
		// are only meaningful when ok is true.
		e, ok := exceptions[r]
		p := e.prop
		switch {
		case ok:
			// Explicit exception: keep e.prop.
		case !unicode.In(r, assigned):
			p = unassigned
		case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
			p = pValid
		case unicode.In(r, disallowedRunes, unicode.Cc):
			p = disallowed
		case hasCompat(r):
			p = idDisOrFreePVal
		case isLetterDigits(r):
			p = pValid
		case isIdDisAndFreePVal(r):
			p = idDisOrFreePVal
		default:
			p = disallowed
		}

		cat := runeCategory[r]
		// Don't set category for runes that are disallowed, other than the
		// category an exception explicitly attaches to them.
		if p == disallowed {
			cat = e.cat
		}
		propTrie.Insert(r, uint64(p)|uint64(cat))
	}

	sz, err := propTrie.Gen(w)
	if err != nil {
		log.Fatal(err)
	}
	// Add the trie's size to the writer's running total.
	w.Size += sz
}