diff options
Diffstat (limited to 'vendor/golang.org/x/text/encoding')
8 files changed, 1522 insertions, 48 deletions
diff --git a/vendor/golang.org/x/text/encoding/charmap/maketables.go b/vendor/golang.org/x/text/encoding/charmap/maketables.go new file mode 100644 index 000000000..f7941701e --- /dev/null +++ b/vendor/golang.org/x/text/encoding/charmap/maketables.go @@ -0,0 +1,556 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +import ( + "bufio" + "fmt" + "log" + "net/http" + "sort" + "strings" + "unicode/utf8" + + "golang.org/x/text/encoding" + "golang.org/x/text/internal/gen" +) + +const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + + ` !"#$%&'()*+,-./0123456789:;<=>?` + + `@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` + + "`abcdefghijklmnopqrstuvwxyz{|}~\u007f" + +var encodings = []struct { + name string + mib string + comment string + varName string + replacement byte + mapping string +}{ + { + "IBM Code Page 037", + "IBM037", + "", + "CodePage037", + 0x3f, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm", + }, + { + "IBM Code Page 437", + "PC8CodePage437", + "", + "CodePage437", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm", + }, + { + "IBM Code Page 850", + "PC850Multilingual", + "", + "CodePage850", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm", + }, + { + "IBM Code Page 852", + "PCp852", + "", + "CodePage852", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm", + }, + { + "IBM Code Page 855", + "IBM855", + "", + "CodePage855", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm", + }, + { + "Windows Code Page 858", // PC latin1 with Euro + "IBM00858", + "", + "CodePage858", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm", + }, + { + "IBM Code Page 860", + "IBM860", + "", + "CodePage860", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm", + }, + { + "IBM Code Page 862", + "PC862LatinHebrew", + "", + "CodePage862", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm", + }, + { + "IBM Code Page 863", + "IBM863", + "", + "CodePage863", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm", + }, + { + "IBM Code Page 865", + "IBM865", + "", + "CodePage865", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm", + }, + { + "IBM Code Page 866", + "IBM866", + "", + "CodePage866", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-ibm866.txt", + }, + { + "IBM Code Page 1047", + "IBM1047", + "", + "CodePage1047", + 0x3f, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm", + }, + { + "IBM Code Page 1140", + "IBM01140", + "", + "CodePage1140", + 0x3f, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm", + }, + { + "ISO 8859-1", + "ISOLatin1", + "", + "ISO8859_1", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm", + }, + { + "ISO 8859-2", + "ISOLatin2", + "", + "ISO8859_2", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-2.txt", + }, + { + "ISO 8859-3", + "ISOLatin3", + "", + "ISO8859_3", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-3.txt", + }, + { + "ISO 8859-4", + "ISOLatin4", + "", + "ISO8859_4", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-4.txt", + }, + { + "ISO 8859-5", + "ISOLatinCyrillic", + "", + "ISO8859_5", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-5.txt", + }, + { + "ISO 8859-6", + "ISOLatinArabic", + "", + "ISO8859_6,ISO8859_6E,ISO8859_6I", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-6.txt", + }, + { + "ISO 8859-7", + "ISOLatinGreek", + "", + "ISO8859_7", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-7.txt", + }, + { + "ISO 8859-8", + "ISOLatinHebrew", + "", + "ISO8859_8,ISO8859_8E,ISO8859_8I", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-8.txt", + }, + { + "ISO 8859-9", + "ISOLatin5", + "", + "ISO8859_9", + encoding.ASCIISub, + "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm", + }, + { + "ISO 8859-10", + "ISOLatin6", + "", + "ISO8859_10", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-10.txt", + }, + { + "ISO 8859-13", + "ISO885913", + "", + "ISO8859_13", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-13.txt", + }, + { + "ISO 8859-14", + "ISO885914", + "", + "ISO8859_14", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-14.txt", + }, + { + "ISO 8859-15", + "ISO885915", + "", + "ISO8859_15", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-15.txt", + }, + { + "ISO 8859-16", + "ISO885916", + "", + "ISO8859_16", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-iso-8859-16.txt", + }, + { + "KOI8-R", + "KOI8R", + "", + "KOI8R", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-koi8-r.txt", + }, + { + "KOI8-U", + "KOI8U", + "", + "KOI8U", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-koi8-u.txt", + }, + { + "Macintosh", + "Macintosh", + "", + "Macintosh", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-macintosh.txt", + }, + { + "Macintosh Cyrillic", + "MacintoshCyrillic", + "", + "MacintoshCyrillic", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt", + }, + { + "Windows 874", + "Windows874", + "", + "Windows874", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-874.txt", + }, + { + "Windows 1250", + "Windows1250", + "", + "Windows1250", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1250.txt", + }, + { + "Windows 1251", + "Windows1251", + "", + "Windows1251", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1251.txt", + }, + { + "Windows 1252", + "Windows1252", + "", + "Windows1252", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1252.txt", + }, + { + "Windows 1253", + "Windows1253", + "", + "Windows1253", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1253.txt", + }, + { + "Windows 1254", + "Windows1254", + "", + "Windows1254", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1254.txt", + }, + { + "Windows 1255", + "Windows1255", + "", + "Windows1255", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1255.txt", + }, + { + "Windows 1256", + "Windows1256", + "", + "Windows1256", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1256.txt", + }, + { + "Windows 1257", + "Windows1257", + "", + "Windows1257", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1257.txt", + }, + { + "Windows 1258", + "Windows1258", + "", + "Windows1258", + encoding.ASCIISub, + "http://encoding.spec.whatwg.org/index-windows-1258.txt", + }, + { + "X-User-Defined", + "XUserDefined", + "It is defined at http://encoding.spec.whatwg.org/#x-user-defined", + "XUserDefined", + encoding.ASCIISub, + ascii + + "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" + + "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" + + "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" + + "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" + + "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" + + "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" + + "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" + + "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" + + "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" + + "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" + + "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" + + "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" + + "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" + + "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" + + "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" + + "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff", + }, +} + +func getWHATWG(url string) string { + res, err := http.Get(url) + if err != nil { + log.Fatalf("%q: Get: %v", url, err) + } + defer res.Body.Close() + + mapping := make([]rune, 128) + for i := range mapping { + mapping[i] = '\ufffd' + } + + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + x, y := 0, 0 + if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil { + log.Fatalf("could not parse %q", s) + } + if x < 0 || 128 <= x { + log.Fatalf("code %d is out of range", x) + } + if 0x80 <= y && y < 0xa0 { + // We diverge from the WHATWG spec by mapping control characters + // in the range [0x80, 0xa0) to U+FFFD. + continue + } + mapping[x] = rune(y) + } + return ascii + string(mapping) +} + +func getUCM(url string) string { + res, err := http.Get(url) + if err != nil { + log.Fatalf("%q: Get: %v", url, err) + } + defer res.Body.Close() + + mapping := make([]rune, 256) + for i := range mapping { + mapping[i] = '\ufffd' + } + + charsFound := 0 + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + var c byte + var r rune + if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil { + continue + } + mapping[c] = r + charsFound++ + } + + if charsFound < 200 { + log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound) + } + + return string(mapping) +} + +func main() { + mibs := map[string]bool{} + all := []string{} + + w := gen.NewCodeWriter() + defer w.WriteGoFile("tables.go", "charmap") + + printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) } + + printf("import (\n") + printf("\t\"golang.org/x/text/encoding\"\n") + printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n") + printf(")\n\n") + for _, e := range encodings { + varNames := strings.Split(e.varName, ",") + all = append(all, varNames...) + varName := varNames[0] + switch { + case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"): + e.mapping = getWHATWG(e.mapping) + case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"): + e.mapping = getUCM(e.mapping) + } + + asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00 + if asciiSuperset { + low = 0x80 + } + lvn := 1 + if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") { + lvn = 3 + } + lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:] + printf("// %s is the %s encoding.\n", varName, e.name) + if e.comment != "" { + printf("//\n// %s\n", e.comment) + } + printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n", + varName, lowerVarName, lowerVarName, e.name) + if mibs[e.mib] { + log.Fatalf("MIB type %q declared multiple times.", e.mib) + } + printf("mib: identifier.%s,\n", e.mib) + printf("asciiSuperset: %t,\n", asciiSuperset) + printf("low: 0x%02x,\n", low) + printf("replacement: 0x%02x,\n", e.replacement) + + printf("decode: [256]utf8Enc{\n") + i, backMapping := 0, map[rune]byte{} + for _, c := range e.mapping { + if _, ok := backMapping[c]; !ok && c != utf8.RuneError { + backMapping[c] = byte(i) + } + var buf [8]byte + n := utf8.EncodeRune(buf[:], c) + if n > 3 { + panic(fmt.Sprintf("rune %q (%U) is too long", c, c)) + } + printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2]) + if i%2 == 1 { + printf("\n") + } + i++ + } + printf("},\n") + + printf("encode: [256]uint32{\n") + encode := make([]uint32, 0, 256) + for c, i := range backMapping { + encode = append(encode, uint32(i)<<24|uint32(c)) + } + sort.Sort(byRune(encode)) + for len(encode) < cap(encode) { + encode = append(encode, encode[len(encode)-1]) + } + for i, enc := range encode { + printf("0x%08x,", enc) + if i%8 == 7 { + printf("\n") + } + } + printf("},\n}\n") + + // Add an estimate of the size of a single Charmap{} struct value, which + // includes two 256 elem arrays of 4 bytes and some extra fields, which + // align to 3 uint64s on 64-bit architectures. + w.Size += 2*4*256 + 3*8 + } + // TODO: add proper line breaking. + printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n")) +} + +type byRune []uint32 + +func (b byRune) Len() int { return len(b) } +func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff } +func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] } diff --git a/vendor/golang.org/x/text/encoding/htmlindex/gen.go b/vendor/golang.org/x/text/encoding/htmlindex/gen.go new file mode 100644 index 000000000..ac6b4a77f --- /dev/null +++ b/vendor/golang.org/x/text/encoding/htmlindex/gen.go @@ -0,0 +1,173 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "log" + "strings" + + "golang.org/x/text/internal/gen" +) + +type group struct { + Encodings []struct { + Labels []string + Name string + } +} + +func main() { + gen.Init() + + r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json") + var groups []group + if err := json.NewDecoder(r).Decode(&groups); err != nil { + log.Fatalf("Error reading encodings.json: %v", err) + } + + w := &bytes.Buffer{} + fmt.Fprintln(w, "type htmlEncoding byte") + fmt.Fprintln(w, "const (") + for i, g := range groups { + for _, e := range g.Encodings { + key := strings.ToLower(e.Name) + name := consts[key] + if name == "" { + log.Fatalf("No const defined for %s.", key) + } + if i == 0 { + fmt.Fprintf(w, "%s htmlEncoding = iota\n", name) + } else { + fmt.Fprintf(w, "%s\n", name) + } + } + } + fmt.Fprintln(w, "numEncodings") + fmt.Fprint(w, ")\n\n") + + fmt.Fprintln(w, "var canonical = [numEncodings]string{") + for _, g := range groups { + for _, e := range g.Encodings { + fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name)) + } + } + fmt.Fprint(w, "}\n\n") + + fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{") + for _, g := range groups { + for _, e := range g.Encodings { + for _, l := range e.Labels { + key := strings.ToLower(e.Name) + name := consts[key] + fmt.Fprintf(w, "%q: %s,\n", l, name) + } + } + } + fmt.Fprint(w, "}\n\n") + + var tags []string + fmt.Fprintln(w, "var localeMap = []htmlEncoding{") + for _, loc := range locales { + tags = append(tags, loc.tag) + fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag) + } + fmt.Fprint(w, "}\n\n") + + fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " ")) + + gen.WriteGoFile("tables.go", "htmlindex", w.Bytes()) +} + +// consts maps canonical encoding name to internal constant. +var consts = map[string]string{ + "utf-8": "utf8", + "ibm866": "ibm866", + "iso-8859-2": "iso8859_2", + "iso-8859-3": "iso8859_3", + "iso-8859-4": "iso8859_4", + "iso-8859-5": "iso8859_5", + "iso-8859-6": "iso8859_6", + "iso-8859-7": "iso8859_7", + "iso-8859-8": "iso8859_8", + "iso-8859-8-i": "iso8859_8I", + "iso-8859-10": "iso8859_10", + "iso-8859-13": "iso8859_13", + "iso-8859-14": "iso8859_14", + "iso-8859-15": "iso8859_15", + "iso-8859-16": "iso8859_16", + "koi8-r": "koi8r", + "koi8-u": "koi8u", + "macintosh": "macintosh", + "windows-874": "windows874", + "windows-1250": "windows1250", + "windows-1251": "windows1251", + "windows-1252": "windows1252", + "windows-1253": "windows1253", + "windows-1254": "windows1254", + "windows-1255": "windows1255", + "windows-1256": "windows1256", + "windows-1257": "windows1257", + "windows-1258": "windows1258", + "x-mac-cyrillic": "macintoshCyrillic", + "gbk": "gbk", + "gb18030": "gb18030", + // "hz-gb-2312": "hzgb2312", // Was removed from WhatWG + "big5": "big5", + "euc-jp": "eucjp", + "iso-2022-jp": "iso2022jp", + "shift_jis": "shiftJIS", + "euc-kr": "euckr", + "replacement": "replacement", + "utf-16be": "utf16be", + "utf-16le": "utf16le", + "x-user-defined": "xUserDefined", +} + +// locales is taken from +// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm. +var locales = []struct{ tag, name string }{ + // The default value. Explicitly state latin to benefit from the exact + // script option, while still making 1252 the default encoding for languages + // written in Latin script. + {"und_Latn", "windows-1252"}, + {"ar", "windows-1256"}, + {"ba", "windows-1251"}, + {"be", "windows-1251"}, + {"bg", "windows-1251"}, + {"cs", "windows-1250"}, + {"el", "iso-8859-7"}, + {"et", "windows-1257"}, + {"fa", "windows-1256"}, + {"he", "windows-1255"}, + {"hr", "windows-1250"}, + {"hu", "iso-8859-2"}, + {"ja", "shift_jis"}, + {"kk", "windows-1251"}, + {"ko", "euc-kr"}, + {"ku", "windows-1254"}, + {"ky", "windows-1251"}, + {"lt", "windows-1257"}, + {"lv", "windows-1257"}, + {"mk", "windows-1251"}, + {"pl", "iso-8859-2"}, + {"ru", "windows-1251"}, + {"sah", "windows-1251"}, + {"sk", "windows-1250"}, + {"sl", "iso-8859-2"}, + {"sr", "windows-1251"}, + {"tg", "windows-1251"}, + {"th", "windows-874"}, + {"tr", "windows-1254"}, + {"tt", "windows-1251"}, + {"uk", "windows-1251"}, + {"vi", "windows-1258"}, + {"zh-hans", "gb18030"}, + {"zh-hant", "big5"}, +} diff --git a/vendor/golang.org/x/text/encoding/internal/identifier/gen.go b/vendor/golang.org/x/text/encoding/internal/identifier/gen.go new file mode 100644 index 000000000..26cfef9c6 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/internal/identifier/gen.go @@ -0,0 +1,142 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +import ( + "bytes" + "encoding/xml" + "fmt" + "io" + "log" + "strings" + + "golang.org/x/text/internal/gen" +) + +type registry struct { + XMLName xml.Name `xml:"registry"` + Updated string `xml:"updated"` + Registry []struct { + ID string `xml:"id,attr"` + Record []struct { + Name string `xml:"name"` + Xref []struct { + Type string `xml:"type,attr"` + Data string `xml:"data,attr"` + } `xml:"xref"` + Desc struct { + Data string `xml:",innerxml"` + // Any []struct { + // Data string `xml:",chardata"` + // } `xml:",any"` + // Data string `xml:",chardata"` + } `xml:"description,"` + MIB string `xml:"value"` + Alias []string `xml:"alias"` + MIME string `xml:"preferred_alias"` + } `xml:"record"` + } `xml:"registry"` +} + +func main() { + r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml") + reg := ®istry{} + if err := xml.NewDecoder(r).Decode(®); err != nil && err != io.EOF { + log.Fatalf("Error decoding charset registry: %v", err) + } + if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" { + log.Fatalf("Unexpected ID %s", reg.Registry[0].ID) + } + + w := &bytes.Buffer{} + fmt.Fprintf(w, "const (\n") + for _, rec := range reg.Registry[0].Record { + constName := "" + for _, a := range rec.Alias { + if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 { + // Some of the constant definitions have comments in them. Strip those. + constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0]) + } + } + if constName == "" { + switch rec.MIB { + case "2085": + constName = "HZGB2312" // Not listed as alias for some reason. + default: + log.Fatalf("No cs alias defined for %s.", rec.MIB) + } + } + if rec.MIME != "" { + rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME) + } + fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME) + if len(rec.Desc.Data) > 0 { + fmt.Fprint(w, "// ") + d := xml.NewDecoder(strings.NewReader(rec.Desc.Data)) + inElem := true + attr := "" + for { + t, err := d.Token() + if err != nil { + if err != io.EOF { + log.Fatal(err) + } + break + } + switch x := t.(type) { + case xml.CharData: + attr = "" // Don't need attribute info. + a := bytes.Split([]byte(x), []byte("\n")) + for i, b := range a { + if b = bytes.TrimSpace(b); len(b) != 0 { + if !inElem && i > 0 { + fmt.Fprint(w, "\n// ") + } + inElem = false + fmt.Fprintf(w, "%s ", string(b)) + } + } + case xml.StartElement: + if x.Name.Local == "xref" { + inElem = true + use := false + for _, a := range x.Attr { + if a.Name.Local == "type" { + use = use || a.Value != "person" + } + if a.Name.Local == "data" && use { + // Patch up URLs to use https. From some links, the + // https version is different from the http one. + s := a.Value + s = strings.Replace(s, "http://", "https://", -1) + s = strings.Replace(s, "/unicode/", "/", -1) + attr = s + " " + } + } + } + case xml.EndElement: + inElem = false + fmt.Fprint(w, attr) + } + } + fmt.Fprint(w, "\n") + } + for _, x := range rec.Xref { + switch x.Type { + case "rfc": + fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data)) + case "uri": + fmt.Fprintf(w, "// Reference: %s\n", x.Data) + } + } + fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB) + fmt.Fprintln(w) + } + fmt.Fprintln(w, ")") + + gen.WriteGoFile("mib.go", "identifier", w.Bytes()) +} diff --git a/vendor/golang.org/x/text/encoding/internal/identifier/mib.go b/vendor/golang.org/x/text/encoding/internal/identifier/mib.go index 8cc29021c..fc7df1bc7 100644 --- a/vendor/golang.org/x/text/encoding/internal/identifier/mib.go +++ b/vendor/golang.org/x/text/encoding/internal/identifier/mib.go @@ -538,8 +538,6 @@ const ( // ISO111ECMACyrillic is the MIB identifier with IANA name ECMA-cyrillic. // // ISO registry - // (formerly ECMA - // registry ) ISO111ECMACyrillic MIB = 77 // ISO121Canadian1 is the MIB identifier with IANA name CSA_Z243.4-1985-1. @@ -732,18 +730,18 @@ const ( // ISO885913 is the MIB identifier with IANA name ISO-8859-13. // - // ISO See http://www.iana.org/assignments/charset-reg/ISO-8859-13 http://www.iana.org/assignments/charset-reg/ISO-8859-13 + // ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-13 https://www.iana.org/assignments/charset-reg/ISO-8859-13 ISO885913 MIB = 109 // ISO885914 is the MIB identifier with IANA name ISO-8859-14. // - // ISO See http://www.iana.org/assignments/charset-reg/ISO-8859-14 + // ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-14 ISO885914 MIB = 110 // ISO885915 is the MIB identifier with IANA name ISO-8859-15. // // ISO - // Please see: http://www.iana.org/assignments/charset-reg/ISO-8859-15 + // Please see: https://www.iana.org/assignments/charset-reg/ISO-8859-15 ISO885915 MIB = 111 // ISO885916 is the MIB identifier with IANA name ISO-8859-16. @@ -754,41 +752,41 @@ const ( // GBK is the MIB identifier with IANA name GBK. // // Chinese IT Standardization Technical Committee - // Please see: http://www.iana.org/assignments/charset-reg/GBK + // Please see: https://www.iana.org/assignments/charset-reg/GBK GBK MIB = 113 // GB18030 is the MIB identifier with IANA name GB18030. // // Chinese IT Standardization Technical Committee - // Please see: http://www.iana.org/assignments/charset-reg/GB18030 + // Please see: https://www.iana.org/assignments/charset-reg/GB18030 GB18030 MIB = 114 // OSDEBCDICDF0415 is the MIB identifier with IANA name OSD_EBCDIC_DF04_15. // // Fujitsu-Siemens standard mainframe EBCDIC encoding - // Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15 + // Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15 OSDEBCDICDF0415 MIB = 115 // OSDEBCDICDF03IRV is the MIB identifier with IANA name OSD_EBCDIC_DF03_IRV. // // Fujitsu-Siemens standard mainframe EBCDIC encoding - // Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV + // Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV OSDEBCDICDF03IRV MIB = 116 // OSDEBCDICDF041 is the MIB identifier with IANA name OSD_EBCDIC_DF04_1. // // Fujitsu-Siemens standard mainframe EBCDIC encoding - // Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1 + // Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1 OSDEBCDICDF041 MIB = 117 // ISO115481 is the MIB identifier with IANA name ISO-11548-1. // - // See http://www.iana.org/assignments/charset-reg/ISO-11548-1 + // See https://www.iana.org/assignments/charset-reg/ISO-11548-1 ISO115481 MIB = 118 // KZ1048 is the MIB identifier with IANA name KZ-1048. // - // See http://www.iana.org/assignments/charset-reg/KZ-1048 + // See https://www.iana.org/assignments/charset-reg/KZ-1048 KZ1048 MIB = 119 // Unicode is the MIB identifier with IANA name ISO-10646-UCS-2. @@ -855,7 +853,7 @@ const ( // SCSU is the MIB identifier with IANA name SCSU. // - // SCSU See http://www.iana.org/assignments/charset-reg/SCSU + // SCSU See https://www.iana.org/assignments/charset-reg/SCSU SCSU MIB = 1011 // UTF7 is the MIB identifier with IANA name UTF-7. @@ -884,22 +882,22 @@ const ( // CESU8 is the MIB identifier with IANA name CESU-8. // - // https://www.unicode.org/unicode/reports/tr26 + // https://www.unicode.org/reports/tr26 CESU8 MIB = 1016 // UTF32 is the MIB identifier with IANA name UTF-32. // - // https://www.unicode.org/unicode/reports/tr19/ + // https://www.unicode.org/reports/tr19/ UTF32 MIB = 1017 // UTF32BE is the MIB identifier with IANA name UTF-32BE. // - // https://www.unicode.org/unicode/reports/tr19/ + // https://www.unicode.org/reports/tr19/ UTF32BE MIB = 1018 // UTF32LE is the MIB identifier with IANA name UTF-32LE. // - // https://www.unicode.org/unicode/reports/tr19/ + // https://www.unicode.org/reports/tr19/ UTF32LE MIB = 1019 // BOCU1 is the MIB identifier with IANA name BOCU-1. @@ -1461,152 +1459,152 @@ const ( // IBM00858 is the MIB identifier with IANA name IBM00858. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM00858 + // IBM See https://www.iana.org/assignments/charset-reg/IBM00858 IBM00858 MIB = 2089 // IBM00924 is the MIB identifier with IANA name IBM00924. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM00924 + // IBM See https://www.iana.org/assignments/charset-reg/IBM00924 IBM00924 MIB = 2090 // IBM01140 is the MIB identifier with IANA name IBM01140. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01140 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01140 IBM01140 MIB = 2091 // IBM01141 is the MIB identifier with IANA name IBM01141. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01141 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01141 IBM01141 MIB = 2092 // IBM01142 is the MIB identifier with IANA name IBM01142. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01142 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01142 IBM01142 MIB = 2093 // IBM01143 is the MIB identifier with IANA name IBM01143. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01143 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01143 IBM01143 MIB = 2094 // IBM01144 is the MIB identifier with IANA name IBM01144. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01144 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01144 IBM01144 MIB = 2095 // IBM01145 is the MIB identifier with IANA name IBM01145. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01145 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01145 IBM01145 MIB = 2096 // IBM01146 is the MIB identifier with IANA name IBM01146. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01146 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01146 IBM01146 MIB = 2097 // IBM01147 is the MIB identifier with IANA name IBM01147. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01147 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01147 IBM01147 MIB = 2098 // IBM01148 is the MIB identifier with IANA name IBM01148. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01148 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01148 IBM01148 MIB = 2099 // IBM01149 is the MIB identifier with IANA name IBM01149. // - // IBM See http://www.iana.org/assignments/charset-reg/IBM01149 + // IBM See https://www.iana.org/assignments/charset-reg/IBM01149 IBM01149 MIB = 2100 // Big5HKSCS is the MIB identifier with IANA name Big5-HKSCS. // - // See http://www.iana.org/assignments/charset-reg/Big5-HKSCS + // See https://www.iana.org/assignments/charset-reg/Big5-HKSCS Big5HKSCS MIB = 2101 // IBM1047 is the MIB identifier with IANA name IBM1047. // - // IBM1047 (EBCDIC Latin 1/Open Systems) http://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf + // IBM1047 (EBCDIC Latin 1/Open Systems) https://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf IBM1047 MIB = 2102 // PTCP154 is the MIB identifier with IANA name PTCP154. // - // See http://www.iana.org/assignments/charset-reg/PTCP154 + // See https://www.iana.org/assignments/charset-reg/PTCP154 PTCP154 MIB = 2103 // Amiga1251 is the MIB identifier with IANA name Amiga-1251. // - // See http://www.amiga.ultranet.ru/Amiga-1251.html + // See https://www.amiga.ultranet.ru/Amiga-1251.html Amiga1251 MIB = 2104 // KOI7switched is the MIB identifier with IANA name KOI7-switched. // - // See http://www.iana.org/assignments/charset-reg/KOI7-switched + // See https://www.iana.org/assignments/charset-reg/KOI7-switched KOI7switched MIB = 2105 // BRF is the MIB identifier with IANA name BRF. // - // See http://www.iana.org/assignments/charset-reg/BRF + // See https://www.iana.org/assignments/charset-reg/BRF BRF MIB = 2106 // TSCII is the MIB identifier with IANA name TSCII. // - // See http://www.iana.org/assignments/charset-reg/TSCII + // See https://www.iana.org/assignments/charset-reg/TSCII TSCII MIB = 2107 // CP51932 is the MIB identifier with IANA name CP51932. // - // See http://www.iana.org/assignments/charset-reg/CP51932 + // See https://www.iana.org/assignments/charset-reg/CP51932 CP51932 MIB = 2108 // Windows874 is the MIB identifier with IANA name windows-874. // - // See http://www.iana.org/assignments/charset-reg/windows-874 + // See https://www.iana.org/assignments/charset-reg/windows-874 Windows874 MIB = 2109 // Windows1250 is the MIB identifier with IANA name windows-1250. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1250 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1250 Windows1250 MIB = 2250 // Windows1251 is the MIB identifier with IANA name windows-1251. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1251 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1251 Windows1251 MIB = 2251 // Windows1252 is the MIB identifier with IANA name windows-1252. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1252 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1252 Windows1252 MIB = 2252 // Windows1253 is the MIB identifier with IANA name windows-1253. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1253 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1253 Windows1253 MIB = 2253 // Windows1254 is the MIB identifier with IANA name windows-1254. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1254 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1254 Windows1254 MIB = 2254 // Windows1255 is the MIB identifier with IANA name windows-1255. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1255 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1255 Windows1255 MIB = 2255 // Windows1256 is the MIB identifier with IANA name windows-1256. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1256 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1256 Windows1256 MIB = 2256 // Windows1257 is the MIB identifier with IANA name windows-1257. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1257 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1257 Windows1257 MIB = 2257 // Windows1258 is the MIB identifier with IANA name windows-1258. // - // Microsoft http://www.iana.org/assignments/charset-reg/windows-1258 + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1258 Windows1258 MIB = 2258 // TIS620 is the MIB identifier with IANA name TIS-620. @@ -1616,6 +1614,6 @@ const ( // CP50220 is the MIB identifier with IANA name CP50220. // - // See http://www.iana.org/assignments/charset-reg/CP50220 + // See https://www.iana.org/assignments/charset-reg/CP50220 CP50220 MIB = 2260 ) diff --git a/vendor/golang.org/x/text/encoding/japanese/maketables.go b/vendor/golang.org/x/text/encoding/japanese/maketables.go new file mode 100644 index 000000000..023957a67 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/japanese/maketables.go @@ -0,0 +1,161 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +// This program generates tables.go: +// go run maketables.go | gofmt > tables.go + +// TODO: Emoji extensions? +// https://www.unicode.org/faq/emoji_dingbats.html +// https://www.unicode.org/Public/UNIDATA/EmojiSources.txt + +import ( + "bufio" + "fmt" + "log" + "net/http" + "sort" + "strings" +) + +type entry struct { + jisCode, table int +} + +func main() { + fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") + fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n") + fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n") + + reverse := [65536]entry{} + for i := range reverse { + reverse[i].table = -1 + } + + tables := []struct { + url string + name string + }{ + {"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"}, + {"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"}, + } + for i, table := range tables { + res, err := http.Get(table.url) + if err != nil { + log.Fatalf("%q: Get: %v", table.url, err) + } + defer res.Body.Close() + + mapping := [65536]uint16{} + + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + x, y := 0, uint16(0) + if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { + log.Fatalf("%q: could not parse %q", table.url, s) + } + if x < 0 || 120*94 <= x { + log.Fatalf("%q: JIS code %d is out of range", table.url, x) + } + mapping[x] = y + if reverse[y].table == -1 { + reverse[y] = entry{jisCode: x, table: i} + } + } + if err := scanner.Err(); err != nil { + log.Fatalf("%q: scanner error: %v", table.url, err) + } + + fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n", + table.name, table.name, table.url) + fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name) + for i, m := range mapping { + if m != 0 { + fmt.Printf("\t%d: 0x%04X,\n", i, m) + } + } + fmt.Printf("}\n\n") + } + + // Any run of at least separation continuous zero entries in the reverse map will + // be a separate encode table. + const separation = 1024 + + intervals := []interval(nil) + low, high := -1, -1 + for i, v := range reverse { + if v.table == -1 { + continue + } + if low < 0 { + low = i + } else if i-high >= separation { + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + low = i + } + high = i + 1 + } + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + sort.Sort(byDecreasingLength(intervals)) + + fmt.Printf("const (\n") + fmt.Printf("\tjis0208 = 1\n") + fmt.Printf("\tjis0212 = 2\n") + fmt.Printf("\tcodeMask = 0x7f\n") + fmt.Printf("\tcodeShift = 7\n") + fmt.Printf("\ttableShift = 14\n") + fmt.Printf(")\n\n") + + fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) + fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n") + fmt.Printf("// sorted by decreasing length.\n") + for i, v := range intervals { + fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) + } + fmt.Printf("//\n") + fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n") + fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n") + fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n") + fmt.Printf("// JIS code (94*j1 + j2) within that table.\n") + fmt.Printf("\n") + + for i, v := range intervals { + fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) + fmt.Printf("var encode%d = [...]uint16{\n", i) + for j := v.low; j < v.high; j++ { + x := reverse[j] + if x.table == -1 { + continue + } + fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n", + j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94) + } + fmt.Printf("}\n\n") + } +} + +// interval is a half-open interval [low, high). +type interval struct { + low, high int +} + +func (i interval) len() int { return i.high - i.low } + +// byDecreasingLength sorts intervals by decreasing length. +type byDecreasingLength []interval + +func (b byDecreasingLength) Len() int { return len(b) } +func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } +func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } diff --git a/vendor/golang.org/x/text/encoding/korean/maketables.go b/vendor/golang.org/x/text/encoding/korean/maketables.go new file mode 100644 index 000000000..c84034fb6 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/korean/maketables.go @@ -0,0 +1,143 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +// This program generates tables.go: +// go run maketables.go | gofmt > tables.go + +import ( + "bufio" + "fmt" + "log" + "net/http" + "sort" + "strings" +) + +func main() { + fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") + fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n") + fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n") + + res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt") + if err != nil { + log.Fatalf("Get: %v", err) + } + defer res.Body.Close() + + mapping := [65536]uint16{} + reverse := [65536]uint16{} + + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + x, y := uint16(0), uint16(0) + if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { + log.Fatalf("could not parse %q", s) + } + if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x { + log.Fatalf("EUC-KR code %d is out of range", x) + } + mapping[x] = y + if reverse[y] == 0 { + c0, c1 := uint16(0), uint16(0) + if x < 178*(0xc7-0x81) { + c0 = uint16(x/178) + 0x81 + c1 = uint16(x % 178) + switch { + case c1 < 1*26: + c1 += 0x41 + case c1 < 2*26: + c1 += 0x47 + default: + c1 += 0x4d + } + } else { + x -= 178 * (0xc7 - 0x81) + c0 = uint16(x/94) + 0xc7 + c1 = uint16(x%94) + 0xa1 + } + reverse[y] = c0<<8 | c1 + } + } + if err := scanner.Err(); err != nil { + log.Fatalf("scanner error: %v", err) + } + + fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n") + fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n") + fmt.Printf("var decode = [...]uint16{\n") + for i, v := range mapping { + if v != 0 { + fmt.Printf("\t%d: 0x%04X,\n", i, v) + } + } + fmt.Printf("}\n\n") + + // Any run of at least separation continuous zero entries in the reverse map will + // be a separate encode table. + const separation = 1024 + + intervals := []interval(nil) + low, high := -1, -1 + for i, v := range reverse { + if v == 0 { + continue + } + if low < 0 { + low = i + } else if i-high >= separation { + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + low = i + } + high = i + 1 + } + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + sort.Sort(byDecreasingLength(intervals)) + + fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) + fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n") + fmt.Printf("// sorted by decreasing length.\n") + for i, v := range intervals { + fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) + } + fmt.Printf("\n") + + for i, v := range intervals { + fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) + fmt.Printf("var encode%d = [...]uint16{\n", i) + for j := v.low; j < v.high; j++ { + x := reverse[j] + if x == 0 { + continue + } + fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) + } + fmt.Printf("}\n\n") + } +} + +// interval is a half-open interval [low, high). +type interval struct { + low, high int +} + +func (i interval) len() int { return i.high - i.low } + +// byDecreasingLength sorts intervals by decreasing length. +type byDecreasingLength []interval + +func (b byDecreasingLength) Len() int { return len(b) } +func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } +func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } diff --git a/vendor/golang.org/x/text/encoding/simplifiedchinese/maketables.go b/vendor/golang.org/x/text/encoding/simplifiedchinese/maketables.go new file mode 100644 index 000000000..55016c786 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/simplifiedchinese/maketables.go @@ -0,0 +1,161 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +// This program generates tables.go: +// go run maketables.go | gofmt > tables.go + +import ( + "bufio" + "fmt" + "log" + "net/http" + "sort" + "strings" +) + +func main() { + fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") + fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n") + fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n") + + printGB18030() + printGBK() +} + +func printGB18030() { + res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt") + if err != nil { + log.Fatalf("Get: %v", err) + } + defer res.Body.Close() + + fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n") + fmt.Printf("var gb18030 = [...][2]uint16{\n") + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + x, y := uint32(0), uint32(0) + if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { + log.Fatalf("could not parse %q", s) + } + if x < 0x10000 && y < 0x10000 { + fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y) + } + } + fmt.Printf("}\n\n") +} + +func printGBK() { + res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt") + if err != nil { + log.Fatalf("Get: %v", err) + } + defer res.Body.Close() + + mapping := [65536]uint16{} + reverse := [65536]uint16{} + + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + x, y := uint16(0), uint16(0) + if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { + log.Fatalf("could not parse %q", s) + } + if x < 0 || 126*190 <= x { + log.Fatalf("GBK code %d is out of range", x) + } + mapping[x] = y + if reverse[y] == 0 { + c0, c1 := x/190, x%190 + if c1 >= 0x3f { + c1++ + } + reverse[y] = (0x81+c0)<<8 | (0x40 + c1) + } + } + if err := scanner.Err(); err != nil { + log.Fatalf("scanner error: %v", err) + } + + fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n") + fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n") + fmt.Printf("var decode = [...]uint16{\n") + for i, v := range mapping { + if v != 0 { + fmt.Printf("\t%d: 0x%04X,\n", i, v) + } + } + fmt.Printf("}\n\n") + + // Any run of at least separation continuous zero entries in the reverse map will + // be a separate encode table. + const separation = 1024 + + intervals := []interval(nil) + low, high := -1, -1 + for i, v := range reverse { + if v == 0 { + continue + } + if low < 0 { + low = i + } else if i-high >= separation { + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + low = i + } + high = i + 1 + } + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + sort.Sort(byDecreasingLength(intervals)) + + fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) + fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n") + fmt.Printf("// sorted by decreasing length.\n") + for i, v := range intervals { + fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) + } + fmt.Printf("\n") + + for i, v := range intervals { + fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) + fmt.Printf("var encode%d = [...]uint16{\n", i) + for j := v.low; j < v.high; j++ { + x := reverse[j] + if x == 0 { + continue + } + fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) + } + fmt.Printf("}\n\n") + } +} + +// interval is a half-open interval [low, high). +type interval struct { + low, high int +} + +func (i interval) len() int { return i.high - i.low } + +// byDecreasingLength sorts intervals by decreasing length. +type byDecreasingLength []interval + +func (b byDecreasingLength) Len() int { return len(b) } +func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } +func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } diff --git a/vendor/golang.org/x/text/encoding/traditionalchinese/maketables.go b/vendor/golang.org/x/text/encoding/traditionalchinese/maketables.go new file mode 100644 index 000000000..cf7fdb31a --- /dev/null +++ b/vendor/golang.org/x/text/encoding/traditionalchinese/maketables.go @@ -0,0 +1,140 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +// This program generates tables.go: +// go run maketables.go | gofmt > tables.go + +import ( + "bufio" + "fmt" + "log" + "net/http" + "sort" + "strings" +) + +func main() { + fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") + fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n") + fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n") + + res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt") + if err != nil { + log.Fatalf("Get: %v", err) + } + defer res.Body.Close() + + mapping := [65536]uint32{} + reverse := [65536 * 4]uint16{} + + scanner := bufio.NewScanner(res.Body) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if s == "" || s[0] == '#' { + continue + } + x, y := uint16(0), uint32(0) + if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { + log.Fatalf("could not parse %q", s) + } + if x < 0 || 126*157 <= x { + log.Fatalf("Big5 code %d is out of range", x) + } + mapping[x] = y + + // The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that + // "The index pointer for code point in index is the first pointer + // corresponding to code point in index", which would normally mean + // that the code below should be guarded by "if reverse[y] == 0", but + // last instead of first seems to match the behavior of + // "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in + // http://encoding.spec.whatwg.org/index-big5.txt, as index 2148 + // (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc") + // and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc". + c0, c1 := x/157, x%157 + if c1 < 0x3f { + c1 += 0x40 + } else { + c1 += 0x62 + } + reverse[y] = (0x81+c0)<<8 | c1 + } + if err := scanner.Err(); err != nil { + log.Fatalf("scanner error: %v", err) + } + + fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n") + fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n") + fmt.Printf("var decode = [...]uint32{\n") + for i, v := range mapping { + if v != 0 { + fmt.Printf("\t%d: 0x%08X,\n", i, v) + } + } + fmt.Printf("}\n\n") + + // Any run of at least separation continuous zero entries in the reverse map will + // be a separate encode table. + const separation = 1024 + + intervals := []interval(nil) + low, high := -1, -1 + for i, v := range reverse { + if v == 0 { + continue + } + if low < 0 { + low = i + } else if i-high >= separation { + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + low = i + } + high = i + 1 + } + if high >= 0 { + intervals = append(intervals, interval{low, high}) + } + sort.Sort(byDecreasingLength(intervals)) + + fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) + fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n") + fmt.Printf("// sorted by decreasing length.\n") + for i, v := range intervals { + fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high) + } + fmt.Printf("\n") + + for i, v := range intervals { + fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) + fmt.Printf("var encode%d = [...]uint16{\n", i) + for j := v.low; j < v.high; j++ { + x := reverse[j] + if x == 0 { + continue + } + fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) + } + fmt.Printf("}\n\n") + } +} + +// interval is a half-open interval [low, high). +type interval struct { + low, high int +} + +func (i interval) len() int { return i.high - i.low } + +// byDecreasingLength sorts intervals by decreasing length. +type byDecreasingLength []interval + +func (b byDecreasingLength) Len() int { return len(b) } +func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } +func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |