diff options
Diffstat (limited to 'vendor/golang.org/x/text/language/parse.go')
-rw-r--r-- | vendor/golang.org/x/text/language/parse.go | 737 |
1 files changed, 53 insertions, 684 deletions
diff --git a/vendor/golang.org/x/text/language/parse.go b/vendor/golang.org/x/text/language/parse.go index cfa28f56e..11acfd885 100644 --- a/vendor/golang.org/x/text/language/parse.go +++ b/vendor/golang.org/x/text/language/parse.go @@ -5,216 +5,21 @@ package language import ( - "bytes" "errors" - "fmt" - "sort" "strconv" "strings" - "golang.org/x/text/internal/tag" + "golang.org/x/text/internal/language" ) -// isAlpha returns true if the byte is not a digit. -// b must be an ASCII letter or digit. -func isAlpha(b byte) bool { - return b > '9' -} - -// isAlphaNum returns true if the string contains only ASCII letters or digits. -func isAlphaNum(s []byte) bool { - for _, c := range s { - if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { - return false - } - } - return true -} - -// errSyntax is returned by any of the parsing functions when the -// input is not well-formed, according to BCP 47. -// TODO: return the position at which the syntax error occurred? -var errSyntax = errors.New("language: tag is not well-formed") - // ValueError is returned by any of the parsing functions when the // input is well-formed but the respective subtag is not recognized // as a valid value. -type ValueError struct { - v [8]byte -} - -func mkErrInvalid(s []byte) error { - var e ValueError - copy(e.v[:], s) - return e -} - -func (e ValueError) tag() []byte { - n := bytes.IndexByte(e.v[:], 0) - if n == -1 { - n = 8 - } - return e.v[:n] -} - -// Error implements the error interface. -func (e ValueError) Error() string { - return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag()) -} - -// Subtag returns the subtag for which the error occurred. -func (e ValueError) Subtag() string { - return string(e.tag()) -} - -// scanner is used to scan BCP 47 tokens, which are separated by _ or -. -type scanner struct { - b []byte - bytes [max99thPercentileSize]byte - token []byte - start int // start position of the current token - end int // end position of the current token - next int // next point for scan - err error - done bool -} - -func makeScannerString(s string) scanner { - scan := scanner{} - if len(s) <= len(scan.bytes) { - scan.b = scan.bytes[:copy(scan.bytes[:], s)] - } else { - scan.b = []byte(s) - } - scan.init() - return scan -} - -// makeScanner returns a scanner using b as the input buffer. -// b is not copied and may be modified by the scanner routines. -func makeScanner(b []byte) scanner { - scan := scanner{b: b} - scan.init() - return scan -} - -func (s *scanner) init() { - for i, c := range s.b { - if c == '_' { - s.b[i] = '-' - } - } - s.scan() -} - -// restToLower converts the string between start and end to lower case. -func (s *scanner) toLower(start, end int) { - for i := start; i < end; i++ { - c := s.b[i] - if 'A' <= c && c <= 'Z' { - s.b[i] += 'a' - 'A' - } - } -} +type ValueError interface { + error -func (s *scanner) setError(e error) { - if s.err == nil || (e == errSyntax && s.err != errSyntax) { - s.err = e - } -} - -// resizeRange shrinks or grows the array at position oldStart such that -// a new string of size newSize can fit between oldStart and oldEnd. -// Sets the scan point to after the resized range. -func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) { - s.start = oldStart - if end := oldStart + newSize; end != oldEnd { - diff := end - oldEnd - if end < cap(s.b) { - b := make([]byte, len(s.b)+diff) - copy(b, s.b[:oldStart]) - copy(b[end:], s.b[oldEnd:]) - s.b = b - } else { - s.b = append(s.b[end:], s.b[oldEnd:]...) - } - s.next = end + (s.next - s.end) - s.end = end - } -} - -// replace replaces the current token with repl. -func (s *scanner) replace(repl string) { - s.resizeRange(s.start, s.end, len(repl)) - copy(s.b[s.start:], repl) -} - -// gobble removes the current token from the input. -// Caller must call scan after calling gobble. -func (s *scanner) gobble(e error) { - s.setError(e) - if s.start == 0 { - s.b = s.b[:+copy(s.b, s.b[s.next:])] - s.end = 0 - } else { - s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])] - s.end = s.start - 1 - } - s.next = s.start -} - -// deleteRange removes the given range from s.b before the current token. -func (s *scanner) deleteRange(start, end int) { - s.setError(errSyntax) - s.b = s.b[:start+copy(s.b[start:], s.b[end:])] - diff := end - start - s.next -= diff - s.start -= diff - s.end -= diff -} - -// scan parses the next token of a BCP 47 string. Tokens that are larger -// than 8 characters or include non-alphanumeric characters result in an error -// and are gobbled and removed from the output. -// It returns the end position of the last token consumed. -func (s *scanner) scan() (end int) { - end = s.end - s.token = nil - for s.start = s.next; s.next < len(s.b); { - i := bytes.IndexByte(s.b[s.next:], '-') - if i == -1 { - s.end = len(s.b) - s.next = len(s.b) - i = s.end - s.start - } else { - s.end = s.next + i - s.next = s.end + 1 - } - token := s.b[s.start:s.end] - if i < 1 || i > 8 || !isAlphaNum(token) { - s.gobble(errSyntax) - continue - } - s.token = token - return end - } - if n := len(s.b); n > 0 && s.b[n-1] == '-' { - s.setError(errSyntax) - s.b = s.b[:len(s.b)-1] - } - s.done = true - return end -} - -// acceptMinSize parses multiple tokens of the given size or greater. -// It returns the end position of the last token consumed. -func (s *scanner) acceptMinSize(min int) (end int) { - end = s.end - s.scan() - for ; len(s.token) >= min; s.scan() { - end = s.end - } - return end + // Subtag returns the subtag for which the error occurred. + Subtag() string } // Parse parses the given BCP 47 string and returns a valid Tag. If parsing @@ -223,7 +28,7 @@ func (s *scanner) acceptMinSize(min int) (end int) { // ValueError. The Tag returned in this case is just stripped of the unknown // value. All other values are preserved. It accepts tags in the BCP 47 format // and extensions to this standard defined in -// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. +// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. // The resulting tag is canonicalized using the default canonicalization type. func Parse(s string) (t Tag, err error) { return Default.Parse(s) @@ -235,327 +40,18 @@ func Parse(s string) (t Tag, err error) { // ValueError. The Tag returned in this case is just stripped of the unknown // value. All other values are preserved. It accepts tags in the BCP 47 format // and extensions to this standard defined in -// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. -// The resulting tag is canonicalized using the the canonicalization type c. +// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. +// The resulting tag is canonicalized using the canonicalization type c. func (c CanonType) Parse(s string) (t Tag, err error) { - // TODO: consider supporting old-style locale key-value pairs. - if s == "" { - return und, errSyntax - } - if len(s) <= maxAltTaglen { - b := [maxAltTaglen]byte{} - for i, c := range s { - // Generating invalid UTF-8 is okay as it won't match. - if 'A' <= c && c <= 'Z' { - c += 'a' - 'A' - } else if c == '_' { - c = '-' - } - b[i] = byte(c) - } - if t, ok := grandfathered(b); ok { - return t, nil - } + tt, err := language.Parse(s) + if err != nil { + return makeTag(tt), err } - scan := makeScannerString(s) - t, err = parse(&scan, s) - t, changed := t.canonicalize(c) + tt, changed := canonicalize(c, tt) if changed { - t.remakeString() - } - return t, err -} - -func parse(scan *scanner, s string) (t Tag, err error) { - t = und - var end int - if n := len(scan.token); n <= 1 { - scan.toLower(0, len(scan.b)) - if n == 0 || scan.token[0] != 'x' { - return t, errSyntax - } - end = parseExtensions(scan) - } else if n >= 4 { - return und, errSyntax - } else { // the usual case - t, end = parseTag(scan) - if n := len(scan.token); n == 1 { - t.pExt = uint16(end) - end = parseExtensions(scan) - } else if end < len(scan.b) { - scan.setError(errSyntax) - scan.b = scan.b[:end] - } - } - if int(t.pVariant) < len(scan.b) { - if end < len(s) { - s = s[:end] - } - if len(s) > 0 && tag.Compare(s, scan.b) == 0 { - t.str = s - } else { - t.str = string(scan.b) - } - } else { - t.pVariant, t.pExt = 0, 0 - } - return t, scan.err -} - -// parseTag parses language, script, region and variants. -// It returns a Tag and the end position in the input that was parsed. -func parseTag(scan *scanner) (t Tag, end int) { - var e error - // TODO: set an error if an unknown lang, script or region is encountered. - t.lang, e = getLangID(scan.token) - scan.setError(e) - scan.replace(t.lang.String()) - langStart := scan.start - end = scan.scan() - for len(scan.token) == 3 && isAlpha(scan.token[0]) { - // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent - // to a tag of the form <extlang>. - lang, e := getLangID(scan.token) - if lang != 0 { - t.lang = lang - copy(scan.b[langStart:], lang.String()) - scan.b[langStart+3] = '-' - scan.start = langStart + 4 - } - scan.gobble(e) - end = scan.scan() - } - if len(scan.token) == 4 && isAlpha(scan.token[0]) { - t.script, e = getScriptID(script, scan.token) - if t.script == 0 { - scan.gobble(e) - } - end = scan.scan() - } - if n := len(scan.token); n >= 2 && n <= 3 { - t.region, e = getRegionID(scan.token) - if t.region == 0 { - scan.gobble(e) - } else { - scan.replace(t.region.String()) - } - end = scan.scan() - } - scan.toLower(scan.start, len(scan.b)) - t.pVariant = byte(end) - end = parseVariants(scan, end, t) - t.pExt = uint16(end) - return t, end -} - -var separator = []byte{'-'} - -// parseVariants scans tokens as long as each token is a valid variant string. -// Duplicate variants are removed. -func parseVariants(scan *scanner, end int, t Tag) int { - start := scan.start - varIDBuf := [4]uint8{} - variantBuf := [4][]byte{} - varID := varIDBuf[:0] - variant := variantBuf[:0] - last := -1 - needSort := false - for ; len(scan.token) >= 4; scan.scan() { - // TODO: measure the impact of needing this conversion and redesign - // the data structure if there is an issue. - v, ok := variantIndex[string(scan.token)] - if !ok { - // unknown variant - // TODO: allow user-defined variants? - scan.gobble(mkErrInvalid(scan.token)) - continue - } - varID = append(varID, v) - variant = append(variant, scan.token) - if !needSort { - if last < int(v) { - last = int(v) - } else { - needSort = true - // There is no legal combinations of more than 7 variants - // (and this is by no means a useful sequence). - const maxVariants = 8 - if len(varID) > maxVariants { - break - } - } - } - end = scan.end - } - if needSort { - sort.Sort(variantsSort{varID, variant}) - k, l := 0, -1 - for i, v := range varID { - w := int(v) - if l == w { - // Remove duplicates. - continue - } - varID[k] = varID[i] - variant[k] = variant[i] - k++ - l = w - } - if str := bytes.Join(variant[:k], separator); len(str) == 0 { - end = start - 1 - } else { - scan.resizeRange(start, end, len(str)) - copy(scan.b[scan.start:], str) - end = scan.end - } - } - return end -} - -type variantsSort struct { - i []uint8 - v [][]byte -} - -func (s variantsSort) Len() int { - return len(s.i) -} - -func (s variantsSort) Swap(i, j int) { - s.i[i], s.i[j] = s.i[j], s.i[i] - s.v[i], s.v[j] = s.v[j], s.v[i] -} - -func (s variantsSort) Less(i, j int) bool { - return s.i[i] < s.i[j] -} - -type bytesSort [][]byte - -func (b bytesSort) Len() int { - return len(b) -} - -func (b bytesSort) Swap(i, j int) { - b[i], b[j] = b[j], b[i] -} - -func (b bytesSort) Less(i, j int) bool { - return bytes.Compare(b[i], b[j]) == -1 -} - -// parseExtensions parses and normalizes the extensions in the buffer. -// It returns the last position of scan.b that is part of any extension. -// It also trims scan.b to remove excess parts accordingly. -func parseExtensions(scan *scanner) int { - start := scan.start - exts := [][]byte{} - private := []byte{} - end := scan.end - for len(scan.token) == 1 { - extStart := scan.start - ext := scan.token[0] - end = parseExtension(scan) - extension := scan.b[extStart:end] - if len(extension) < 3 || (ext != 'x' && len(extension) < 4) { - scan.setError(errSyntax) - end = extStart - continue - } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) { - scan.b = scan.b[:end] - return end - } else if ext == 'x' { - private = extension - break - } - exts = append(exts, extension) + tt.RemakeString() } - sort.Sort(bytesSort(exts)) - if len(private) > 0 { - exts = append(exts, private) - } - scan.b = scan.b[:start] - if len(exts) > 0 { - scan.b = append(scan.b, bytes.Join(exts, separator)...) - } else if start > 0 { - // Strip trailing '-'. - scan.b = scan.b[:start-1] - } - return end -} - -// parseExtension parses a single extension and returns the position of -// the extension end. -func parseExtension(scan *scanner) int { - start, end := scan.start, scan.end - switch scan.token[0] { - case 'u': - attrStart := end - scan.scan() - for last := []byte{}; len(scan.token) > 2; scan.scan() { - if bytes.Compare(scan.token, last) != -1 { - // Attributes are unsorted. Start over from scratch. - p := attrStart + 1 - scan.next = p - attrs := [][]byte{} - for scan.scan(); len(scan.token) > 2; scan.scan() { - attrs = append(attrs, scan.token) - end = scan.end - } - sort.Sort(bytesSort(attrs)) - copy(scan.b[p:], bytes.Join(attrs, separator)) - break - } - last = scan.token - end = scan.end - } - var last, key []byte - for attrEnd := end; len(scan.token) == 2; last = key { - key = scan.token - keyEnd := scan.end - end = scan.acceptMinSize(3) - // TODO: check key value validity - if keyEnd == end || bytes.Compare(key, last) != 1 { - // We have an invalid key or the keys are not sorted. - // Start scanning keys from scratch and reorder. - p := attrEnd + 1 - scan.next = p - keys := [][]byte{} - for scan.scan(); len(scan.token) == 2; { - keyStart, keyEnd := scan.start, scan.end - end = scan.acceptMinSize(3) - if keyEnd != end { - keys = append(keys, scan.b[keyStart:end]) - } else { - scan.setError(errSyntax) - end = keyStart - } - } - sort.Sort(bytesSort(keys)) - reordered := bytes.Join(keys, separator) - if e := p + len(reordered); e < end { - scan.deleteRange(e, end) - end = e - } - copy(scan.b[p:], bytes.Join(keys, separator)) - break - } - } - case 't': - scan.scan() - if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) { - _, end = parseTag(scan) - scan.toLower(start, end) - } - for len(scan.token) == 2 && !isAlpha(scan.token[1]) { - end = scan.acceptMinSize(3) - } - case 'x': - end = scan.acceptMinSize(1) - default: - end = scan.acceptMinSize(2) - } - return end + return makeTag(tt), err } // Compose creates a Tag from individual parts, which may be of type Tag, Base, @@ -563,10 +59,11 @@ func parseExtension(scan *scanner) int { // Base, Script or Region or slice of type Variant or Extension is passed more // than once, the latter will overwrite the former. Variants and Extensions are // accumulated, but if two extensions of the same type are passed, the latter -// will replace the former. A Tag overwrites all former values and typically -// only makes sense as the first argument. The resulting tag is returned after -// canonicalizing using the Default CanonType. If one or more errors are -// encountered, one of the errors is returned. +// will replace the former. For -u extensions, though, the key-type pairs are +// added, where later values overwrite older ones. A Tag overwrites all former +// values and typically only makes sense as the first argument. The resulting +// tag is returned after canonicalizing using the Default CanonType. If one or +// more errors are encountered, one of the errors is returned. func Compose(part ...interface{}) (t Tag, err error) { return Default.Compose(part...) } @@ -576,196 +73,68 @@ func Compose(part ...interface{}) (t Tag, err error) { // Base, Script or Region or slice of type Variant or Extension is passed more // than once, the latter will overwrite the former. Variants and Extensions are // accumulated, but if two extensions of the same type are passed, the latter -// will replace the former. A Tag overwrites all former values and typically -// only makes sense as the first argument. The resulting tag is returned after -// canonicalizing using CanonType c. If one or more errors are encountered, -// one of the errors is returned. +// will replace the former. For -u extensions, though, the key-type pairs are +// added, where later values overwrite older ones. A Tag overwrites all former +// values and typically only makes sense as the first argument. The resulting +// tag is returned after canonicalizing using CanonType c. If one or more errors +// are encountered, one of the errors is returned. func (c CanonType) Compose(part ...interface{}) (t Tag, err error) { - var b builder - if err = b.update(part...); err != nil { + var b language.Builder + if err = update(&b, part...); err != nil { return und, err } - t, _ = b.tag.canonicalize(c) - - if len(b.ext) > 0 || len(b.variant) > 0 { - sort.Sort(sortVariant(b.variant)) - sort.Strings(b.ext) - if b.private != "" { - b.ext = append(b.ext, b.private) - } - n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...) - buf := make([]byte, n) - p := t.genCoreBytes(buf) - t.pVariant = byte(p) - p += appendTokens(buf[p:], b.variant...) - t.pExt = uint16(p) - p += appendTokens(buf[p:], b.ext...) - t.str = string(buf[:p]) - } else if b.private != "" { - t.str = b.private - t.remakeString() - } - return -} - -type builder struct { - tag Tag - - private string // the x extension - ext []string - variant []string - - err error -} - -func (b *builder) addExt(e string) { - if e == "" { - } else if e[0] == 'x' { - b.private = e - } else { - b.ext = append(b.ext, e) - } + b.Tag, _ = canonicalize(c, b.Tag) + return makeTag(b.Make()), err } var errInvalidArgument = errors.New("invalid Extension or Variant") -func (b *builder) update(part ...interface{}) (err error) { - replace := func(l *[]string, s string, eq func(a, b string) bool) bool { - if s == "" { - b.err = errInvalidArgument - return true - } - for i, v := range *l { - if eq(v, s) { - (*l)[i] = s - return true - } - } - return false - } +func update(b *language.Builder, part ...interface{}) (err error) { for _, x := range part { switch v := x.(type) { case Tag: - b.tag.lang = v.lang - b.tag.region = v.region - b.tag.script = v.script - if v.str != "" { - b.variant = nil - for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; { - x, s = nextToken(s) - b.variant = append(b.variant, x) - } - b.ext, b.private = nil, "" - for i, e := int(v.pExt), ""; i < len(v.str); { - i, e = getExtension(v.str, i) - b.addExt(e) - } - } + b.SetTag(v.tag()) case Base: - b.tag.lang = v.langID + b.Tag.LangID = v.langID case Script: - b.tag.script = v.scriptID + b.Tag.ScriptID = v.scriptID case Region: - b.tag.region = v.regionID + b.Tag.RegionID = v.regionID case Variant: - if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) { - b.variant = append(b.variant, v.variant) + if v.variant == "" { + err = errInvalidArgument + break } + b.AddVariant(v.variant) case Extension: - if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) { - b.addExt(v.s) + if v.s == "" { + err = errInvalidArgument + break } + b.SetExt(v.s) case []Variant: - b.variant = nil - for _, x := range v { - b.update(x) + b.ClearVariants() + for _, v := range v { + b.AddVariant(v.variant) } case []Extension: - b.ext, b.private = nil, "" + b.ClearExtensions() for _, e := range v { - b.update(e) + b.SetExt(e.s) } // TODO: support parsing of raw strings based on morphology or just extensions? case error: - err = v - } - } - return -} - -func tokenLen(token ...string) (n int) { - for _, t := range token { - n += len(t) + 1 - } - return -} - -func appendTokens(b []byte, token ...string) int { - p := 0 - for _, t := range token { - b[p] = '-' - copy(b[p+1:], t) - p += 1 + len(t) - } - return p -} - -type sortVariant []string - -func (s sortVariant) Len() int { - return len(s) -} - -func (s sortVariant) Swap(i, j int) { - s[j], s[i] = s[i], s[j] -} - -func (s sortVariant) Less(i, j int) bool { - return variantIndex[s[i]] < variantIndex[s[j]] -} - -func findExt(list []string, x byte) int { - for i, e := range list { - if e[0] == x { - return i - } - } - return -1 -} - -// getExtension returns the name, body and end position of the extension. -func getExtension(s string, p int) (end int, ext string) { - if s[p] == '-' { - p++ - } - if s[p] == 'x' { - return len(s), s[p:] - } - end = nextExtension(s, p) - return end, s[p:end] -} - -// nextExtension finds the next extension within the string, searching -// for the -<char>- pattern from position p. -// In the fast majority of cases, language tags will have at most -// one extension and extensions tend to be small. -func nextExtension(s string, p int) int { - for n := len(s) - 3; p < n; { - if s[p] == '-' { - if s[p+2] == '-' { - return p + if v != nil { + err = v } - p += 3 - } else { - p++ } } - return len(s) + return } var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight") -// ParseAcceptLanguage parses the contents of a Accept-Language header as +// ParseAcceptLanguage parses the contents of an Accept-Language header as // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and // a list of corresponding quality weights. It is more permissive than RFC 2616 // and may return non-nil slices even if the input is not valid. @@ -788,7 +157,7 @@ func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) { if !ok { return nil, nil, err } - t = Tag{lang: id} + t = makeTag(language.Tag{LangID: id}) } // Scan the optional weight. @@ -830,9 +199,9 @@ func split(s string, c byte) (head, tail string) { return strings.TrimSpace(s), "" } -// Add hack mapping to deal with a small number of cases that that occur +// Add hack mapping to deal with a small number of cases that occur // in Accept-Language (with reasonable frequency). -var acceptFallback = map[string]langID{ +var acceptFallback = map[string]language.Language{ "english": _en, "deutsch": _de, "italian": _it, |