summaryrefslogtreecommitdiff
path: root/vendor/golang.org/x/text/encoding
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/text/encoding')
-rw-r--r--vendor/golang.org/x/text/encoding/unicode/unicode.go94
1 files changed, 86 insertions, 8 deletions
diff --git a/vendor/golang.org/x/text/encoding/unicode/unicode.go b/vendor/golang.org/x/text/encoding/unicode/unicode.go
index 4850ff365..dd99ad14d 100644
--- a/vendor/golang.org/x/text/encoding/unicode/unicode.go
+++ b/vendor/golang.org/x/text/encoding/unicode/unicode.go
@@ -6,6 +6,7 @@
package unicode // import "golang.org/x/text/encoding/unicode"
import (
+ "bytes"
"errors"
"unicode/utf16"
"unicode/utf8"
@@ -25,15 +26,95 @@ import (
// the introduction of some kind of error type for conveying the erroneous code
// point.
-// UTF8 is the UTF-8 encoding.
+// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
var UTF8 encoding.Encoding = utf8enc
+// UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order
+// mark while the encoder adds one.
+//
+// Some editors add a byte order mark as a signature to UTF-8 files. Although
+// the byte order mark is not useful for detecting byte order in UTF-8, it is
+// sometimes used as a convention to mark UTF-8-encoded files. This relies on
+// the observation that the UTF-8 byte order mark is either an illegal or at
+// least very unlikely sequence in any other character encoding.
+var UTF8BOM encoding.Encoding = utf8bomEncoding{}
+
+type utf8bomEncoding struct{}
+
+func (utf8bomEncoding) String() string {
+ return "UTF-8-BOM"
+}
+
+func (utf8bomEncoding) ID() (identifier.MIB, string) {
+ return identifier.Unofficial, "x-utf8bom"
+}
+
+func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
+ return &encoding.Encoder{
+ Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
+ }
+}
+
+func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
+ return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
+}
+
var utf8enc = &internal.Encoding{
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
"UTF-8",
identifier.UTF8,
}
+type utf8bomDecoder struct {
+ checked bool
+}
+
+func (t *utf8bomDecoder) Reset() {
+ t.checked = false
+}
+
+func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+ if !t.checked {
+ if !atEOF && len(src) < len(utf8BOM) {
+ if len(src) == 0 {
+ return 0, 0, nil
+ }
+ return 0, 0, transform.ErrShortSrc
+ }
+ if bytes.HasPrefix(src, []byte(utf8BOM)) {
+ nSrc += len(utf8BOM)
+ src = src[len(utf8BOM):]
+ }
+ t.checked = true
+ }
+ nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
+ nSrc += n
+ return nDst, nSrc, err
+}
+
+type utf8bomEncoder struct {
+ written bool
+ t transform.Transformer
+}
+
+func (t *utf8bomEncoder) Reset() {
+ t.written = false
+ t.t.Reset()
+}
+
+func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+ if !t.written {
+ if len(dst) < len(utf8BOM) {
+ return nDst, 0, transform.ErrShortDst
+ }
+ nDst = copy(dst, utf8BOM)
+ t.written = true
+ }
+ n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
+ nDst += n
+ return nDst, nSrc, err
+}
+
type utf8Decoder struct{ transform.NopResetter }
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
@@ -287,16 +368,13 @@ func (u *utf16Decoder) Reset() {
}
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+ if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
+ return 0, 0, ErrMissingBOM
+ }
if len(src) == 0 {
- if atEOF && u.current.bomPolicy&requireBOM != 0 {
- return 0, 0, ErrMissingBOM
- }
return 0, 0, nil
}
- if u.current.bomPolicy&acceptBOM != 0 {
- if len(src) < 2 {
- return 0, 0, transform.ErrShortSrc
- }
+ if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
switch {
case src[0] == 0xfe && src[1] == 0xff:
u.current.endianness = BigEndian