1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
|
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate stringer -type=Kind
//go:generate go run gen.go gen_common.go gen_trieval.go
// Package width provides functionality for handling different widths in text.
//
// Wide characters behave like ideographs; they tend to allow line breaks after
// each character and remain upright in vertical text layout. Narrow characters
// are kept together in words or runs that are rotated sideways in vertical text
// layout.
//
// For more information, see http://unicode.org/reports/tr11/.
package width // import "golang.org/x/text/width"
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// TODO
// 1) Reduce table size by compressing blocks.
// 2) API proposition for computing display length
// (approximation, fixed pitch only).
// 3) Implement display length.
// Kind indicates the type of width property of a rune, as defined in
// http://unicode.org/reports/tr11/.
//
// The numeric values are spelled out because elem.kind converts the upper
// bits of a table value directly into a Kind; they presumably have to stay in
// step with the generated tables.
type Kind int

const (
	// Neutral characters do not occur in legacy East Asian character sets.
	Neutral Kind = 0

	// EastAsianAmbiguous characters can be sometimes wide and sometimes
	// narrow, and require additional information not contained in the
	// character code to further resolve their width.
	EastAsianAmbiguous Kind = 1

	// EastAsianWide characters are wide in their usual form. They occur only
	// in the context of East Asian typography. These runes may have explicit
	// halfwidth counterparts.
	EastAsianWide Kind = 2

	// EastAsianNarrow characters are narrow in their usual form. They often
	// have fullwidth counterparts.
	//
	// Note: there exist Narrow runes that do not have fullwidth or wide
	// counterparts, despite what the definition says (e.g. U+27E6).
	EastAsianNarrow Kind = 3

	// EastAsianFullwidth characters have a compatibility decomposition of
	// type wide that maps to a narrow counterpart.
	EastAsianFullwidth Kind = 4

	// EastAsianHalfwidth characters have a compatibility decomposition of
	// type narrow that maps to a wide or ambiguous counterpart, plus U+20A9 ₩
	// WON SIGN.
	//
	// Note: there exist runes that have a halfwidth counterpart but that are
	// classified as Ambiguous, rather than wide (e.g. U+2190).
	EastAsianHalfwidth Kind = 5
)
// TODO: the generated tries need to return size 1 for invalid runes for the
// width to be computed correctly (each byte should render width 1)

// trie is the package-level width trie queried by Lookup, LookupString and
// LookupRune. It is created once, when the package is initialized.
var trie = newWidthTrie(0)
// Lookup reports the Properties of the first rune in b and the number of bytes
// of its UTF-8 encoding.
//
// If b is empty, Lookup returns the zero Properties and a size of 0. If b is
// not empty but the trie reports that no bytes were consumed, Lookup returns
// the zero Properties and a size of 1, mirroring how utf8.DecodeRune treats
// an invalid encoding, so that callers iterating over b always make progress.
// NOTE(review): a size of 0 from the trie presumably means a truncated
// multi-byte sequence; confirm against the generated trie.
func Lookup(b []byte) (p Properties, size int) {
	if len(b) == 0 {
		return Properties{}, 0
	}
	v, sz := trie.lookup(b)
	if sz <= 0 {
		// There is no final byte to record: b[sz-1] would index out of range.
		return Properties{}, 1
	}
	return Properties{elem(v), b[sz-1]}, sz
}
// LookupString reports the Properties of the first rune in s and the number of
// bytes of its UTF-8 encoding.
//
// If s is empty, LookupString returns the zero Properties and a size of 0. If
// s is not empty but the trie reports that no bytes were consumed,
// LookupString returns the zero Properties and a size of 1, mirroring how
// utf8.DecodeRuneInString treats an invalid encoding, so that callers
// iterating over s always make progress.
// NOTE(review): a size of 0 from the trie presumably means a truncated
// multi-byte sequence; confirm against the generated trie.
func LookupString(s string) (p Properties, size int) {
	if len(s) == 0 {
		return Properties{}, 0
	}
	v, sz := trie.lookupString(s)
	if sz <= 0 {
		// There is no final byte to record: s[sz-1] would index out of range.
		return Properties{}, 1
	}
	return Properties{elem(v), s[sz-1]}, sz
}
// LookupRune reports the Properties of rune r.
//
// The rune is encoded as UTF-8 into a scratch buffer and looked up in the
// trie. The recorded last byte is computed from r itself: runes below
// utf8.RuneSelf are their own single byte, all others end in a continuation
// byte carrying the low six bits of r.
//
// NOTE(review): for a rune that utf8.EncodeRune cannot encode, the bytes
// looked up are those of utf8.RuneError while the last byte is still derived
// from r.
func LookupRune(r rune) Properties {
	var enc [utf8.UTFMax]byte
	width := utf8.EncodeRune(enc[:], r)
	value, _ := trie.lookup(enc[:width])

	if r < utf8.RuneSelf {
		return Properties{elem(value), byte(r)}
	}
	return Properties{elem(value), 0x80 + byte(r&0x3f)}
}
// Properties provides access to width properties of a rune.
type Properties struct {
	// elem is the trie value for the rune, as stored by the Lookup functions.
	// Its bits are interpreted by Kind, Folded, Narrow and Wide.
	elem elem
	// last is the final byte of the rune's UTF-8 encoding. Folded, Narrow and
	// Wide XOR it into an inverseData entry to produce the mapped rune.
	last byte
}
// kind returns the Kind encoded in e, obtained by shifting e right by
// typeShift bits.
func (e elem) kind() Kind {
	shifted := e >> typeShift
	return Kind(shifted)
}
// Kind returns the Kind of a rune as defined in Unicode TR #11.
// See http://unicode.org/reports/tr11/ for more details.
func (p Properties) Kind() Kind {
	e := p.elem
	return e.kind()
}
// Folded returns the folded variant of a rune or 0 if the rune is canonical.
//
// A rune needs folding when tagNeedsFold is set in its elem. The low byte of
// the elem then selects an inverseData entry, whose first byte is used both
// as the index of the byte to XOR with the rune's last encoded byte and as
// the length of the UTF-8 sequence that follows it.
func (p Properties) Folded() rune {
	if p.elem&tagNeedsFold == 0 {
		return 0
	}
	entry := inverseData[byte(p.elem)]
	entry[entry[0]] ^= p.last
	end := 1 + entry[0]
	folded, _ := utf8.DecodeRune(entry[1:end])
	return folded
}
// Narrow returns the narrow variant of a rune or 0 if the rune is already
// narrow or doesn't have a narrow variant.
//
// A mapping is applied only when the low byte of the elem is non-zero and the
// rune's Kind is EastAsianFullwidth, EastAsianWide or EastAsianAmbiguous. That
// low byte selects an inverseData entry, whose first byte is used both as the
// index of the byte to XOR with the rune's last encoded byte and as the
// length of the UTF-8 sequence that follows it.
func (p Properties) Narrow() rune {
	idx := byte(p.elem)
	if idx == 0 {
		return 0
	}
	switch p.elem.kind() {
	case EastAsianFullwidth, EastAsianWide, EastAsianAmbiguous:
		// These kinds may map to a narrow counterpart.
	default:
		return 0
	}
	entry := inverseData[idx]
	entry[entry[0]] ^= p.last
	end := 1 + entry[0]
	narrow, _ := utf8.DecodeRune(entry[1:end])
	return narrow
}
// Wide returns the wide variant of a rune or 0 if the rune is already
// wide or doesn't have a wide variant.
//
// A mapping is applied only when the low byte of the elem is non-zero and the
// rune's Kind is EastAsianHalfwidth or EastAsianNarrow. That low byte selects
// an inverseData entry, whose first byte is used both as the index of the
// byte to XOR with the rune's last encoded byte and as the length of the
// UTF-8 sequence that follows it.
func (p Properties) Wide() rune {
	idx := byte(p.elem)
	if idx == 0 {
		return 0
	}
	switch p.elem.kind() {
	case EastAsianHalfwidth, EastAsianNarrow:
		// These kinds may map to a wide counterpart.
	default:
		return 0
	}
	entry := inverseData[idx]
	entry[entry[0]] ^= p.last
	end := 1 + entry[0]
	wide, _ := utf8.DecodeRune(entry[1:end])
	return wide
}
// TODO for Properties:
// - Add Fullwidth/Halfwidth or Inverted methods for computing variants
// mapping.
// - Add width information (including information on non-spacing runes).
// Transformer implements the transform.Transformer interface.
//
// It wraps a transform.SpanningTransformer, to which Reset, Transform and
// Span delegate, and adds the Bytes and String convenience methods.
type Transformer struct {
	// t is the wrapped transformer that performs the actual work.
	t transform.SpanningTransformer
}
// Reset implements the transform.Transformer interface by resetting the
// wrapped transformer.
func (t Transformer) Reset() {
	t.t.Reset()
}
// Transform implements the transform.Transformer interface by forwarding the
// call, unchanged, to the wrapped transformer.
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	inner := t.t
	nDst, nSrc, err = inner.Transform(dst, src, atEOF)
	return nDst, nSrc, err
}
// Span implements the transform.SpanningTransformer interface by forwarding
// the call, unchanged, to the wrapped transformer.
func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) {
	inner := t.t
	n, err = inner.Span(src, atEOF)
	return n, err
}
// Bytes returns a new byte slice with the result of applying t to b.
//
// The byte count and error reported by transform.Bytes are discarded; the
// signature offers no way to report them.
func (t Transformer) Bytes(b []byte) []byte {
	out, _, _ := transform.Bytes(t, b)
	return out
}
// String returns a string with the result of applying t to s.
//
// The byte count and error reported by transform.String are discarded; the
// signature offers no way to report them.
func (t Transformer) String(s string) string {
	out, _, _ := transform.String(t, s)
	return out
}
// Ready-to-use width transforms. Each wraps one of the package's unexported
// transformer implementations in a Transformer.
var (
	// Fold is a transform that maps all runes to their canonical width.
	//
	// Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm
	// provide a more generic folding mechanism.
	Fold = Transformer{t: foldTransform{}}

	// Widen is a transform that maps runes to their wide variant, if
	// available.
	Widen = Transformer{t: wideTransform{}}

	// Narrow is a transform that maps runes to their narrow variant, if
	// available.
	Narrow = Transformer{t: narrowTransform{}}
)
// TODO: Consider the following options:
// - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
// generalized variant of this.
// - Consider a wide Won character to be the default width (or some generalized
// variant of this).
// - Filter the set of characters that gets converted (the preferred approach is
// to allow applying filters to transforms).
|