aboutsummaryrefslogtreecommitdiff
path: root/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s
blob: b3a16ef751a677842c0f26b5121ecb4c351b4950 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.11
// +build !gccgo,!appengine

#include "textflag.h"

#define NUM_ROUNDS 10

// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	MOVW	(R7), R20

	AND	$~255, R3, R13
	ADD	R2, R13, R12 // R12 for block end
	AND	$255, R3, R13
loop:
	MOVD	$NUM_ROUNDS, R21
	VLD1	(R11), [V30.S4, V31.S4]

	// load contants
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// update counter
	VADD	V30.S4, V12.S4, V12.S4

chacha:
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8
	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	CMP	R2, R12
	BGT	loop

	RET


DATA	·constants+0x00(SB)/4, $0x61707865
DATA	·constants+0x04(SB)/4, $0x3320646e
DATA	·constants+0x08(SB)/4, $0x79622d32
DATA	·constants+0x0c(SB)/4, $0x6b206574
GLOBL	·constants(SB), NOPTR|RODATA, $32

DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32