Skip to content

Commit 4d48e5f

Browse files
Jungho Ahn, agl
Jungho Ahn
authored and committed
x/crypto/poly1305: add ARM assembly
This change adds ARMv6 assembly implementation.

The referenced assembly code was the public domain source by Andrew Moon in https://github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305/poly1305_armv6-32.inc. The author has confirmed that it's ok to put it under the Go license.

Benchmark results on Raspberry Pi (ARMv6-compatible processor rev 7),

o Without ARMv6 assembly
  Benchmark1K 5000 287177 ns/op 3.57 MB/s
  Benchmark64 50000 38880 ns/op 1.65 MB/s

o With ARMv6 assembly
  Benchmark1K 100000 15964 ns/op 64.14 MB/s
  Benchmark64 1000000 1472 ns/op 43.46 MB/s

Change-Id: Iea5b0b831ac097cc6d477a8fccbf0ddb4819724c
Reviewed-on: https://go-review.googlesource.com/9765
Reviewed-by: Adam Langley <[email protected]>
Run-TryBot: Adam Langley <[email protected]>
1 parent e3f150b commit 4d48e5f

File tree

3 files changed

+358
-1
lines changed

3 files changed

+358
-1
lines changed

poly1305/poly1305_arm.s

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
// Copyright 2015 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// This code was translated into a form compatible with 5a from the public
6+
// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
7+
8+
// +build arm,!gccgo,!appengine
9+
10+
#include "textflag.h"
11+
12+
// poly1305_init_constants_armv6 holds five 32-bit masks, one per 26-bit limb.
// poly1305_init_ext_armv6 loads them and ANDs them into the limbs it builds
// from the first 16 key bytes (presumably the standard Poly1305 clamp of r,
// re-expressed per limb -- confirm against the reference implementation).
DATA poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
13+
DATA poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
14+
DATA poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
15+
DATA poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
16+
DATA poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
17+
// Five 4-byte words = 20 bytes, placed in read-only data.
GLOBL poly1305_init_constants_armv6<>(SB), RODATA, $20
18+
19+
// Warning: the linker may use R11 to synthesize certain instructions. Please
20+
// take care and verify that no synthetic instructions use it.
21+
22+
// poly1305_init_ext_armv6 initialises the state block.
//
// Inputs:
//   R0 = pointer to the state block (this routine writes 60 bytes)
//   R1 = pointer to the 32-byte key
//
// State layout produced here (byte offsets from the initial R0):
//    0..19  first 16 key bytes as five masked 26-bit limbs
//   20..39  five zero words (the accumulator read back by the blocks routine)
//   40..55  last 16 key bytes, copied verbatim
//   56..59  zero (the flag word tested by poly1305_blocks_armv6)
//
// R0 and R1 are advanced by the write-back addressing modes; R4-R11 are saved
// on entry and restored before returning.
TEXT poly1305_init_ext_armv6<>(SB),NOSPLIT,$-4
23+
// Push R4-R11 below the current R13.
MOVM.DB.W [R4-R11], (R13)
24+
// Load key[0:16] as four little-endian words; R1 now points at key[16].
MOVM.IA.W (R1), [R2-R5]
25+
MOVW $poly1305_init_constants_armv6<>(SB), R7
26+
// Re-slice the 128-bit value R5:R4:R3:R2 into 26-bit-aligned pieces:
// R8 = bits 0.., R9 = bits 26.., g = bits 52.., R11 = bits 78.., R12 = bits 104..
// The upper garbage bits are removed by the masks below.
MOVW R2, R8
27+
MOVW R2>>26, R9
28+
MOVW R3>>20, g
29+
MOVW R4>>14, R11
30+
MOVW R5>>8, R12
31+
ORR R3<<6, R9, R9
32+
ORR R4<<12, g, g
33+
ORR R5<<18, R11, R11
34+
// Load the five masks and apply one to each limb.
MOVM.IA (R7), [R2-R6]
35+
AND R8, R2, R2
36+
AND R9, R3, R3
37+
AND g, R4, R4
38+
AND R11, R5, R5
39+
AND R12, R6, R6
40+
// Store the five masked limbs at state+0 and advance R0 by 20.
MOVM.IA.W [R2-R6], (R0)
41+
// Zero R2-R6 and store them at state+20 (advancing R0 to state+40).
EOR R2, R2, R2
42+
EOR R3, R3, R3
43+
EOR R4, R4, R4
44+
EOR R5, R5, R5
45+
EOR R6, R6, R6
46+
MOVM.IA.W [R2-R6], (R0)
47+
// Load key[16:32] into R2-R5; R6 is still zero, so the five-word store
// below also clears the word at state+56.
MOVM.IA.W (R1), [R2-R5]
48+
MOVM.IA [R2-R6], (R0)
49+
// Pop R4-R11.
MOVM.IA.W (R13), [R4-R11]
50+
RET
51+
52+
// poly1305_blocks_armv6 absorbs whole 16-byte blocks into the accumulator.
//
// Inputs:
//   R0 = pointer to the state block written by poly1305_init_ext_armv6
//   R1 = pointer to the message bytes
//   R2 = number of bytes to process; blocks are consumed while at least 16
//        bytes remain, any tail shorter than 16 bytes is ignored
//
// For every block the message is split into five 26-bit limbs and added to
// the accumulator h (state+20..39), the sum is multiplied by r (state+0..19)
// with the limbs that overflow the top folded back in multiplied by 5, and
// carries are propagated. The updated h is stored back at state+20..39.
//
// If the word at state+56 is zero, bit 24 of the top limb (bit 128 of the
// block) is set for every block; if it is non-zero the bit is left clear.
//
// Scratch area (128 bytes below the saved registers, addressed via R13):
//    0..31  64-bit products d1, d2, d3, d4 (low word first)
//   32      the per-block high-bit constant (1<<24 or 0)
//   36      saved state pointer
//   40      current message pointer
//   44      remaining byte count
//   64..83  copy of the five limbs of r
//
// All scratch accesses use the hardware register R13. In Go ARM assembly the
// name SP denotes the virtual stack pointer, not R13, so mixing the two for
// the same slot only works by accident of the frame size.
//
// R4-R9, g, R11 and R14 are saved on entry and restored before returning.
TEXT poly1305_blocks_armv6<>(SB),NOSPLIT,$-4
53+
MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
54+
// Reserve the 128-byte scratch area described above.
SUB $128, R13
55+
MOVW R0, 36(R13)
56+
MOVW R1, 40(R13)
57+
MOVW R2, 44(R13)
58+
// R14 = message cursor, R12 = remaining length.
MOVW R1, R14
59+
MOVW R2, R12
60+
// R6 = (state flag == 0) ? 1<<24 : 0. The EOR does not touch the flags set
// by the hand-encoded TST.
MOVW 56(R0), R8
61+
WORD $0xe1180008 // TST R8, R8 not working see issue 5921
62+
EOR R6, R6, R6
63+
MOVW.EQ $(1<<24), R6
64+
MOVW R6, 32(R13)
65+
// Load r into R0-R4 and h into R5-R9, then park r at scratch+64 because
// R0-R4 are reused for the message limbs.
ADD $64, R13, g
66+
MOVM.IA (R0), [R0-R9]
67+
MOVM.IA [R0-R4], (g)
68+
// Nothing to do when fewer than 16 bytes were supplied.
CMP $16, R12
69+
BLO poly1305_blocks_armv6_done
70+
poly1305_blocks_armv6_mainloop:
71+
// Load one 16-byte block and advance the message cursor.
MOVM.IA.W (R14), [R0-R3]
72+
// Split the block into 26-bit limbs: R0, g, R11, R12 and (top) R4.
MOVW R0>>26, g
73+
MOVW R1>>20, R11
74+
MOVW R2>>14, R12
75+
MOVW R14, 40(R13)
76+
MOVW R3>>8, R4
77+
ORR R1<<6, g, g
78+
ORR R2<<12, R11, R11
79+
ORR R3<<18, R12, R12
80+
BIC $0xfc000000, R0, R0
81+
BIC $0xfc000000, g, g
82+
MOVW 32(R13), R3
83+
BIC $0xfc000000, R11, R11
84+
BIC $0xfc000000, R12, R12
85+
// h += block (the top limb also receives the high-bit constant).
ADD R0, R5, R5
86+
ADD g, R6, R6
87+
ORR R3, R4, R4
88+
ADD R11, R7, R7
89+
ADD $64, R13, R14
90+
ADD R12, R8, R8
91+
ADD R4, R9, R9
92+
// Reload r0..r4 into R0-R4; h0..h4 are in R5-R9.
MOVM.IA (R14), [R0-R4]
93+
// (R11:g)   accumulates d4 = r4*h0 + r3*h1 + r2*h2 + r1*h3 + r0*h4
// (R14:R12) accumulates d3 = r3*h0 + r2*h1 + r1*h2 + r0*h3 + 5*r4*h4
MULLU R4, R5, (R11, g)
94+
MULLU R3, R5, (R14, R12)
95+
MULALU R3, R6, (R11, g)
96+
MULALU R2, R6, (R14, R12)
97+
MULALU R2, R7, (R11, g)
98+
MULALU R1, R7, (R14, R12)
99+
// From here on R4 = 5*r4 and R3 = 5*r3.
ADD R4<<2, R4, R4
100+
ADD R3<<2, R3, R3
101+
MULALU R1, R8, (R11, g)
102+
MULALU R0, R8, (R14, R12)
103+
MULALU R0, R9, (R11, g)
104+
MULALU R4, R9, (R14, R12)
105+
// Spill d4 to scratch+24 and d3 to scratch+16.
MOVW g, 24(R13)
106+
MOVW R11, 28(R13)
107+
MOVW R12, 16(R13)
108+
MOVW R14, 20(R13)
109+
// (R11:g)   accumulates d2 = r2*h0 + r1*h1 + r0*h2 + 5*r4*h3 + 5*r3*h4
// (R14:R12) accumulates d1 = r1*h0 + r0*h1 + 5*r4*h2 + 5*r3*h3 + 5*r2*h4
MULLU R2, R5, (R11, g)
110+
MULLU R1, R5, (R14, R12)
111+
MULALU R1, R6, (R11, g)
112+
MULALU R0, R6, (R14, R12)
113+
MULALU R0, R7, (R11, g)
114+
MULALU R4, R7, (R14, R12)
115+
// From here on R2 = 5*r2 and R1 = 5*r1.
ADD R2<<2, R2, R2
116+
ADD R1<<2, R1, R1
117+
MULALU R4, R8, (R11, g)
118+
MULALU R3, R8, (R14, R12)
119+
MULALU R3, R9, (R11, g)
120+
MULALU R2, R9, (R14, R12)
121+
// Spill d2 to scratch+8 and d1 to scratch+0.
MOVW g, 8(R13)
122+
MOVW R11, 12(R13)
123+
MOVW R12, 0(R13)
124+
// High word of d1; must land at R13+4 because the MOVM below reads it back
// through R13.
MOVW R14, 4(R13)
125+
// (R11:g) = d0 = r0*h0 + 5*r4*h1 + 5*r3*h2 + 5*r2*h3 + 5*r1*h4
MULLU R0, R5, (R11, g)
126+
MULALU R4, R6, (R11, g)
127+
MULALU R3, R7, (R11, g)
128+
MULALU R2, R8, (R11, g)
129+
MULALU R1, R9, (R11, g)
130+
// Reload the spilled products: R1:R0 = d1, R3:R2 = d2, R5:R4 = d3, R7:R6 = d4.
MOVM.IA (R13), [R0-R7]
131+
// Carry propagation. Two chains run interleaved: d0 -> d1 -> d2 -> d3 and
// d3 -> d4 -> (times 5) -> d0.
MOVW g>>26, R12
132+
MOVW R4>>26, R14
133+
ORR R11<<6, R12, R12
134+
ORR R5<<6, R14, R14
135+
BIC $0xfc000000, g, g
136+
BIC $0xfc000000, R4, R4
137+
ADD.S R12, R0, R0
138+
ADC $0, R1, R1
139+
ADD.S R14, R6, R6
140+
ADC $0, R7, R7
141+
MOVW R0>>26, R12
142+
MOVW R6>>26, R14
143+
ORR R1<<6, R12, R12
144+
ORR R7<<6, R14, R14
145+
BIC $0xfc000000, R0, R0
146+
BIC $0xfc000000, R6, R6
147+
// The carry out of the top limb re-enters limb 0 multiplied by 5.
ADD R14<<2, R14, R14
148+
ADD.S R12, R2, R2
149+
ADC $0, R3, R3
150+
ADD R14, g, g
151+
MOVW R2>>26, R12
152+
MOVW g>>26, R14
153+
ORR R3<<6, R12, R12
154+
// New accumulator limbs are collected in R5 (h0), R6 (h1), R7 (h2),
// R8 (h3) and R9 (h4), ready for the next iteration.
BIC $0xfc000000, g, R5
155+
BIC $0xfc000000, R2, R7
156+
ADD R12, R4, R4
157+
ADD R14, R0, R0
158+
MOVW R4>>26, R12
159+
BIC $0xfc000000, R4, R8
160+
ADD R12, R6, R9
161+
// Reload the byte count and message cursor saved at scratch+44 / scratch+40.
MOVW 44(R13), R12
162+
MOVW 40(R13), R14
163+
MOVW R0, R6
164+
// Loop while at least one more full block remains after this one. The
// SUB has no .S suffix, so BHS still sees the flags from the CMP.
CMP $32, R12
165+
SUB $16, R12, R12
166+
MOVW R12, 44(R13)
167+
BHS poly1305_blocks_armv6_mainloop
168+
poly1305_blocks_armv6_done:
169+
// Write h back to state+20..39.
MOVW 36(R13), R12
170+
MOVW R5, 20(R12)
171+
MOVW R6, 24(R12)
172+
MOVW R7, 28(R12)
173+
MOVW R8, 32(R12)
174+
MOVW R9, 36(R12)
175+
// Release the scratch area and restore the saved registers.
ADD $128, R13, R13
176+
MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14]
177+
RET
178+
179+
// poly1305_finish_ext_armv6 absorbs the final partial block (if any),
// reduces the accumulator, adds the second key half and writes the 16-byte
// result.
//
// Inputs:
//   R0 = pointer to the state block
//   R1 = pointer to the remaining message bytes
//   R2 = number of remaining bytes; only bits 3..0 are examined, so the
//        caller is expected to pass a value below 16
//   R3 = pointer to the 16-byte output
//
// On return the first 64 bytes of the state block have been overwritten with
// zeros. R4-R9, g, R11 and R14 are saved on entry and restored before
// returning.
TEXT poly1305_finish_ext_armv6<>(SB),NOSPLIT,$-4
180+
MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
181+
// 16-byte scratch buffer for the padded final block.
SUB $16, R13, R13
182+
// Keep the arguments in registers that survive the call below:
// R5 = state, R6 = message, R7 = length, R8 = output.
MOVW R0, R5
183+
MOVW R1, R6
184+
MOVW R2, R7
185+
MOVW R3, R8
186+
// Skip the padding path entirely when no bytes remain.
AND.S R2, R2, R2
187+
BEQ poly1305_finish_ext_armv6_noremaining
188+
// Zero the scratch buffer; R9 is the write cursor into it.
EOR R0, R0
189+
MOVW R13, R9
190+
MOVW R0, 0(R13)
191+
MOVW R0, 4(R13)
192+
MOVW R0, 8(R13)
193+
MOVW R0, 12(R13)
194+
// Copy the remaining bytes in chunks of 8, 4, 2 and 1 according to the
// bits set in the length.
WORD $0xe3120008 // TST $8, R2 not working see issue 5921
195+
BEQ poly1305_finish_ext_armv6_skip8
196+
MOVM.IA.W (R1), [g-R11]
197+
MOVM.IA.W [g-R11], (R9)
198+
poly1305_finish_ext_armv6_skip8:
199+
WORD $0xe3120004 // TST $4, R2 not working see issue 5921
200+
BEQ poly1305_finish_ext_armv6_skip4
201+
MOVW.P 4(R1), g
202+
MOVW.P g, 4(R9)
203+
poly1305_finish_ext_armv6_skip4:
204+
WORD $0xe3120002 // TST $2, R2 not working see issue 5921
205+
BEQ poly1305_finish_ext_armv6_skip2
206+
MOVHU.P 2(R1), g
207+
MOVH.P g, 2(R9)
208+
poly1305_finish_ext_armv6_skip2:
209+
WORD $0xe3120001 // TST $1, R2 not working see issue 5921
210+
BEQ poly1305_finish_ext_armv6_skip1
211+
MOVBU.P 1(R1), g
212+
MOVBU.P g, 1(R9)
213+
poly1305_finish_ext_armv6_skip1:
214+
// Append a single 0x01 byte after the copied data; the rest of the buffer
// is already zero.
MOVW $1, R11
215+
MOVBU R11, 0(R9)
216+
// Set the flag word at state+56 so that poly1305_blocks_armv6 does not add
// the implicit high bit to this padded block.
MOVW R11, 56(R5)
217+
// poly1305_blocks_armv6(state, scratch buffer, 16)
MOVW R5, R0
218+
MOVW R13, R1
219+
MOVW $16, R2
220+
BL poly1305_blocks_armv6<>(SB)
221+
poly1305_finish_ext_armv6_noremaining:
222+
// Load the accumulator limbs h0..h4 into R0..R4.
MOVW 20(R5), R0
223+
MOVW 24(R5), R1
224+
MOVW 28(R5), R2
225+
MOVW 32(R5), R3
226+
MOVW 36(R5), R4
227+
// Full carry pass: fold the overflow of h4 back into h0 times 5, then
// ripple the carries h0 -> h1 -> h2 -> h3 -> h4.
MOVW R4>>26, R12
228+
BIC $0xfc000000, R4, R4
229+
ADD R12<<2, R12, R12
230+
ADD R12, R0, R0
231+
MOVW R0>>26, R12
232+
BIC $0xfc000000, R0, R0
233+
ADD R12, R1, R1
234+
MOVW R1>>26, R12
235+
BIC $0xfc000000, R1, R1
236+
ADD R12, R2, R2
237+
MOVW R2>>26, R12
238+
BIC $0xfc000000, R2, R2
239+
ADD R12, R3, R3
240+
MOVW R3>>26, R12
241+
BIC $0xfc000000, R3, R3
242+
ADD R12, R4, R4
243+
// Compute h + 5 - 2^130 limb by limb into R6, R7, g, R11, R14. The
// subtraction of 2^130 is the -(1<<26) added into the top limb.
ADD $5, R0, R6
244+
MOVW R6>>26, R12
245+
BIC $0xfc000000, R6, R6
246+
ADD R12, R1, R7
247+
MOVW R7>>26, R12
248+
BIC $0xfc000000, R7, R7
249+
ADD R12, R2, g
250+
MOVW g>>26, R12
251+
BIC $0xfc000000, g, g
252+
ADD R12, R3, R11
253+
MOVW $-(1<<26), R12
254+
ADD R11>>26, R12, R12
255+
BIC $0xfc000000, R11, R11
256+
ADD R12, R4, R14
257+
// Branch-free select. R12 = (sign bit of R14) - 1, i.e. all ones when the
// subtraction did not go negative and zero when it did. Keep the
// subtracted value in the first case and the original h in the second.
MOVW R14>>31, R12
258+
SUB $1, R12
259+
AND R12, R6, R6
260+
AND R12, R7, R7
261+
AND R12, g, g
262+
AND R12, R11, R11
263+
AND R12, R14, R14
264+
MVN R12, R12
265+
AND R12, R0, R0
266+
AND R12, R1, R1
267+
AND R12, R2, R2
268+
AND R12, R3, R3
269+
AND R12, R4, R4
270+
ORR R6, R0, R0
271+
ORR R7, R1, R1
272+
ORR g, R2, R2
273+
ORR R11, R3, R3
274+
ORR R14, R4, R4
275+
// Repack the five 26-bit limbs into four 32-bit words R0..R3.
ORR R1<<26, R0, R0
276+
MOVW R1>>6, R1
277+
ORR R2<<20, R1, R1
278+
MOVW R2>>12, R2
279+
ORR R3<<14, R2, R2
280+
MOVW R3>>18, R3
281+
ORR R4<<8, R3, R3
282+
// Add the four words stored at state+40..55 (the second half of the key)
// as one 128-bit addition; the final carry out is discarded.
MOVW 40(R5), R6
283+
MOVW 44(R5), R7
284+
MOVW 48(R5), g
285+
MOVW 52(R5), R11
286+
ADD.S R6, R0, R0
287+
ADC.S R7, R1, R1
288+
ADC.S g, R2, R2
289+
ADC.S R11, R3, R3
290+
// Write the 16-byte result to the output buffer.
MOVM.IA [R0-R3], (R8)
291+
// Wipe the state: copy the state pointer to R12 (R5 is about to be
// cleared), zero R0-R7 and store them twice, covering 64 bytes.
MOVW R5, R12
292+
EOR R0, R0, R0
293+
EOR R1, R1, R1
294+
EOR R2, R2, R2
295+
EOR R3, R3, R3
296+
EOR R4, R4, R4
297+
EOR R5, R5, R5
298+
EOR R6, R6, R6
299+
EOR R7, R7, R7
300+
MOVM.IA.W [R0-R7], (R12)
301+
MOVM.IA [R0-R7], (R12)
302+
// Release the scratch buffer and restore the saved registers.
ADD $16, R13, R13
303+
MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14]
304+
RET
305+
306+
// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
//
// poly1305_auth_armv6 is the Go-callable entry point. It carves a 64-byte,
// 64-byte-aligned state block out of its own frame, initialises it from key,
// feeds all whole 16-byte blocks of m to poly1305_blocks_armv6 and hands the
// remaining mlen%16 bytes plus the output pointer to
// poly1305_finish_ext_armv6. m is not dereferenced when mlen is 0.
//
// R4-R8 hold the arguments and the original R13 across the three calls; the
// helpers save and restore those registers themselves.
//
// NOTE(review): worst-case stack use below the entry R13 looks like up to
// 63 (alignment) + 64 (state) + 36 + 16 (finish) + 36 + 128 (blocks) bytes,
// and the helpers are NOSPLIT -- confirm this fits the declared 280-byte frame.
307+
TEXT ·poly1305_auth_armv6(SB),0,$280-16
308+
MOVW out+0(FP), R4
309+
MOVW m+4(FP), R5
310+
MOVW mlen+8(FP), R6
311+
MOVW key+12(FP), R7
312+
313+
// Remember the real stack pointer, then round R13 down to a multiple of 64
// and reserve 64 bytes for the state block.
MOVW R13, R8
314+
BIC $63, R13
315+
SUB $64, R13, R13
316+
// poly1305_init_ext_armv6(state, key)
MOVW R13, R0
317+
MOVW R7, R1
318+
BL poly1305_init_ext_armv6<>(SB)
319+
// R2 = mlen rounded down to a multiple of 16; skip the bulk call if zero.
BIC.S $15, R6, R2
320+
BEQ poly1305_auth_armv6_noblocks
321+
// poly1305_blocks_armv6(state, m, R2); afterwards R5/R6 describe the tail.
MOVW R13, R0
322+
MOVW R5, R1
323+
ADD R2, R5, R5
324+
SUB R2, R6, R6
325+
BL poly1305_blocks_armv6<>(SB)
326+
poly1305_auth_armv6_noblocks:
327+
// poly1305_finish_ext_armv6(state, tail pointer, tail length, out)
MOVW R13, R0
328+
MOVW R5, R1
329+
MOVW R6, R2
330+
MOVW R4, R3
331+
BL poly1305_finish_ext_armv6<>(SB)
332+
// Restore the stack pointer saved before the alignment adjustment.
MOVW R8, R13
333+
RET

poly1305/sum_arm.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Copyright 2015 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// +build arm,!gccgo,!appengine
6+
7+
package poly1305
8+
9+
// poly1305_auth_armv6 is implemented in poly1305_arm.s. It writes the
// 16-byte result for the mlen bytes starting at m, under key, into out.
// m may be nil when mlen is 0.
10+
11+
//go:noescape
12+
13+
func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
14+
15+
// Sum generates an authenticator for m using a one-time key and puts the
16+
// 16-byte result into out. Authenticating two different messages with the same
17+
// key allows an attacker to forge messages at will.
18+
func Sum(out *[16]byte, m []byte, key *[32]byte) {
19+
// mPtr stays nil for an empty message: taking &m[0] of a zero-length slice
// would panic.
var mPtr *byte
20+
if len(m) > 0 {
21+
mPtr = &m[0]
22+
}
23+
// The assembly routine takes the message as a raw pointer plus a uint32 length.
poly1305_auth_armv6(out, mPtr, uint32(len(m)), key)
24+
}

poly1305/sum_ref.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
// +build !amd64 gccgo appengine
5+
// +build !amd64,!arm gccgo appengine
66

77
package poly1305
88

0 commit comments

Comments
 (0)