src/hash/crc32/crc32_amd64.s
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"
#include "go_asm.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX     // CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)
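	// Argument layout: crc sits at crc+0(FP); the p slice header supplies
	// p+8(FP) (data pointer), p_len+16(FP) (length) and p_cap+24(FP)
	// (capacity, unused here); the uint32 result is written to ret+32(FP).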

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL   less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ   aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX
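	// (Worked example: if SI mod 8 == 3, BX becomes (3-1) XOR 7 = 5, i.e.
	// the 8-3 = 5 bytes needed for alignment; bits 0 and 2 of BX are set,
	// so the steps below consume 1 byte and then 4 bytes before "aligned".)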

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI

align_2:
	BTQ $1, BX
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL   less_than_8

	CRC32Q (SI), AX
	ADDQ   $8, SI
	SUBQ   $8, CX
	JMP    aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
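	// CX is below 8 here (either the early check or the aligned loop
	// guarantees it), so bits 2, 1 and 0 of CX select the final 4-, 2- and
	// 1-byte CRC32 steps.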
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ   $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ   $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//	crc1, crc2, crc3 uint32,
//	a, b, c []byte,
//	rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
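// Frame layout used below: crc1/crc2/crc3 at 0/4/8(FP); the a, b and c
// slice headers start at 16, 40 and 64(FP); rounds is at 88(FP); and the
// three results are stored at 96, 100 and 104(FP).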
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8  // data pointer
	MOVQ b+40(FP), R9  // data pointer
	MOVQ c+64(FP), R10 // data pointer

	MOVL rounds+88(FP), R11
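	// Each pass through the loop below consumes 24 bytes from each buffer
	// via three independent CRC32Q chains (AX, CX and DX); interleaving the
	// streams lets the CRC32 instructions overlap in the pipeline instead
	// of serializing on a single accumulator.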

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ  loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the Linux kernel, since they avoid the
// costly PSHUFB 16-byte reversal proposed in the original Intel paper.
// r2r1 is splatted so that it can be loaded with a single VMOVDQU64.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r2r1<>+16(SB)/8, $0x154442bd4
DATA r2r1<>+24(SB)/8, $0x1c6e41596
DATA r2r1<>+32(SB)/8, $0x154442bd4
DATA r2r1<>+40(SB)/8, $0x1c6e41596
DATA r2r1<>+48(SB)/8, $0x154442bd4
DATA r2r1<>+56(SB)/8, $0x1c6e41596

DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB), RODATA, $64
GLOBL r4r3<>(SB), RODATA, $16
GLOBL rupoly<>(SB), RODATA, $16
GLOBL r5<>(SB), RODATA, $8
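// Roles of the constants, following the folding scheme in the Intel paper
// referenced below: r2r1 holds the pair used to fold each 128-bit lane
// forward across 512 bits in the main loops, r4r3 the pair used to fold
// the accumulators together 128 bits at a time (remain64/remain16), r5 the
// constant for the final 128-to-64 bit reduction, and rupoly the constant
// pair (reciprocal and polynomial) for the Barrett reduction down to 32 bits.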

// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX     // Initial CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// Check feature support and that the length is at least 1024 bytes.
	CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
	JNE  useSSE42
	CMPQ CX, $1024
	JL   useSSE42

	// Use AVX512
	VPXORQ    Z0, Z0, Z0
	VMOVQ     AX, X0
	VMOVDQU64 (SI), Z1
	VPXORQ    Z0, Z1, Z1 // Merge initial CRC value into Z1
	ADDQ      $64, SI    // buf+=64
	SUBQ      $64, CX    // len-=64

	VMOVDQU64 r2r1<>+0(SB), Z0

loopback64Avx512:
	VMOVDQU64  (SI), Z11 // Load next
	VPCLMULQDQ $0x11, Z0, Z1, Z5
	VPCLMULQDQ $0, Z0, Z1, Z1
	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
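	// (Immediate 0x96 is the ternary-logic table for a three-way XOR, so
	// this computes Z1 = Z1 ^ Z5 ^ Z11: both halves of the folded product
	// are combined with the newly loaded data in a single instruction.)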

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE  loopback64Avx512

	// Unfold result into XMM1-XMM4 to match the SSE4 code below.
	VEXTRACTF32X4 $1, Z1, X2 // X2: second 128-bit lane
	VEXTRACTF32X4 $2, Z1, X3 // X3: third 128-bit lane
	VEXTRACTF32X4 $3, Z1, X4 // X4: fourth 128-bit lane
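	// (X1 is the low 128 bits of Z1 and already holds the first lane, so
	// only lanes 1-3 need extracting. VZEROUPPER below avoids AVX-to-SSE
	// transition penalties before the legacy SSE code at remain64.)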
	VZEROUPPER
	JMP remain64

	PCALIGN $16
useSSE42:
	MOVQ  AX, X0  // Initial CRC value
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR  X0, X1
	ADDQ  $64, SI // buf+=64
	SUBQ  $64, CX // len-=64
	CMPQ  CX, $64 // Less than 64 bytes left?
	JB    remain64

	MOVOA r2r1<>+0(SB), X0
loopback64:
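	// Each iteration folds the four accumulators X1-X4 forward by 512 bits:
	// the low and high 64-bit halves of every lane are carryless-multiplied
	// by the two r2r1 constants and XORed with the next 64 bytes of input.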
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE  loopback64

	PCALIGN $16
	/* Fold result into a single register (X1) */
remain64:
	MOVOA r4r3<>+0(SB), X0
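	// Three identical fold steps follow: X1 is folded across 128 bits with
	// the r4r3 constants and combined with X2, X3 and X4 in turn.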

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X4, X1

	/* If there are fewer than 16 bytes left we are done */
	CMPQ CX, $16
	JB   finish

	/* Encode 16 bytes */
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X10, X1
	SUBQ  $16, CX
	ADDQ  $16, SI
	CMPQ  CX, $16
	JGE   remain16

finish:
	/* Fold final result into 32 bits and return it */
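	// (The remaining reduction from 128 bits to 32 follows the referenced
	// Intel paper: first reduce to 64 bits using the r5 constant, then apply
	// Barrett reduction with the rupoly constants; the 32-bit result lands
	// in the second doubleword of X1 and is extracted with PEXTRD below.)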
	PCMPEQB X3, X3
	PCLMULQDQ $1, X1, X0
	PSRLDQ  $8, X1
	PXOR    X0, X1

	MOVOA X1, X2
	MOVQ  r5<>+0(SB), X0

	/* Creates a 32-bit mask. Note that we don't care about the upper half. */
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND   X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR   X2, X1

	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND  X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND  X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR  X2, X1

	PEXTRD $1, X1, AX
	MOVL   AX, ret+32(FP)

	RET