src/hash/crc32/crc32_amd64.s
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"
#include "go_asm.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX     // CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)
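	// Argument layout: crc sits at crc+0(FP); the p slice header supplies
	// p+8(FP) (data pointer), p_len+16(FP) (length) and p_cap+24(FP)
	// (capacity, unused here); the uint32 result is written to ret+32(FP).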

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL   less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ   aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX
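	// (Worked example: if SI mod 8 == 3, BX becomes (3-1) XOR 7 = 5, i.e.
	// the 8-3 = 5 bytes needed for alignment; bits 0 and 2 of BX are set,
	// so the steps below consume 1 byte and then 4 bytes before "aligned".)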

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI

align_2:
	BTQ $1, BX
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL   less_than_8

	CRC32Q (SI), AX
	ADDQ   $8, SI
	SUBQ   $8, CX
	JMP    aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
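	// CX is below 8 here (either the early check or the aligned loop
	// guarantees it), so bits 2, 1 and 0 of CX select the final 4-, 2- and
	// 1-byte CRC32 steps.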
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ   $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ   $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//	crc1, crc2, crc3 uint32,
//	a, b, c []byte,
//	rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
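// Frame layout used below: crc1/crc2/crc3 at 0/4/8(FP); the a, b and c
// slice headers start at 16, 40 and 64(FP); rounds is at 88(FP); and the
// three results are stored at 96, 100 and 104(FP).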
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8  // data pointer
	MOVQ b+40(FP), R9  // data pointer
	MOVQ c+64(FP), R10 // data pointer

	MOVL rounds+88(FP), R11
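	// Each pass through the loop below consumes 24 bytes from each buffer
	// via three independent CRC32Q chains (AX, CX and DX); interleaving the
	// streams lets the CRC32 instructions overlap in the pipeline instead
	// of serializing on a single accumulator.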

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ  loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the Linux kernel, since they avoid the
// costly PSHUFB 16-byte reversal proposed in the original Intel paper.
// r2r1 is splatted so that it can be loaded with a single VMOVDQU64.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r2r1<>+16(SB)/8, $0x154442bd4
DATA r2r1<>+24(SB)/8, $0x1c6e41596
DATA r2r1<>+32(SB)/8, $0x154442bd4
DATA r2r1<>+40(SB)/8, $0x1c6e41596
DATA r2r1<>+48(SB)/8, $0x154442bd4
DATA r2r1<>+56(SB)/8, $0x1c6e41596

DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB), RODATA, $64
GLOBL r4r3<>(SB), RODATA, $16
GLOBL rupoly<>(SB), RODATA, $16
GLOBL r5<>(SB), RODATA, $8
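// Roles of the constants, following the folding scheme in the Intel paper
// referenced below: r2r1 holds the pair used to fold each 128-bit lane
// forward across 512 bits in the main loops, r4r3 the pair used to fold
// the accumulators together 128 bits at a time (remain64/remain16), r5 the
// constant for the final 128-to-64 bit reduction, and rupoly the constant
// pair (reciprocal and polynomial) for the Barrett reduction down to 32 bits.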

// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX     // Initial CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// Check feature support and that the length is at least 1024 bytes.
	CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
	JNE  useSSE42
	CMPQ CX, $1024
	JL   useSSE42

	// Use AVX512
	VPXORQ    Z0, Z0, Z0
	VMOVQ     AX, X0
	VMOVDQU64 (SI), Z1
	VPXORQ    Z0, Z1, Z1 // Merge initial CRC value into Z1
	ADDQ      $64, SI    // buf+=64
	SUBQ      $64, CX    // len-=64

	VMOVDQU64 r2r1<>+0(SB), Z0

loopback64Avx512:
	VMOVDQU64  (SI), Z11 // Load next
	VPCLMULQDQ $0x11, Z0, Z1, Z5
	VPCLMULQDQ $0, Z0, Z1, Z1
	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
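	// (Immediate 0x96 is the ternary-logic table for a three-way XOR, so
	// this computes Z1 = Z1 ^ Z5 ^ Z11: both halves of the folded product
	// are combined with the newly loaded data in a single instruction.)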

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE  loopback64Avx512

	// Unfold result into XMM1-XMM4 to match the SSE4 code below.
	VEXTRACTF32X4 $1, Z1, X2 // X2: second 128-bit lane
	VEXTRACTF32X4 $2, Z1, X3 // X3: third 128-bit lane
	VEXTRACTF32X4 $3, Z1, X4 // X4: fourth 128-bit lane
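	// (X1 is the low 128 bits of Z1 and already holds the first lane, so
	// only lanes 1-3 need extracting. VZEROUPPER below avoids AVX-to-SSE
	// transition penalties before the legacy SSE code at remain64.)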
	VZEROUPPER
	JMP remain64

	PCALIGN $16
useSSE42:
	MOVQ  AX, X0  // Initial CRC value
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR  X0, X1
	ADDQ  $64, SI // buf+=64
	SUBQ  $64, CX // len-=64
	CMPQ  CX, $64 // Less than 64 bytes left?
	JB    remain64

	MOVOA r2r1<>+0(SB), X0
loopback64:
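	// Each iteration folds the four accumulators X1-X4 forward by 512 bits:
	// the low and high 64-bit halves of every lane are carryless-multiplied
	// by the two r2r1 constants and XORed with the next 64 bytes of input.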
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE  loopback64

	PCALIGN $16
	/* Fold result into a single register (X1) */
remain64:
	MOVOA r4r3<>+0(SB), X0
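	// Three identical fold steps follow: X1 is folded across 128 bits with
	// the r4r3 constants and combined with X2, X3 and X4 in turn.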

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X4, X1

	/* If there are fewer than 16 bytes left we are done */
	CMPQ CX, $16
	JB   finish

	/* Encode 16 bytes */
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR  X5, X1
	PXOR  X10, X1
	SUBQ  $16, CX
	ADDQ  $16, SI
	CMPQ  CX, $16
	JGE   remain16

finish:
	/* Fold final result into 32 bits and return it */
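	// (The remaining reduction from 128 bits to 32 follows the referenced
	// Intel paper: first reduce to 64 bits using the r5 constant, then apply
	// Barrett reduction with the rupoly constants; the 32-bit result lands
	// in the second doubleword of X1 and is extracted with PEXTRD below.)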
	PCMPEQB X3, X3
	PCLMULQDQ $1, X1, X0
	PSRLDQ  $8, X1
	PXOR    X0, X1

	MOVOA X1, X2
	MOVQ  r5<>+0(SB), X0

	/* Creates a 32-bit mask. Note that we don't care about the upper half. */
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND   X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR   X2, X1

	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND  X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND  X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR  X2, X1

	PEXTRD $1, X1, AX
	MOVL   AX, ret+32(FP)

	RET