Source file src/simd/archsimd/ops_internal_amd64.go
1 // Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. 2 3 //go:build goexperiment.simd 4 5 package archsimd 6 7 /* blend */ 8 9 // blend blends two vectors based on mask values, choosing either 10 // the first or the second based on whether the third is false or true 11 // 12 // Asm: VPBLENDVB, CPU Feature: AVX 13 func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16 14 15 // blend blends two vectors based on mask values, choosing either 16 // the first or the second based on whether the third is false or true 17 // 18 // Asm: VPBLENDVB, CPU Feature: AVX2 19 func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32 20 21 /* blendMasked */ 22 23 // blendMasked blends two vectors based on mask values, choosing either 24 // the first or the second based on whether the third is false or true 25 // 26 // This operation is applied selectively under a write mask. 27 // 28 // Asm: VPBLENDMB, CPU Feature: AVX512 29 func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64 30 31 // blendMasked blends two vectors based on mask values, choosing either 32 // the first or the second based on whether the third is false or true 33 // 34 // This operation is applied selectively under a write mask. 35 // 36 // Asm: VPBLENDMW, CPU Feature: AVX512 37 func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32 38 39 // blendMasked blends two vectors based on mask values, choosing either 40 // the first or the second based on whether the third is false or true 41 // 42 // This operation is applied selectively under a write mask. 43 // 44 // Asm: VPBLENDMD, CPU Feature: AVX512 45 func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 46 47 // blendMasked blends two vectors based on mask values, choosing either 48 // the first or the second based on whether the third is false or true 49 // 50 // This operation is applied selectively under a write mask. 
51 // 52 // Asm: VPBLENDMQ, CPU Feature: AVX512 53 func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 54 55 /* carrylessMultiply */ 56 57 // carrylessMultiply computes one of four possible Galois polynomial 58 // products of selected high and low halves of x and y, 59 // depending on the value of xyHiLo, returning the 128-bit 60 // product in the concatenated two elements of the result. 61 // Bit 0 selects the low (0) or high (1) element of x and 62 // bit 4 selects the low (0x00) or high (0x10) element of y. 63 // 64 // xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 65 // 66 // Asm: VPCLMULQDQ, CPU Feature: AVX 67 func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2 68 69 // carrylessMultiply computes one of four possible Galois polynomial 70 // products of selected high and low halves of each of the two 71 // 128-bit lanes of x and y, depending on the value of xyHiLo, 72 // and returns the two 128-bit products in the result's lanes. 73 // Bit 0 selects the low (0) or high (1) elements of x's lanes and 74 // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes. 75 // 76 // xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 77 // 78 // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ 79 func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4 80 81 // carrylessMultiply computes one of four possible Galois polynomial 82 // products of selected high and low halves of each of the four 83 // 128-bit lanes of x and y, depending on the value of xyHiLo, 84 // and returns the four 128-bit products in the result's lanes. 85 // Bit 0 selects the low (0) or high (1) elements of x's lanes and 86 // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
87 // 88 // xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 89 // 90 // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ 91 func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8 92 93 /* concatSelectedConstant */ 94 95 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 96 // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 97 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 98 // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns 99 // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). 100 // 101 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 102 // 103 // Asm: VSHUFPS, CPU Feature: AVX 104 func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4 105 106 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 107 // halves of the output. The selection is chosen by the constant parameter hilo 108 // where hi and lo are each one bit specifying which 64-bit element to select 109 // from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) 110 // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, 111 // selecting from y, is 1, and selects 7. 112 // 113 // hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 114 // 115 // Asm: VSHUFPD, CPU Feature: AVX 116 func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2 117 118 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 119 // halves of the output.
The selection is chosen by the constant parameter h1h0l1l0 120 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 121 // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns 122 // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). 123 // 124 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 125 // 126 // Asm: VSHUFPS, CPU Feature: AVX 127 func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4 128 129 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 130 // halves of the output. The selection is chosen by the constant parameter hilo 131 // where hi and lo are each one bit specifying which 64-bit element to select 132 // from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) 133 // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, 134 // selecting from y, is 1, and selects 7. 135 // 136 // hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 137 // 138 // Asm: VSHUFPD, CPU Feature: AVX 139 func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2 140 141 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 142 // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 143 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 144 // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns 145 // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). 146 // 147 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
148 // 149 // Asm: VSHUFPS, CPU Feature: AVX 150 func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4 151 152 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 153 // halves of the output. The selection is chosen by the constant parameter hilo 154 // where hi and lo are each one bit specifying which 64-bit element to select 155 // from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) 156 // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, 157 // selecting from y, is 1, and selects 7. 158 // 159 // hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 160 // 161 // Asm: VSHUFPD, CPU Feature: AVX 162 func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2 163 164 /* concatSelectedConstantGrouped */ 165 166 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 167 // into the lower and upper halves of corresponding subvectors of the output. 168 // The selection is chosen by the constant parameter h1h0l1l0 169 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 170 // For example, 171 // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) 172 // returns {2,0,5,7,10,8,13,15} 173 // (don't forget that the binary constant is written big-endian). 174 // 175 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 176 // 177 // Asm: VSHUFPS, CPU Feature: AVX 178 func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8 179 180 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 181 // into the lower and upper halves of corresponding subvectors of the output. 
182 // The selection is chosen by the constant parameter h1h0l1l0 183 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 184 // For example, 185 // 186 // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( 187 // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) 188 // 189 // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} 190 // 191 // (don't forget that the binary constant is written big-endian). 192 // 193 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 194 // 195 // Asm: VSHUFPS, CPU Feature: AVX512 196 func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16 197 198 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 199 // into the lower and upper halves of corresponding subvectors of the output. 200 // The selections are specified by the constant parameter hilos where each 201 // hi and lo pair select 64-bit elements from the corresponding 128-bit 202 // subvectors of x and y. 203 // 204 // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) 205 // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least 206 // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 207 // then 1, selecting element 1 from x's upper 128 bits (9), then 1, 208 // selecting element 1 from y's upper 128 bits (11). 209 // This differs from the same method applied to a 32x8 vector, where 210 // the 8-bit constant performs the same selection on both subvectors. 211 // 212 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
213 // 214 // Asm: VSHUFPD, CPU Feature: AVX 215 func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4 216 217 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 218 // into the lower and upper halves of corresponding subvectors of the output. 219 // The selections are specified by the constant parameter hilos where each 220 // hi and lo pair select 64-bit elements from the corresponding 128-bit 221 // subvectors of x and y. 222 // 223 // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) 224 // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 225 // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 226 // then 1, selecting element 1 from x's next 128 bits (9), then 1, 227 // selecting element 1 from y's next 128 bits (11). The next two 0 bits select 228 // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two 229 // 1 bits select the upper elements from x and y's last 128 bits (17, 19). 230 // This differs from the same method applied to a 32x8 or 32x16 vector, where 231 // the 8-bit constant performs the same selection on all the subvectors. 232 // 233 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 234 // 235 // Asm: VSHUFPD, CPU Feature: AVX512 236 func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8 237 238 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 239 // into the lower and upper halves of corresponding subvectors of the output. 240 // The selection is chosen by the constant parameter h1h0l1l0 241 // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
242 // For example, 243 // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) 244 // returns {2,0,5,7,10,8,13,15} 245 // (don't forget that the binary constant is written big-endian). 246 // 247 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 248 // 249 // Asm: VSHUFPS, CPU Feature: AVX 250 func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8 251 252 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 253 // into the lower and upper halves of corresponding subvectors of the output. 254 // The selection is chosen by the constant parameter h1h0l1l0 255 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 256 // For example, 257 // 258 // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( 259 // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) 260 // 261 // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} 262 // 263 // (don't forget that the binary constant is written big-endian). 264 // 265 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 266 // 267 // Asm: VSHUFPS, CPU Feature: AVX512 268 func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16 269 270 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 271 // into the lower and upper halves of corresponding subvectors of the output. 272 // The selections are specified by the constant parameter hilos where each 273 // hi and lo pair select 64-bit elements from the corresponding 128-bit 274 // subvectors of x and y. 
275 // 276 // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) 277 // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least 278 // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 279 // then 1, selecting element 1 from x's upper 128 bits (9), then 1, 280 // selecting element 1 from y's upper 128 bits (11). 281 // This differs from the same method applied to a 32x8 vector, where 282 // the 8-bit constant performs the same selection on both subvectors. 283 // 284 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 285 // 286 // Asm: VSHUFPD, CPU Feature: AVX 287 func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4 288 289 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 290 // into the lower and upper halves of corresponding subvectors of the output. 291 // The selections are specified by the constant parameter hilos where each 292 // hi and lo pair select 64-bit elements from the corresponding 128-bit 293 // subvectors of x and y. 294 // 295 // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) 296 // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 297 // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 298 // then 1, selecting element 1 from x's next 128 bits (9), then 1, 299 // selecting element 1 from y's next 128 bits (11). The next two 0 bits select 300 // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two 301 // 1 bits select the upper elements from x and y's last 128 bits (17, 19). 302 // This differs from the same method applied to a 32x8 or 32x16 vector, where 303 // the 8-bit constant performs the same selection on all the subvectors.
304 // 305 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 306 // 307 // Asm: VSHUFPD, CPU Feature: AVX512 308 func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8 309 310 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 311 // into the lower and upper halves of corresponding subvectors of the output. 312 // The selection is chosen by the constant parameter h1h0l1l0 313 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 314 // For example, 315 // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) 316 // returns {2,0,5,7,10,8,13,15} 317 // (don't forget that the binary constant is written big-endian). 318 // 319 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 320 // 321 // Asm: VSHUFPS, CPU Feature: AVX 322 func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8 323 324 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 325 // into the lower and upper halves of corresponding subvectors of the output. 326 // The selection is chosen by the constant parameter h1h0l1l0 327 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 328 // For example, 329 // 330 // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( 331 // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) 332 // 333 // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} 334 // 335 // (don't forget that the binary constant is written big-endian). 336 // 337 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
338 // 339 // Asm: VSHUFPS, CPU Feature: AVX512 340 func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16 341 342 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 343 // into the lower and upper halves of corresponding subvectors of the output. 344 // The selections are specified by the constant parameter hilos where each 345 // hi and lo pair select 64-bit elements from the corresponding 128-bit 346 // subvectors of x and y. 347 // 348 // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) 349 // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least 350 // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 351 // then 1, selecting element 1 from x's upper 128 bits (9), then 1, 352 // selecting element 1 from y's upper 128 bits (11). 353 // This differs from the same method applied to a 32x8 vector, where 354 // the 8-bit constant performs the same selection on both subvectors. 355 // 356 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 357 // 358 // Asm: VSHUFPD, CPU Feature: AVX 359 func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4 360 361 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 362 // into the lower and upper halves of corresponding subvectors of the output. 363 // The selections are specified by the constant parameter hilos where each 364 // hi and lo pair select 64-bit elements from the corresponding 128-bit 365 // subvectors of x and y. 
366 // 367 // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) 368 // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 369 // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 370 // then 1, selecting element 1 from x's next 128 bits (9), then 1, 371 // selecting element 1 from y's next 128 bits (11). The next two 0 bits select 372 // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two 373 // 1 bits select the upper elements from x and y's last 128 bits (17, 19). 374 // This differs from the same method applied to a 32x8 or 32x16 vector, where 375 // the 8-bit constant performs the same selection on all the subvectors. 376 // 377 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 378 // 379 // Asm: VSHUFPD, CPU Feature: AVX512 380 func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 381 382 /* permuteScalars */ 383 384 // permuteScalars performs a permutation of vector x using constant indices: 385 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} 386 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 387 // 388 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 389 // 390 // Asm: VPSHUFD, CPU Feature: AVX 391 func (x Int32x4) permuteScalars(indices uint8) Int32x4 392 393 // permuteScalars performs a permutation of vector x using constant indices: 394 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} 395 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 396 // 397 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
398 // 399 // Asm: VPSHUFD, CPU Feature: AVX 400 func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 401 402 /* permuteScalarsGrouped */ 403 404 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 405 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 406 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 407 // Each group is of size 128-bit. 408 // 409 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 410 // 411 // Asm: VPSHUFD, CPU Feature: AVX2 412 func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 413 414 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 415 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 416 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 417 // Each group is of size 128-bit. 418 // 419 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 420 // 421 // Asm: VPSHUFD, CPU Feature: AVX512 422 func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 423 424 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 425 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 426 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 427 // Each group is of size 128-bit. 428 // 429 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
430 // 431 // Asm: VPSHUFD, CPU Feature: AVX2 432 func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8 433 434 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 435 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 436 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 437 // Each group is of size 128-bit. 438 // 439 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 440 // 441 // Asm: VPSHUFD, CPU Feature: AVX512 442 func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 443 444 /* permuteScalarsHi */ 445 446 // permuteScalarsHi performs a permutation of vector x using constant indices: 447 // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} 448 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 449 // 450 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 451 // 452 // Asm: VPSHUFHW, CPU Feature: AVX512 453 func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8 454 455 // permuteScalarsHi performs a permutation of vector x using constant indices: 456 // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} 457 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 458 // 459 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
460 // 461 // Asm: VPSHUFHW, CPU Feature: AVX512 462 func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8 463 464 /* permuteScalarsHiGrouped */ 465 466 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 467 // result = 468 // 469 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 470 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 471 // 472 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 473 // Each group is of size 128-bit. 474 // 475 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 476 // 477 // Asm: VPSHUFHW, CPU Feature: AVX2 478 func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16 479 480 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 481 // result = 482 // 483 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 484 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 485 // 486 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 487 // Each group is of size 128-bit. 488 // 489 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
490 // 491 // Asm: VPSHUFHW, CPU Feature: AVX512 492 func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32 493 494 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 495 // result = 496 // 497 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 498 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 499 // 500 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 501 // Each group is of size 128-bit. 502 // 503 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 504 // 505 // Asm: VPSHUFHW, CPU Feature: AVX2 506 func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16 507 508 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 509 // result = 510 // 511 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 512 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 513 // 514 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 515 // Each group is of size 128-bit. 516 // 517 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 518 // 519 // Asm: VPSHUFHW, CPU Feature: AVX512 520 func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 521 522 /* permuteScalarsLo */ 523 524 // permuteScalarsLo performs a permutation of vector x using constant indices: 525 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} 526 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 
527 // 528 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 529 // 530 // Asm: VPSHUFLW, CPU Feature: AVX512 531 func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8 532 533 // permuteScalarsLo performs a permutation of vector x using constant indices: 534 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} 535 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 536 // 537 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 538 // 539 // Asm: VPSHUFLW, CPU Feature: AVX512 540 func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8 541 542 /* permuteScalarsLoGrouped */ 543 544 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 545 // 546 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 547 // x_group1[indices[0:2]], ...} 548 // 549 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 550 // Each group is of size 128-bit. 551 // 552 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 553 // 554 // Asm: VPSHUFLW, CPU Feature: AVX2 555 func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16 556 557 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 558 // 559 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 560 // x_group1[indices[0:2]], ...} 561 // 562 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 563 // Each group is of size 128-bit.
564 // 565 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 566 // 567 // Asm: VPSHUFLW, CPU Feature: AVX512 568 func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32 569 570 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 571 // 572 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 573 // x_group1[indices[0:2]], ...} 574 // 575 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 576 // Each group is of size 128-bit. 577 // 578 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 579 // 580 // Asm: VPSHUFLW, CPU Feature: AVX2 581 func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16 582 583 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 584 // 585 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 586 // x_group1[indices[0:2]], ...} 587 // 588 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 589 // Each group is of size 128-bit. 590 // 591 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 592 // 593 // Asm: VPSHUFLW, CPU Feature: AVX512 594 func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32 595 596 /* tern */ 597 598 // tern performs a logical operation on three vectors based on the 8-bit truth table. 599 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 600 // 601 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
602 // 603 // Asm: VPTERNLOGD, CPU Feature: AVX512 604 func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 605 606 // tern performs a logical operation on three vectors based on the 8-bit truth table. 607 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 608 // 609 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 610 // 611 // Asm: VPTERNLOGD, CPU Feature: AVX512 612 func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 613 614 // tern performs a logical operation on three vectors based on the 8-bit truth table. 615 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 616 // 617 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 618 // 619 // Asm: VPTERNLOGD, CPU Feature: AVX512 620 func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 621 622 // tern performs a logical operation on three vectors based on the 8-bit truth table. 623 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 624 // 625 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 626 // 627 // Asm: VPTERNLOGQ, CPU Feature: AVX512 628 func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 629 630 // tern performs a logical operation on three vectors based on the 8-bit truth table. 631 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 632 // 633 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 634 // 635 // Asm: VPTERNLOGQ, CPU Feature: AVX512 636 func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 637 638 // tern performs a logical operation on three vectors based on the 8-bit truth table. 
639 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 640 // 641 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 642 // 643 // Asm: VPTERNLOGQ, CPU Feature: AVX512 644 func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 645 646 // tern performs a logical operation on three vectors based on the 8-bit truth table. 647 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 648 // 649 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 650 // 651 // Asm: VPTERNLOGD, CPU Feature: AVX512 652 func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 653 654 // tern performs a logical operation on three vectors based on the 8-bit truth table. 655 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 656 // 657 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 658 // 659 // Asm: VPTERNLOGD, CPU Feature: AVX512 660 func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 661 662 // tern performs a logical operation on three vectors based on the 8-bit truth table. 663 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 664 // 665 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 666 // 667 // Asm: VPTERNLOGD, CPU Feature: AVX512 668 func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 669 670 // tern performs a logical operation on three vectors based on the 8-bit truth table. 671 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 672 // 673 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
674 // 675 // Asm: VPTERNLOGQ, CPU Feature: AVX512 676 func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 677 678 // tern performs a logical operation on three vectors based on the 8-bit truth table. 679 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 680 // 681 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 682 // 683 // Asm: VPTERNLOGQ, CPU Feature: AVX512 684 func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 685 686 // tern performs a logical operation on three vectors based on the 8-bit truth table. 687 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 688 // 689 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 690 // 691 // Asm: VPTERNLOGQ, CPU Feature: AVX512 692 func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 693