Source file src/simd/archsimd/internal/simd_test/simd_test.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package simd_test
     8  
     9  import (
    10  	"fmt"
    11  	"os"
    12  	"reflect"
    13  	"simd/archsimd"
    14  	"slices"
    15  	"testing"
    16  )
    17  
    18  func TestMain(m *testing.M) {
    19  	if !archsimd.X86.AVX() {
    20  		fmt.Fprintln(os.Stderr, "Skipping tests: AVX is not available")
    21  		os.Exit(0)
    22  	}
    23  	os.Exit(m.Run())
    24  }
    25  
    26  var sink any
    27  
    28  func TestType(t *testing.T) {
    29  	// Testing:
    30  	// - Defined as another struct's field is ok
    31  	// - Pointer is ok
    32  	// - Type defition is ok
    33  	// - Type alias is ok
    34  	// - Type conversion is ok
    35  	// - Conversion to interface is ok
    36  	type alias = archsimd.Int32x4
    37  	type maskT archsimd.Mask32x4
    38  	type myStruct struct {
    39  		x alias
    40  		y *archsimd.Int32x4
    41  		z maskT
    42  	}
    43  	vals := [4]int32{1, 2, 3, 4}
    44  	v := myStruct{x: archsimd.LoadInt32x4(&vals)}
    45  	// masking elements 1 and 2.
    46  	want := []int32{2, 4, 0, 0}
    47  	y := archsimd.LoadInt32x4(&vals)
    48  	v.y = &y
    49  	sink = y
    50  
    51  	if !archsimd.X86.AVX512GFNI() {
    52  		t.Skip("Test requires X86.AVX512, not available on this hardware")
    53  		return
    54  	}
    55  	v.z = maskT(archsimd.Mask32x4FromBits(0b0011))
    56  	*v.y = v.y.Add(v.x).Masked(archsimd.Mask32x4(v.z))
    57  
    58  	got := [4]int32{}
    59  	v.y.Store(&got)
    60  	checkSlices(t, got[:], want)
    61  }
    62  
    63  func TestUncomparable(t *testing.T) {
    64  	// Test that simd vectors are not comparable
    65  	var x, y any = archsimd.LoadUint32x4(&[4]uint32{1, 2, 3, 4}), archsimd.LoadUint32x4(&[4]uint32{5, 6, 7, 8})
    66  	shouldPanic := func(fn func()) {
    67  		defer func() {
    68  			if recover() == nil {
    69  				panic("did not panic")
    70  			}
    71  		}()
    72  		fn()
    73  	}
    74  	shouldPanic(func() { _ = x == y })
    75  }
    76  
    77  func TestFuncValue(t *testing.T) {
    78  	// Test that simd intrinsic can be used as a function value.
    79  	xv := [4]int32{1, 2, 3, 4}
    80  	yv := [4]int32{5, 6, 7, 8}
    81  	want := []int32{6, 8, 10, 12}
    82  	x := archsimd.LoadInt32x4(&xv)
    83  	y := archsimd.LoadInt32x4(&yv)
    84  	fn := archsimd.Int32x4.Add
    85  	sink = fn
    86  	x = fn(x, y)
    87  	got := [4]int32{}
    88  	x.Store(&got)
    89  	checkSlices(t, got[:], want)
    90  }
    91  
    92  func TestReflectMethod(t *testing.T) {
    93  	// Test that simd intrinsic can be accessed via reflection.
    94  	// NOTE: we don't yet support reflect method.Call.
    95  	xv := [4]int32{1, 2, 3, 4}
    96  	yv := [4]int32{5, 6, 7, 8}
    97  	want := []int32{6, 8, 10, 12}
    98  	x := archsimd.LoadInt32x4(&xv)
    99  	y := archsimd.LoadInt32x4(&yv)
   100  	m, ok := reflect.TypeOf(x).MethodByName("Add")
   101  	if !ok {
   102  		t.Fatal("Add method not found")
   103  	}
   104  	fn := m.Func.Interface().(func(x, y archsimd.Int32x4) archsimd.Int32x4)
   105  	x = fn(x, y)
   106  	got := [4]int32{}
   107  	x.Store(&got)
   108  	checkSlices(t, got[:], want)
   109  }
   110  
   111  func TestVectorConversion(t *testing.T) {
   112  	if !archsimd.X86.AVX512GFNI() {
   113  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   114  		return
   115  	}
   116  	xv := [4]int32{1, 2, 3, 4}
   117  	x := archsimd.LoadInt32x4(&xv)
   118  	xPromoted := x.AsInt64x2()
   119  	xPromotedDemoted := xPromoted.AsInt32x4()
   120  	got := [4]int32{}
   121  	xPromotedDemoted.Store(&got)
   122  	for i := range 4 {
   123  		if xv[i] != got[i] {
   124  			t.Errorf("Result at %d incorrect: want %d, got %d", i, xv[i], got[i])
   125  		}
   126  	}
   127  }
   128  
   129  func TestMaskConversion(t *testing.T) {
   130  	if !archsimd.X86.AVX512GFNI() {
   131  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   132  		return
   133  	}
   134  	x := archsimd.LoadInt32x4Slice([]int32{5, 0, 7, 0})
   135  	mask := archsimd.Int32x4{}.Sub(x).ToMask()
   136  	y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4}).Add(x).Masked(mask)
   137  	want := [4]int32{6, 0, 10, 0}
   138  	got := make([]int32, 4)
   139  	y.StoreSlice(got)
   140  	checkSlices(t, got[:], want[:])
   141  }
   142  
   143  func TestPermute(t *testing.T) {
   144  	if !archsimd.X86.AVX512() {
   145  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   146  		return
   147  	}
   148  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
   149  	indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
   150  	want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
   151  	got := make([]int64, 8)
   152  	archsimd.LoadInt64x8Slice(x).Permute(archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
   153  	checkSlices(t, got, want)
   154  }
   155  
   156  func TestPermuteOrZero(t *testing.T) {
   157  	x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
   158  	indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
   159  	want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
   160  	got := make([]uint8, len(x))
   161  	archsimd.LoadUint8x16Slice(x).PermuteOrZero(archsimd.LoadInt8x16Slice(indices)).StoreSlice(got)
   162  	checkSlices(t, got, want)
   163  }
   164  
   165  func TestConcatPermute(t *testing.T) {
   166  	if !archsimd.X86.AVX512() {
   167  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   168  		return
   169  	}
   170  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
   171  	y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
   172  	indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
   173  	want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
   174  	got := make([]int64, 8)
   175  	archsimd.LoadInt64x8Slice(x).ConcatPermute(archsimd.LoadInt64x8Slice(y), archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
   176  	checkSlices(t, got, want)
   177  }
   178  
   179  func TestCompress(t *testing.T) {
   180  	if !archsimd.X86.AVX512() {
   181  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   182  		return
   183  	}
   184  	v1234 := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
   185  	v2400 := v1234.Compress(archsimd.Mask32x4FromBits(0b1010))
   186  	got := make([]int32, 4)
   187  	v2400.StoreSlice(got)
   188  	want := []int32{2, 4, 0, 0}
   189  	if !slices.Equal(got, want) {
   190  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
   191  	}
   192  }
   193  
   194  func TestExpand(t *testing.T) {
   195  	if !archsimd.X86.AVX512() {
   196  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   197  		return
   198  	}
   199  	v3400 := archsimd.LoadInt32x4Slice([]int32{3, 4, 0, 0})
   200  	v2400 := v3400.Expand(archsimd.Mask32x4FromBits(0b1010))
   201  	got := make([]int32, 4)
   202  	v2400.StoreSlice(got)
   203  	want := []int32{0, 3, 0, 4}
   204  	if !slices.Equal(got, want) {
   205  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
   206  	}
   207  }
   208  
   209  var testShiftAllVal uint64 = 3
   210  
   211  func TestShiftAll(t *testing.T) {
   212  	got := make([]int32, 4)
   213  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(2).StoreSlice(got)
   214  	for _, v := range got {
   215  		if v != 0b1100 {
   216  			t.Errorf("expect 0b1100, got %b", v)
   217  		}
   218  	}
   219  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(testShiftAllVal).StoreSlice(got)
   220  	for _, v := range got {
   221  		if v != 0b11000 {
   222  			t.Errorf("expect 0b11000, got %b", v)
   223  		}
   224  	}
   225  }
   226  
   227  func TestSlicesInt8(t *testing.T) {
   228  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   229  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   230  	v := archsimd.LoadInt8x32Slice(a)
   231  	b := make([]int8, 32, 32)
   232  	v.StoreSlice(b)
   233  	checkSlices(t, a, b)
   234  }
   235  
   236  func TestSlicesInt8SetElem(t *testing.T) {
   237  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   238  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   239  	v := archsimd.LoadInt8x16Slice(a)
   240  
   241  	v = v.SetElem(3, 13)
   242  	a[3] = 13
   243  
   244  	b := make([]int8, 16, 16)
   245  	v.StoreSlice(b)
   246  	checkSlices(t, a, b)
   247  }
   248  
   249  func TestSlicesInt8GetElem(t *testing.T) {
   250  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   251  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   252  	v := archsimd.LoadInt8x16Slice(a)
   253  	e := v.GetElem(2)
   254  	if e != a[2] {
   255  		t.Errorf("GetElem(2) = %d != a[2] = %d", e, a[2])
   256  	}
   257  
   258  }
   259  
   260  func TestSlicesInt8TooShortLoad(t *testing.T) {
   261  	defer func() {
   262  		if r := recover(); r != nil {
   263  			t.Logf("Saw EXPECTED panic %v", r)
   264  		} else {
   265  			t.Errorf("Did not see expected panic")
   266  		}
   267  	}()
   268  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   269  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic
   270  	v := archsimd.LoadInt8x32Slice(a)
   271  	b := make([]int8, 32, 32)
   272  	v.StoreSlice(b)
   273  	checkSlices(t, a, b)
   274  }
   275  
   276  func TestSlicesInt8TooShortStore(t *testing.T) {
   277  	defer func() {
   278  		if r := recover(); r != nil {
   279  			t.Logf("Saw EXPECTED panic %v", r)
   280  		} else {
   281  			t.Errorf("Did not see expected panic")
   282  		}
   283  	}()
   284  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   285  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   286  	v := archsimd.LoadInt8x32Slice(a)
   287  	b := make([]int8, 31) // TOO SHORT, should panic
   288  	v.StoreSlice(b)
   289  	checkSlices(t, a, b)
   290  }
   291  
   292  func TestSlicesFloat64(t *testing.T) {
   293  	a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine
   294  	v := archsimd.LoadFloat64x4Slice(a)
   295  	b := make([]float64, 4, 4)
   296  	v.StoreSlice(b)
   297  	for i := range b {
   298  		if a[i] != b[i] {
   299  			t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
   300  		}
   301  	}
   302  }
   303  
// TestMergeLocals runs testMergeLocalswrapper with the method expression
// archsimd.Int64x4.Add passed as an ordinary function value.
//
// TODO: try to reduce this test to be smaller.
func TestMergeLocals(t *testing.T) {
	testMergeLocalswrapper(t, archsimd.Int64x4.Add)
}
   308  
// forceSpill is an empty function that is marked noinline, so a call to it is
// not inlined away.
// NOTE(review): presumably its purpose is to force vector values that are
// live across the call to be spilled - inferred from the name; confirm.
//
//go:noinline
func forceSpill() {}

// testMergeLocalswrapper loads two Int64x4 vectors, calls forceSpill while
// both are still needed, applies op to them, and compares the stored result
// with hard-coded expected values (the element-wise sums of s0 and s1).
func testMergeLocalswrapper(t *testing.T, op func(archsimd.Int64x4, archsimd.Int64x4) archsimd.Int64x4) {
	t.Helper()
	s0 := []int64{0, 1, 2, 3}
	s1 := []int64{-1, 0, -1, 0}
	want := []int64{-1, 1, 1, 3}
	v := archsimd.LoadInt64x4Slice(s0)
	m := archsimd.LoadInt64x4Slice(s1)
	// Both v and m are used after this call.
	forceSpill()
	got := make([]int64, 4)
	gotv := op(v, m)
	gotv.StoreSlice(got)
	for i := range len(want) {
		if !(got[i] == want[i]) {
			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
		}
	}
}
   329  
   330  func TestBitMaskFromBits(t *testing.T) {
   331  	if !archsimd.X86.AVX512() {
   332  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   333  		return
   334  	}
   335  	results := [2]int64{}
   336  	want := [2]int64{0, 6}
   337  	m := archsimd.Mask64x2FromBits(0b10)
   338  	archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
   339  	for i := range 2 {
   340  		if results[i] != want[i] {
   341  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   342  		}
   343  	}
   344  }
   345  
   346  var maskForTestBitMaskFromBitsLoad = uint8(0b10)
   347  
   348  func TestBitMaskFromBitsLoad(t *testing.T) {
   349  	if !archsimd.X86.AVX512() {
   350  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   351  		return
   352  	}
   353  	results := [2]int64{}
   354  	want := [2]int64{0, 6}
   355  	m := archsimd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
   356  	archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
   357  	for i := range 2 {
   358  		if results[i] != want[i] {
   359  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   360  		}
   361  	}
   362  }
   363  
   364  func TestBitMaskToBits(t *testing.T) {
   365  	if !archsimd.X86.AVX512() {
   366  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   367  		return
   368  	}
   369  	if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
   370  		t.Errorf("Want 0b101, got %b", v)
   371  	}
   372  }
   373  
   374  var maskForTestBitMaskFromBitsStore uint8
   375  
   376  func TestBitMaskToBitsStore(t *testing.T) {
   377  	if !archsimd.X86.AVX512() {
   378  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   379  		return
   380  	}
   381  	maskForTestBitMaskFromBitsStore = archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
   382  	if maskForTestBitMaskFromBitsStore != 0b101 {
   383  		t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
   384  	}
   385  }
   386  
   387  func TestMergeFloat(t *testing.T) {
   388  	k := make([]int64, 4, 4)
   389  	s := make([]float64, 4, 4)
   390  
   391  	a := archsimd.LoadFloat64x4Slice([]float64{1, 2, 3, 4})
   392  	b := archsimd.LoadFloat64x4Slice([]float64{4, 2, 3, 1})
   393  	g := a.Greater(b)
   394  	g.ToInt64x4().StoreSlice(k)
   395  	c := a.Merge(b, g)
   396  
   397  	c.StoreSlice(s)
   398  
   399  	checkSlices[int64](t, k, []int64{0, 0, 0, -1})
   400  	checkSlices[float64](t, s, []float64{4, 2, 3, 4})
   401  }
   402  
   403  func TestMergeFloat512(t *testing.T) {
   404  	if !archsimd.X86.AVX512() {
   405  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   406  		return
   407  	}
   408  
   409  	k := make([]int64, 8, 8)
   410  	s := make([]float64, 8, 8)
   411  
   412  	a := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   413  	b := archsimd.LoadFloat64x8Slice([]float64{8, 7, 6, 5, 4, 2, 3, 1})
   414  	g := a.Greater(b)
   415  	g.ToInt64x8().StoreSlice(k)
   416  	c := a.Merge(b, g)
   417  	d := a.Masked(g)
   418  
   419  	checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
   420  
   421  	c.StoreSlice(s)
   422  	checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
   423  
   424  	d.StoreSlice(s)
   425  	checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
   426  }
   427  
   428  var ro uint8 = 2
   429  
   430  func TestRotateAllVariable(t *testing.T) {
   431  	if !archsimd.X86.AVX512() {
   432  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   433  		return
   434  	}
   435  	got := make([]int32, 4)
   436  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).StoreSlice(got)
   437  	for _, v := range got {
   438  		if v != 0b1100 {
   439  			t.Errorf("Want 0b1100, got %b", v)
   440  		}
   441  	}
   442  }
   443  
   444  func TestBroadcastUint32x4(t *testing.T) {
   445  	s := make([]uint32, 4, 4)
   446  	archsimd.BroadcastUint32x4(123456789).StoreSlice(s)
   447  	checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
   448  }
   449  
   450  func TestBroadcastFloat32x8(t *testing.T) {
   451  	s := make([]float32, 8, 8)
   452  	archsimd.BroadcastFloat32x8(123456789).StoreSlice(s)
   453  	checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
   454  }
   455  
   456  func TestBroadcastFloat64x2(t *testing.T) {
   457  	s := make([]float64, 2, 2)
   458  	archsimd.BroadcastFloat64x2(123456789).StoreSlice(s)
   459  	checkSlices(t, s, []float64{123456789, 123456789})
   460  }
   461  
   462  func TestBroadcastUint64x2(t *testing.T) {
   463  	s := make([]uint64, 2, 2)
   464  	archsimd.BroadcastUint64x2(123456789).StoreSlice(s)
   465  	checkSlices(t, s, []uint64{123456789, 123456789})
   466  }
   467  
   468  func TestBroadcastUint16x8(t *testing.T) {
   469  	s := make([]uint16, 8, 8)
   470  	archsimd.BroadcastUint16x8(12345).StoreSlice(s)
   471  	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345})
   472  }
   473  
   474  func TestBroadcastInt8x32(t *testing.T) {
   475  	s := make([]int8, 32, 32)
   476  	archsimd.BroadcastInt8x32(-123).StoreSlice(s)
   477  	checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
   478  		-123, -123, -123, -123, -123, -123, -123, -123,
   479  		-123, -123, -123, -123, -123, -123, -123, -123,
   480  		-123, -123, -123, -123, -123, -123, -123, -123,
   481  	})
   482  }
   483  
   484  func TestMaskOpt512(t *testing.T) {
   485  	if !archsimd.X86.AVX512() {
   486  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   487  		return
   488  	}
   489  
   490  	k := make([]int64, 8, 8)
   491  	s := make([]float64, 8, 8)
   492  
   493  	a := archsimd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
   494  	b := archsimd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
   495  	c := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   496  	d := archsimd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
   497  	g := a.Greater(b)
   498  	e := c.Add(d).Masked(g)
   499  	e.StoreSlice(s)
   500  	g.ToInt64x8().StoreSlice(k)
   501  	checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
   502  	checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
   503  }
   504  
   505  // flattenedTranspose tranposes x and y, regarded as a pair of 2x2
   506  // matrices, but then flattens the rows in order, i.e
   507  // x: ABCD ==> a: A1B2
   508  // y: 1234     b: C3D4
   509  func flattenedTranspose(x, y archsimd.Int32x4) (a, b archsimd.Int32x4) {
   510  	return x.InterleaveLo(y), x.InterleaveHi(y)
   511  }
   512  
   513  func TestFlattenedTranspose(t *testing.T) {
   514  	r := make([]int32, 4, 4)
   515  	s := make([]int32, 4, 4)
   516  
   517  	x := archsimd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
   518  	y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
   519  	a, b := flattenedTranspose(x, y)
   520  
   521  	a.StoreSlice(r)
   522  	b.StoreSlice(s)
   523  
   524  	checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
   525  	checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
   526  
   527  }
   528  
   529  func TestClearAVXUpperBits(t *testing.T) {
   530  	// Test that ClearAVXUpperBits is safe even if there are SIMD values
   531  	// alive (although usually one should not do this).
   532  	if !archsimd.X86.AVX2() {
   533  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   534  		return
   535  	}
   536  
   537  	r := make([]int64, 4)
   538  	s := make([]int64, 4)
   539  
   540  	x := archsimd.LoadInt64x4Slice([]int64{10, 20, 30, 40})
   541  	y := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
   542  
   543  	x.Add(y).StoreSlice(r)
   544  	archsimd.ClearAVXUpperBits()
   545  	x.Sub(y).StoreSlice(s)
   546  
   547  	checkSlices[int64](t, r, []int64{11, 22, 33, 44})
   548  	checkSlices[int64](t, s, []int64{9, 18, 27, 36})
   549  }
   550  
   551  func TestLeadingZeros(t *testing.T) {
   552  	if !archsimd.X86.AVX512() {
   553  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   554  		return
   555  	}
   556  
   557  	src := []uint64{0b1111, 0}
   558  	want := []uint64{60, 64}
   559  	got := make([]uint64, 2)
   560  	archsimd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got)
   561  	for i := range 2 {
   562  		if want[i] != got[i] {
   563  			t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
   564  		}
   565  	}
   566  }
   567  
   568  func TestIsZero(t *testing.T) {
   569  	v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
   570  	v2 := archsimd.LoadUint64x2Slice([]uint64{0, 0})
   571  	if v1.IsZero() {
   572  		t.Errorf("Result incorrect, want false, got true")
   573  	}
   574  	if !v2.IsZero() {
   575  		t.Errorf("Result incorrect, want true, got false")
   576  	}
   577  	if !v1.And(v2).IsZero() {
   578  		t.Errorf("Result incorrect, want true, got false")
   579  	}
   580  	if v1.AndNot(v2).IsZero() {
   581  		t.Errorf("Result incorrect, want false, got true")
   582  	}
   583  	if !v2.And(v1).IsZero() {
   584  		t.Errorf("Result incorrect, want true, got false")
   585  	}
   586  	if !v2.AndNot(v1).IsZero() {
   587  		t.Errorf("Result incorrect, want true, got false")
   588  	}
   589  }
   590  
   591  func TestSelect4FromPairConst(t *testing.T) {
   592  	x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
   593  	y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
   594  
   595  	llll := x.SelectFromPair(0, 1, 2, 3, y)
   596  	hhhh := x.SelectFromPair(4, 5, 6, 7, y)
   597  	llhh := x.SelectFromPair(0, 1, 6, 7, y)
   598  	hhll := x.SelectFromPair(6, 7, 0, 1, y)
   599  
   600  	lllh := x.SelectFromPair(0, 1, 2, 7, y)
   601  	llhl := x.SelectFromPair(0, 1, 7, 2, y)
   602  	lhll := x.SelectFromPair(0, 7, 1, 2, y)
   603  	hlll := x.SelectFromPair(7, 0, 1, 2, y)
   604  
   605  	hhhl := x.SelectFromPair(4, 5, 6, 0, y)
   606  	hhlh := x.SelectFromPair(4, 5, 0, 6, y)
   607  	hlhh := x.SelectFromPair(4, 0, 5, 6, y)
   608  	lhhh := x.SelectFromPair(0, 4, 5, 6, y)
   609  
   610  	lhlh := x.SelectFromPair(0, 4, 1, 5, y)
   611  	hlhl := x.SelectFromPair(4, 0, 5, 1, y)
   612  	lhhl := x.SelectFromPair(0, 4, 5, 1, y)
   613  	hllh := x.SelectFromPair(4, 0, 1, 5, y)
   614  
   615  	r := make([]int32, 4, 4)
   616  
   617  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   618  		v.StoreSlice(r)
   619  		checkSlices[int32](t, r, []int32{a, b, c, d})
   620  	}
   621  
   622  	foo(llll, 0, 1, 2, 3)
   623  	foo(hhhh, 4, 5, 6, 7)
   624  	foo(llhh, 0, 1, 6, 7)
   625  	foo(hhll, 6, 7, 0, 1)
   626  
   627  	foo(lllh, 0, 1, 2, 7)
   628  	foo(llhl, 0, 1, 7, 2)
   629  	foo(lhll, 0, 7, 1, 2)
   630  	foo(hlll, 7, 0, 1, 2)
   631  
   632  	foo(hhhl, 4, 5, 6, 0)
   633  	foo(hhlh, 4, 5, 0, 6)
   634  	foo(hlhh, 4, 0, 5, 6)
   635  	foo(lhhh, 0, 4, 5, 6)
   636  
   637  	foo(lhlh, 0, 4, 1, 5)
   638  	foo(hlhl, 4, 0, 5, 1)
   639  	foo(lhhl, 0, 4, 5, 1)
   640  	foo(hllh, 4, 0, 1, 5)
   641  }
   642  
   643  //go:noinline
   644  func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
   645  	return x.SelectFromPair(a, b, c, d, y)
   646  }
   647  
   648  func TestSelect4FromPairVar(t *testing.T) {
   649  	x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
   650  	y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
   651  
   652  	llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
   653  	hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
   654  	llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
   655  	hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
   656  
   657  	lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
   658  	llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
   659  	lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
   660  	hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
   661  
   662  	hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
   663  	hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
   664  	hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
   665  	lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
   666  
   667  	lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
   668  	hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
   669  	lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
   670  	hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
   671  
   672  	r := make([]int32, 4, 4)
   673  
   674  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   675  		v.StoreSlice(r)
   676  		checkSlices[int32](t, r, []int32{a, b, c, d})
   677  	}
   678  
   679  	foo(llll, 0, 1, 2, 3)
   680  	foo(hhhh, 4, 5, 6, 7)
   681  	foo(llhh, 0, 1, 6, 7)
   682  	foo(hhll, 6, 7, 0, 1)
   683  
   684  	foo(lllh, 0, 1, 2, 7)
   685  	foo(llhl, 0, 1, 7, 2)
   686  	foo(lhll, 0, 7, 1, 2)
   687  	foo(hlll, 7, 0, 1, 2)
   688  
   689  	foo(hhhl, 4, 5, 6, 0)
   690  	foo(hhlh, 4, 5, 0, 6)
   691  	foo(hlhh, 4, 0, 5, 6)
   692  	foo(lhhh, 0, 4, 5, 6)
   693  
   694  	foo(lhlh, 0, 4, 1, 5)
   695  	foo(hlhl, 4, 0, 5, 1)
   696  	foo(lhhl, 0, 4, 5, 1)
   697  	foo(hllh, 4, 0, 1, 5)
   698  }
   699  
   700  func TestSelect4FromPairConstGrouped(t *testing.T) {
   701  	x := archsimd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13})
   702  	y := archsimd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17})
   703  
   704  	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
   705  	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
   706  	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
   707  	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
   708  
   709  	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
   710  	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
   711  	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
   712  	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
   713  
   714  	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
   715  	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
   716  	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
   717  	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
   718  
   719  	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
   720  	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
   721  	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
   722  	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
   723  
   724  	r := make([]float32, 8, 8)
   725  
   726  	foo := func(v archsimd.Float32x8, a, b, c, d float32) {
   727  		v.StoreSlice(r)
   728  		checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
   729  	}
   730  
   731  	foo(llll, 0, 1, 2, 3)
   732  	foo(hhhh, 4, 5, 6, 7)
   733  	foo(llhh, 0, 1, 6, 7)
   734  	foo(hhll, 6, 7, 0, 1)
   735  
   736  	foo(lllh, 0, 1, 2, 7)
   737  	foo(llhl, 0, 1, 7, 2)
   738  	foo(lhll, 0, 7, 1, 2)
   739  	foo(hlll, 7, 0, 1, 2)
   740  
   741  	foo(hhhl, 4, 5, 6, 0)
   742  	foo(hhlh, 4, 5, 0, 6)
   743  	foo(hlhh, 4, 0, 5, 6)
   744  	foo(lhhh, 0, 4, 5, 6)
   745  
   746  	foo(lhlh, 0, 4, 1, 5)
   747  	foo(hlhl, 4, 0, 5, 1)
   748  	foo(lhhl, 0, 4, 5, 1)
   749  	foo(hllh, 4, 0, 1, 5)
   750  }
   751  
   752  func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
   753  	if !archsimd.X86.AVX512() {
   754  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   755  		return
   756  	}
   757  	x := archsimd.LoadUint32x16Slice([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
   758  	y := archsimd.LoadUint32x16Slice([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
   759  
   760  	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
   761  	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
   762  	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
   763  	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
   764  
   765  	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
   766  	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
   767  	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
   768  	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
   769  
   770  	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
   771  	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
   772  	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
   773  	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
   774  
   775  	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
   776  	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
   777  	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
   778  	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
   779  
   780  	r := make([]uint32, 16, 16)
   781  
   782  	foo := func(v archsimd.Uint32x16, a, b, c, d uint32) {
   783  		v.StoreSlice(r)
   784  		checkSlices[uint32](t, r, []uint32{a, b, c, d,
   785  			10 + a, 10 + b, 10 + c, 10 + d,
   786  			20 + a, 20 + b, 20 + c, 20 + d,
   787  			30 + a, 30 + b, 30 + c, 30 + d,
   788  		})
   789  	}
   790  
   791  	foo(llll, 0, 1, 2, 3)
   792  	foo(hhhh, 4, 5, 6, 7)
   793  	foo(llhh, 0, 1, 6, 7)
   794  	foo(hhll, 6, 7, 0, 1)
   795  
   796  	foo(lllh, 0, 1, 2, 7)
   797  	foo(llhl, 0, 1, 7, 2)
   798  	foo(lhll, 0, 7, 1, 2)
   799  	foo(hlll, 7, 0, 1, 2)
   800  
   801  	foo(hhhl, 4, 5, 6, 0)
   802  	foo(hhlh, 4, 5, 0, 6)
   803  	foo(hlhh, 4, 0, 5, 6)
   804  	foo(lhhh, 0, 4, 5, 6)
   805  
   806  	foo(lhlh, 0, 4, 1, 5)
   807  	foo(hlhl, 4, 0, 5, 1)
   808  	foo(lhhl, 0, 4, 5, 1)
   809  	foo(hllh, 4, 0, 1, 5)
   810  }
   811  
   812  func TestSelect128FromPair(t *testing.T) {
   813  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   814  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   815  
   816  	aa := x.Select128FromPair(0, 0, y)
   817  	ab := x.Select128FromPair(0, 1, y)
   818  	bc := x.Select128FromPair(1, 2, y)
   819  	cd := x.Select128FromPair(2, 3, y)
   820  	da := x.Select128FromPair(3, 0, y)
   821  	dc := x.Select128FromPair(3, 2, y)
   822  
   823  	r := make([]uint64, 4, 4)
   824  
   825  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   826  		a, b = 2*a, 2*b
   827  		v.StoreSlice(r)
   828  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   829  	}
   830  
   831  	foo(aa, 0, 0)
   832  	foo(ab, 0, 1)
   833  	foo(bc, 1, 2)
   834  	foo(cd, 2, 3)
   835  	foo(da, 3, 0)
   836  	foo(dc, 3, 2)
   837  }
   838  
   839  func TestSelect128FromPairError(t *testing.T) {
   840  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   841  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   842  
   843  	defer func() {
   844  		if r := recover(); r != nil {
   845  			t.Logf("Saw expected panic %v", r)
   846  		}
   847  	}()
   848  	_ = x.Select128FromPair(0, 4, y)
   849  
   850  	t.Errorf("Should have panicked")
   851  }
   852  
   853  //go:noinline
   854  func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
   855  	return x.Select128FromPair(lo, hi, y)
   856  }
   857  
   858  func TestSelect128FromPairVar(t *testing.T) {
   859  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   860  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   861  
   862  	aa := select128FromPair(x, 0, 0, y)
   863  	ab := select128FromPair(x, 0, 1, y)
   864  	bc := select128FromPair(x, 1, 2, y)
   865  	cd := select128FromPair(x, 2, 3, y)
   866  	da := select128FromPair(x, 3, 0, y)
   867  	dc := select128FromPair(x, 3, 2, y)
   868  
   869  	r := make([]uint64, 4, 4)
   870  
   871  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   872  		a, b = 2*a, 2*b
   873  		v.StoreSlice(r)
   874  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   875  	}
   876  
   877  	foo(aa, 0, 0)
   878  	foo(ab, 0, 1)
   879  	foo(bc, 1, 2)
   880  	foo(cd, 2, 3)
   881  	foo(da, 3, 0)
   882  	foo(dc, 3, 2)
   883  }
   884  
   885  func TestSelect2FromPairConst(t *testing.T) {
   886  	x := archsimd.LoadUint64x2Slice([]uint64{0, 1})
   887  	y := archsimd.LoadUint64x2Slice([]uint64{2, 3})
   888  
   889  	ll := x.SelectFromPair(0, 1, y)
   890  	hh := x.SelectFromPair(3, 2, y)
   891  	lh := x.SelectFromPair(0, 3, y)
   892  	hl := x.SelectFromPair(2, 1, y)
   893  
   894  	r := make([]uint64, 2, 2)
   895  
   896  	foo := func(v archsimd.Uint64x2, a, b uint64) {
   897  		v.StoreSlice(r)
   898  		checkSlices[uint64](t, r, []uint64{a, b})
   899  	}
   900  
   901  	foo(ll, 0, 1)
   902  	foo(hh, 3, 2)
   903  	foo(lh, 0, 3)
   904  	foo(hl, 2, 1)
   905  }
   906  
   907  func TestSelect2FromPairConstGroupedUint(t *testing.T) {
   908  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 10, 11})
   909  	y := archsimd.LoadUint64x4Slice([]uint64{2, 3, 12, 13})
   910  
   911  	ll := x.SelectFromPairGrouped(0, 1, y)
   912  	hh := x.SelectFromPairGrouped(3, 2, y)
   913  	lh := x.SelectFromPairGrouped(0, 3, y)
   914  	hl := x.SelectFromPairGrouped(2, 1, y)
   915  
   916  	r := make([]uint64, 4, 4)
   917  
   918  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   919  		v.StoreSlice(r)
   920  		checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
   921  	}
   922  
   923  	foo(ll, 0, 1)
   924  	foo(hh, 3, 2)
   925  	foo(lh, 0, 3)
   926  	foo(hl, 2, 1)
   927  }
   928  
   929  func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
   930  	x := archsimd.LoadFloat64x4Slice([]float64{0, 1, 10, 11})
   931  	y := archsimd.LoadFloat64x4Slice([]float64{2, 3, 12, 13})
   932  
   933  	ll := x.SelectFromPairGrouped(0, 1, y)
   934  	hh := x.SelectFromPairGrouped(3, 2, y)
   935  	lh := x.SelectFromPairGrouped(0, 3, y)
   936  	hl := x.SelectFromPairGrouped(2, 1, y)
   937  
   938  	r := make([]float64, 4, 4)
   939  
   940  	foo := func(v archsimd.Float64x4, a, b float64) {
   941  		v.StoreSlice(r)
   942  		checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
   943  	}
   944  
   945  	foo(ll, 0, 1)
   946  	foo(hh, 3, 2)
   947  	foo(lh, 0, 3)
   948  	foo(hl, 2, 1)
   949  }
   950  
   951  func TestSelect2FromPairConstGroupedInt(t *testing.T) {
   952  	x := archsimd.LoadInt64x4Slice([]int64{0, 1, 10, 11})
   953  	y := archsimd.LoadInt64x4Slice([]int64{2, 3, 12, 13})
   954  
   955  	ll := x.SelectFromPairGrouped(0, 1, y)
   956  	hh := x.SelectFromPairGrouped(3, 2, y)
   957  	lh := x.SelectFromPairGrouped(0, 3, y)
   958  	hl := x.SelectFromPairGrouped(2, 1, y)
   959  
   960  	r := make([]int64, 4, 4)
   961  
   962  	foo := func(v archsimd.Int64x4, a, b int64) {
   963  		v.StoreSlice(r)
   964  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
   965  	}
   966  
   967  	foo(ll, 0, 1)
   968  	foo(hh, 3, 2)
   969  	foo(lh, 0, 3)
   970  	foo(hl, 2, 1)
   971  }
   972  
   973  func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
   974  	if !archsimd.X86.AVX512() {
   975  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   976  		return
   977  	}
   978  
   979  	x := archsimd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31})
   980  	y := archsimd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33})
   981  
   982  	ll := x.SelectFromPairGrouped(0, 1, y)
   983  	hh := x.SelectFromPairGrouped(3, 2, y)
   984  	lh := x.SelectFromPairGrouped(0, 3, y)
   985  	hl := x.SelectFromPairGrouped(2, 1, y)
   986  
   987  	r := make([]int64, 8, 8)
   988  
   989  	foo := func(v archsimd.Int64x8, a, b int64) {
   990  		v.StoreSlice(r)
   991  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
   992  	}
   993  
   994  	foo(ll, 0, 1)
   995  	foo(hh, 3, 2)
   996  	foo(lh, 0, 3)
   997  	foo(hl, 2, 1)
   998  }
   999  
  1000  func TestString(t *testing.T) {
  1001  	x := archsimd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
  1002  	y := archsimd.LoadInt64x4Slice([]int64{-4, -5, -6, -7})
  1003  	z := archsimd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
  1004  	w := archsimd.LoadFloat64x4Slice([]float64{0.5, 1.5, -2.5, 3.5e9})
  1005  
  1006  	sx := "{0,1,2,3}"
  1007  	sy := "{-4,-5,-6,-7}"
  1008  	sz := "{0.5,1.5,-2.5,3.5e+09}"
  1009  	sw := sz
  1010  
  1011  	if x.String() != sx {
  1012  		t.Errorf("x=%s wanted %s", x, sx)
  1013  	}
  1014  	if y.String() != sy {
  1015  		t.Errorf("y=%s wanted %s", y, sy)
  1016  	}
  1017  	if z.String() != sz {
  1018  		t.Errorf("z=%s wanted %s", z, sz)
  1019  	}
  1020  	if w.String() != sw {
  1021  		t.Errorf("w=%s wanted %s", w, sw)
  1022  	}
  1023  	t.Logf("w=%s", w)
  1024  	t.Logf("x=%s", x)
  1025  	t.Logf("y=%s", y)
  1026  	t.Logf("z=%s", z)
  1027  }
  1028  
  1029  // a returns an slice of 16 int32
  1030  func a() []int32 {
  1031  	return make([]int32, 16, 16)
  1032  }
  1033  
  1034  // applyTo3 returns a 16-element slice of the results of
  1035  // applying f to the respective elements of vectors x, y, and z.
  1036  func applyTo3(x, y, z archsimd.Int32x16, f func(x, y, z int32) int32) []int32 {
  1037  	ax, ay, az := a(), a(), a()
  1038  	x.StoreSlice(ax)
  1039  	y.StoreSlice(ay)
  1040  	z.StoreSlice(az)
  1041  
  1042  	r := a()
  1043  	for i := range r {
  1044  		r[i] = f(ax[i], ay[i], az[i])
  1045  	}
  1046  	return r
  1047  }
  1048  
  1049  // applyTo3 returns a 16-element slice of the results of
  1050  // applying f to the respective elements of vectors x, y, z, and w.
  1051  func applyTo4(x, y, z, w archsimd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
  1052  	ax, ay, az, aw := a(), a(), a(), a()
  1053  	x.StoreSlice(ax)
  1054  	y.StoreSlice(ay)
  1055  	z.StoreSlice(az)
  1056  	w.StoreSlice(aw)
  1057  
  1058  	r := make([]int32, len(ax), len(ax))
  1059  	for i := range r {
  1060  		r[i] = f(ax[i], ay[i], az[i], aw[i])
  1061  	}
  1062  	return r
  1063  }
  1064  
  1065  func TestSelectTernOptInt32x16(t *testing.T) {
  1066  	if !archsimd.X86.AVX512() {
  1067  		t.Skip("Test requires X86.AVX512, not available on this hardware")
  1068  		return
  1069  	}
  1070  	ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1071  	ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
  1072  	az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
  1073  	aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1074  	am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  1075  
  1076  	x := archsimd.LoadInt32x16Slice(ax)
  1077  	y := archsimd.LoadInt32x16Slice(ay)
  1078  	z := archsimd.LoadInt32x16Slice(az)
  1079  	w := archsimd.LoadInt32x16Slice(aw)
  1080  	m := archsimd.LoadInt32x16Slice(am)
  1081  
  1082  	foo := func(v archsimd.Int32x16, s []int32) {
  1083  		r := make([]int32, 16, 16)
  1084  		v.StoreSlice(r)
  1085  		checkSlices[int32](t, r, s)
  1086  	}
  1087  
  1088  	t0 := w.Xor(y).Xor(z)
  1089  	ft0 := func(w, y, z int32) int32 {
  1090  		return w ^ y ^ z
  1091  	}
  1092  	foo(t0, applyTo3(w, y, z, ft0))
  1093  
  1094  	t1 := m.And(w.Xor(y).Xor(z.Not()))
  1095  	ft1 := func(m, w, y, z int32) int32 {
  1096  		return m & (w ^ y ^ ^z)
  1097  	}
  1098  	foo(t1, applyTo4(m, w, y, z, ft1))
  1099  
  1100  	t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
  1101  	ft2 := func(x, y, z int32) int32 {
  1102  		return (x ^ y ^ z) & (x ^ y ^ ^z)
  1103  	}
  1104  	foo(t2, applyTo3(x, y, z, ft2))
  1105  }
  1106  
  1107  func TestMaskedMerge(t *testing.T) {
  1108  	x := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
  1109  	y := archsimd.LoadInt64x4Slice([]int64{5, 6, 1, 1})
  1110  	z := archsimd.LoadInt64x4Slice([]int64{-1, -2, -3, -4})
  1111  	res := make([]int64, 4)
  1112  	expected := []int64{6, 8, -3, -4}
  1113  	mask := x.Less(y)
  1114  	if archsimd.X86.AVX512() {
  1115  		x.Add(y).Merge(z, mask).StoreSlice(res)
  1116  	} else {
  1117  		x.Add(y).Merge(z, mask).StoreSlice(res)
  1118  	}
  1119  	for i := range 4 {
  1120  		if res[i] != expected[i] {
  1121  			t.Errorf("got %d wanted %d", res[i], expected[i])
  1122  		}
  1123  	}
  1124  }
  1125  
  1126  func TestDotProductQuadruple(t *testing.T) {
  1127  	if !archsimd.X86.AVXVNNI() {
  1128  		t.Skip("Test requires X86.AVXVNNI, not available on this hardware")
  1129  		return
  1130  	}
  1131  	xd := make([]int8, 16)
  1132  	yd := make([]uint8, 16)
  1133  	zd := make([]int32, 4)
  1134  	wanted1 := make([]int32, 4)
  1135  	wanted2 := make([]int32, 4)
  1136  	res1 := make([]int32, 4)
  1137  	res2 := make([]int32, 4)
  1138  	for i := range 4 {
  1139  		xd[i] = 5
  1140  		yd[i] = 6
  1141  		zd[i] = 3
  1142  		wanted1[i] = 30
  1143  		wanted2[i] = 30
  1144  	}
  1145  	x := archsimd.LoadInt8x16Slice(xd)
  1146  	y := archsimd.LoadUint8x16Slice(yd)
  1147  	z := archsimd.LoadInt32x4Slice(zd)
  1148  	x.DotProductQuadruple(y).StoreSlice(res1)
  1149  	x.DotProductQuadruple(y).Add(z).StoreSlice(res1)
  1150  	for i := range 4 {
  1151  		if res1[i] != wanted1[i] {
  1152  			t.Errorf("got %d wanted %d", res1[i], wanted1[i])
  1153  		}
  1154  		if res2[i] != wanted2[i] {
  1155  			t.Errorf("got %d wanted %d", res2[i], wanted2[i])
  1156  		}
  1157  	}
  1158  }
  1159  
  1160  func TestPermuteScalars(t *testing.T) {
  1161  	x := []int32{11, 12, 13, 14}
  1162  	want := []int32{12, 13, 14, 11}
  1163  	got := make([]int32, 4)
  1164  	archsimd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
  1165  	checkSlices(t, got, want)
  1166  }
  1167  
  1168  func TestPermuteScalarsGrouped(t *testing.T) {
  1169  	x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
  1170  	want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
  1171  	got := make([]int32, 8)
  1172  	archsimd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
  1173  	checkSlices(t, got, want)
  1174  }
  1175  
  1176  func TestPermuteScalarsHi(t *testing.T) {
  1177  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
  1178  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
  1179  	got := make([]int16, len(x))
  1180  	archsimd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
  1181  	checkSlices(t, got, want)
  1182  }
  1183  
  1184  func TestPermuteScalarsLo(t *testing.T) {
  1185  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
  1186  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
  1187  	got := make([]int16, len(x))
  1188  	archsimd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
  1189  	checkSlices(t, got, want)
  1190  }
  1191  
  1192  func TestPermuteScalarsHiGrouped(t *testing.T) {
  1193  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
  1194  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
  1195  	got := make([]int16, len(x))
  1196  	archsimd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
  1197  	checkSlices(t, got, want)
  1198  }
  1199  
  1200  func TestPermuteScalarsLoGrouped(t *testing.T) {
  1201  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
  1202  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
  1203  	got := make([]int16, len(x))
  1204  	archsimd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
  1205  	checkSlices(t, got, want)
  1206  }
  1207  
  1208  func TestClMul(t *testing.T) {
  1209  	var x = archsimd.LoadUint64x2Slice([]uint64{1, 5})
  1210  	var y = archsimd.LoadUint64x2Slice([]uint64{3, 9})
  1211  
  1212  	foo := func(v archsimd.Uint64x2, s []uint64) {
  1213  		r := make([]uint64, 2, 2)
  1214  		v.StoreSlice(r)
  1215  		checkSlices[uint64](t, r, s)
  1216  	}
  1217  
  1218  	foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
  1219  	foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
  1220  	foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
  1221  	foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
  1222  	foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
  1223  
  1224  }
  1225  

View as plain text