@@ -104,7 +104,7 @@ public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai,
104104 }
105105
106106 // not in use: vector api implementation slower than scalar loop version
107- public static double rowMaxsVectMult_vector_api (double [] a , double [] b , int [] aix , int ai , int bi , int len ) {
107+ public static double rowMaxsVectMultVectorAPI (double [] a , double [] b , int [] aix , int ai , int bi , int len ) {
108108 double scalarMax = Double .NEGATIVE_INFINITY ;
109109
110110 int i = 0 ;
@@ -366,33 +366,33 @@ public static double vectSum(double[] a, int ai, int len) {
366366 for ( int i = ai +bn ; i < ai +len ; i +=8 ) {
367367 //read 64B cacheline of a, compute cval' = sum(a) + cval
368368 val += a [ i +0 ] + a [ i +1 ] + a [ i +2 ] + a [ i +3 ]
369- + a [ i +4 ] + a [ i +5 ] + a [ i +6 ] + a [ i +7 ];
369+ + a [ i +4 ] + a [ i +5 ] + a [ i +6 ] + a [ i +7 ];
370370 }
371371
372372 //scalar result
373373 return val ;
374374 }
375375 // not in use: vector api implementation slower than scalar loop version
376- public static double vectSum_vector_api (double [] a , int ai , int len ) {
377- double sum = 0d ;
378- int i = 0 ;
376+ public static double vectSumVectorAPI (double [] a , int ai , int len ) {
377+ double sum = 0d ;
378+ int i = 0 ;
379379
380- DoubleVector acc = DoubleVector .zero (SPECIES );
381- int upperBound = SPECIES .loopBound (len );
380+ DoubleVector acc = DoubleVector .zero (SPECIES );
381+ int upperBound = SPECIES .loopBound (len );
382382
383383 //unrolled vLen-block (for better instruction-level parallelism)
384- for (; i < upperBound ; i += SPECIES .length ()) {
385- DoubleVector v = DoubleVector .fromArray (SPECIES , a , ai + i );
386- acc = acc .add (v );
387- }
388- sum += acc .reduceLanes (VectorOperators .ADD );
389-
390- //rest, not aligned to vLen-blocks
391- for (; i < len ; i ++) {
392- sum += a [ai + i ];
393- }
394- return sum ;
395- }
384+ for (; i < upperBound ; i += SPECIES .length ()) {
385+ DoubleVector v = DoubleVector .fromArray (SPECIES , a , ai + i );
386+ acc = acc .add (v );
387+ }
388+ sum += acc .reduceLanes (VectorOperators .ADD );
389+
390+ //rest, not aligned to vLen-blocks
391+ for (; i < len ; i ++) {
392+ sum += a [ai + i ];
393+ }
394+ return sum ;
395+ }
396396
397397 public static double vectSum (double [] avals , int [] aix , int ai , int alen , int len ) {
398398 //forward to dense as column indexes not required here
@@ -544,7 +544,7 @@ public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, in
544544 }
545545
546546 // not in use: vector api implementation slower than scalar loop version
547- public static void vectDivAdd_vector_api (double [] a , double bval , double [] c , int [] aix , int ai , int ci , int alen , int len ) {
547+ public static void vectDivAddVectorAPI (double [] a , double bval , double [] c , int [] aix , int ai , int ci , int alen , int len ) {
548548
549549 final double inv = 1.0 / bval ;
550550 int i = 0 ;
@@ -576,7 +576,7 @@ public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, in
576576 }
577577
578578 // not in use: vector api implementation slower than scalar loop version
579- public static void vectDivAdd_vector_api (double bval , double [] a , double [] c , int [] aix , int ai , int ci , int alen , int len ) {
579+ public static void vectDivAddVectorAPI (double bval , double [] a , double [] c , int [] aix , int ai , int ci , int alen , int len ) {
580580 int i = 0 ;
581581 int upperBound = SPECIES .loopBound (alen );
582582 DoubleVector vb = DoubleVector .broadcast (SPECIES , bval );
@@ -607,7 +607,7 @@ public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
607607 }
608608
609609 // not in use: vector api implementation slower than scalar loop version
610- public static double [] vectDivWrite_vector_api (double [] a , double bval , int ai , int len ) {
610+ public static double [] vectDivWriteVectorAPI (double [] a , double bval , int ai , int len ) {
611611 double [] c = allocVector (len , false );
612612 final double inv = 1.0 / bval ;
613613 final DoubleVector vinv = DoubleVector .broadcast (SPECIES , inv );
@@ -636,7 +636,7 @@ public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
636636 }
637637
638638 // not in use: vector api implementation slower than scalar loop version
639- public static double [] vectDivWrite_vector_api (double bval , double [] a , int ai , int len ) {
639+ public static double [] vectDivWriteVectorAPI (double bval , double [] a , int ai , int len ) {
640640 double [] c = allocVector (len , false );
641641 final DoubleVector vb = DoubleVector .broadcast (SPECIES , bval );
642642 int i = 0 ;
@@ -663,7 +663,7 @@ public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int
663663 }
664664
665665 // not in use: vector api implementation slower than scalar loop version
666- public static double [] vectDivWrite_vector_api (double [] a , double [] b , int ai , int bi , int len ) {
666+ public static double [] vectDivWriteVectorAPI (double [] a , double [] b , int ai , int bi , int len ) {
667667 double [] c = allocVector (len , false );
668668 int i = 0 ;
669669 int upper = SPECIES .loopBound (len );
@@ -1163,7 +1163,7 @@ public static double[] vectExpWrite(double[] a, int ai, int len) {
11631163
11641164 public static double [] vectExpWrite (double [] a , int [] aix , int ai , int alen , int len ) {
11651165 double [] c = allocVector (len , true , 1 ); //exp(0)=1
1166- for ( int j = ai ; j < ai +alen ; j ++ ) //overwrite
1166+ for ( int j = ai ; j < ai +alen ; j ++ ) //overwrite
11671167 c [aix [j ]] = FastMath .exp (a [j ]);
11681168 return c ;
11691169 }
@@ -1925,28 +1925,28 @@ public static double[] vectEqualWrite(double bval, double[] a, int ai, int len)
19251925
19261926
19271927 public static double [] vectEqualWrite (double [] a , double [] b , int ai , int bi , int len ) {
1928- double [] c = allocVector (len , false );
1929- final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
1930- final DoubleVector zeros = DoubleVector .zero (SPECIES );
1931- int i = 0 ;
1932- int upper = SPECIES .loopBound (len );
1933-
1934- //unrolled vLen-block (for better instruction-level parallelism)
1935- for (; i < upper ; i += vLen ) {
1936- DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
1937- DoubleVector bVec = DoubleVector .fromArray (SPECIES , b , bi + i );
1938- VectorMask <Double > eq = aVec .compare (VectorOperators .EQ , bVec );
1939- DoubleVector out = zeros .blend (ones , eq );
1940-
1941- out .intoArray (c , i );
1942- }
1943-
1944- //rest, not aligned to vLen-blocks
1945- for (; i < len ; i ++) {
1946- c [i ] = (a [ai + i ] == b [bi + i ]) ? 1.0 : 0.0 ;
1947- }
1948- return c ;
1949- }
1928+ double [] c = allocVector (len , false );
1929+ final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
1930+ final DoubleVector zeros = DoubleVector .zero (SPECIES );
1931+ int i = 0 ;
1932+ int upper = SPECIES .loopBound (len );
1933+
1934+ //unrolled vLen-block (for better instruction-level parallelism)
1935+ for (; i < upper ; i += vLen ) {
1936+ DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
1937+ DoubleVector bVec = DoubleVector .fromArray (SPECIES , b , bi + i );
1938+ VectorMask <Double > eq = aVec .compare (VectorOperators .EQ , bVec );
1939+ DoubleVector out = zeros .blend (ones , eq );
1940+
1941+ out .intoArray (c , i );
1942+ }
1943+
1944+ //rest, not aligned to vLen-blocks
1945+ for (; i < len ; i ++) {
1946+ c [i ] = (a [ai + i ] == b [bi + i ]) ? 1.0 : 0.0 ;
1947+ }
1948+ return c ;
1949+ }
19501950
19511951 public static double [] vectEqualWrite (double [] a , double bval , int [] aix , int ai , int alen , int len ) {
19521952 double init = (bval == 0 ) ? 1 : 0 ;
@@ -2019,29 +2019,29 @@ public static void vectNotequalAdd(double bval, double[] a, double[] c, int[] ai
20192019 }
20202020
20212021 public static double [] vectNotequalWrite (double [] a , double bval , int ai , int len ) {
2022- double [] c = allocVector (len , false );
2023- final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2024- final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2025- final DoubleVector zeros = DoubleVector .zero (SPECIES );
2022+ double [] c = allocVector (len , false );
2023+ final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2024+ final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2025+ final DoubleVector zeros = DoubleVector .zero (SPECIES );
20262026
2027- int i = 0 ;
2028- int upper = SPECIES .loopBound (len );
2027+ int i = 0 ;
2028+ int upper = SPECIES .loopBound (len );
20292029
20302030 //unrolled vLen-block (for better instruction-level parallelism)
2031- for (; i < upper ; i += vLen ) {
2032- DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
2033- VectorMask <Double > ne = aVec .compare (VectorOperators .NE , bVec );
2034- DoubleVector out = zeros .blend (ones , ne );
2031+ for (; i < upper ; i += vLen ) {
2032+ DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
2033+ VectorMask <Double > ne = aVec .compare (VectorOperators .NE , bVec );
2034+ DoubleVector out = zeros .blend (ones , ne );
20352035
2036- out .intoArray (c , i );
2037- }
2036+ out .intoArray (c , i );
2037+ }
20382038
20392039 //rest, not aligned to vLen-blocks
2040- for (; i < len ; i ++) {
2041- c [i ] = (a [ai + i ] != bval ) ? 1.0 : 0.0 ;
2042- }
2043- return c ;
2044- }
2040+ for (; i < len ; i ++) {
2041+ c [i ] = (a [ai + i ] != bval ) ? 1.0 : 0.0 ;
2042+ }
2043+ return c ;
2044+ }
20452045
20462046 public static double [] vectNotequalWrite (double bval , double [] a , int ai , int len ) {
20472047 return vectNotequalWrite (a , bval , ai , len );
@@ -2055,7 +2055,7 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi,
20552055 }
20562056
20572057 // not in use: vector api implementation slower than scalar loop version
2058- public static double [] vectNotequalWrite_vector_api (double [] a , double [] b , int ai , int bi , int len ) {
2058+ public static double [] vectNotequalWriteVectorAPI (double [] a , double [] b , int ai , int bi , int len ) {
20592059 double [] c = allocVector (len , false );
20602060 final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
20612061 final DoubleVector zeros = DoubleVector .zero (SPECIES );
@@ -2151,31 +2151,31 @@ public static void vectLessAdd(double bval, double[] a, double[] c, int[] aix, i
21512151 }
21522152
21532153 public static double [] vectLessWrite (double [] a , double bval , int ai , int len ) {
2154- double [] c = allocVector (len , false );
2155- final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2156- final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2157- final DoubleVector zeros = DoubleVector .zero (SPECIES );
2154+ double [] c = allocVector (len , false );
2155+ final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2156+ final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2157+ final DoubleVector zeros = DoubleVector .zero (SPECIES );
21582158
2159- int i = 0 ;
2160- int upper = SPECIES .loopBound (len );
2159+ int i = 0 ;
2160+ int upper = SPECIES .loopBound (len );
21612161
21622162 //unrolled vLen-block (for better instruction-level parallelism)
2163- for (; i < upper ; i += vLen ) {
2164- DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
2163+ for (; i < upper ; i += vLen ) {
2164+ DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
21652165
2166- VectorMask <Double > lt = aVec .compare (VectorOperators .LT , bVec );
2167- DoubleVector out = zeros .blend (ones , lt );
2166+ VectorMask <Double > lt = aVec .compare (VectorOperators .LT , bVec );
2167+ DoubleVector out = zeros .blend (ones , lt );
21682168
2169- out .intoArray (c , i );
2170- }
2169+ out .intoArray (c , i );
2170+ }
21712171
21722172 //rest, not aligned to vLen-blocks
2173- for (; i < len ; i ++) {
2174- c [i ] = (a [ai + i ] < bval ) ? 1.0 : 0.0 ;
2175- }
2173+ for (; i < len ; i ++) {
2174+ c [i ] = (a [ai + i ] < bval ) ? 1.0 : 0.0 ;
2175+ }
21762176
2177- return c ;
2178- }
2177+ return c ;
2178+ }
21792179
21802180
21812181 public static double [] vectLessWrite (double bval , double [] a , int ai , int len ) {
@@ -2281,31 +2281,31 @@ public static void vectLessequalAdd(double bval, double[] a, double[] c, int[] a
22812281 }
22822282
22832283 public static double [] vectLessequalWrite (double [] a , double bval , int ai , int len ) {
2284- double [] c = allocVector (len , false );
2285- final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2286- final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2287- final DoubleVector zeros = DoubleVector .zero (SPECIES );
2284+ double [] c = allocVector (len , false );
2285+ final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2286+ final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2287+ final DoubleVector zeros = DoubleVector .zero (SPECIES );
22882288
2289- int i = 0 ;
2290- int upper = SPECIES .loopBound (len );
2289+ int i = 0 ;
2290+ int upper = SPECIES .loopBound (len );
22912291
22922292 //unrolled vLen-block (for better instruction-level parallelism)
2293- for (; i < upper ; i += vLen ) {
2294- DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
2293+ for (; i < upper ; i += vLen ) {
2294+ DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
22952295
2296- VectorMask <Double > le = aVec .compare (VectorOperators .LE , bVec );
2297- DoubleVector out = zeros .blend (ones , le );
2296+ VectorMask <Double > le = aVec .compare (VectorOperators .LE , bVec );
2297+ DoubleVector out = zeros .blend (ones , le );
22982298
2299- out .intoArray (c , i );
2300- }
2299+ out .intoArray (c , i );
2300+ }
23012301
23022302 //rest, not aligned to vLen-blocks
2303- for (; i < len ; i ++) {
2304- c [i ] = (a [ai + i ] <= bval ) ? 1.0 : 0.0 ;
2305- }
2303+ for (; i < len ; i ++) {
2304+ c [i ] = (a [ai + i ] <= bval ) ? 1.0 : 0.0 ;
2305+ }
23062306
2307- return c ;
2308- }
2307+ return c ;
2308+ }
23092309
23102310 public static double [] vectLessequalWrite (double bval , double [] a , int ai , int len ) {
23112311 return vectGreaterWrite (a , bval , ai , len );
@@ -2410,30 +2410,30 @@ public static void vectGreaterAdd(double bval, double[] a, double[] c, int[] aix
24102410 }
24112411
24122412 public static double [] vectGreaterWrite (double [] a , double bval , int ai , int len ) {
2413- double [] c = allocVector (len , false );
2414- final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2415- final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2416- final DoubleVector zeros = DoubleVector .zero (SPECIES );
2413+ double [] c = allocVector (len , false );
2414+ final DoubleVector bVec = DoubleVector .broadcast (SPECIES , bval );
2415+ final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
2416+ final DoubleVector zeros = DoubleVector .zero (SPECIES );
24172417
2418- int i = 0 ;
2419- int upper = SPECIES .loopBound (len );
2418+ int i = 0 ;
2419+ int upper = SPECIES .loopBound (len );
24202420
24212421 //unrolled vLen-block (for better instruction-level parallelism)
2422- for (; i < upper ; i += vLen ) {
2423- DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
2422+ for (; i < upper ; i += vLen ) {
2423+ DoubleVector aVec = DoubleVector .fromArray (SPECIES , a , ai + i );
24242424
2425- VectorMask <Double > gt = aVec .compare (VectorOperators .GT , bVec );
2426- DoubleVector out = zeros .blend (ones , gt );
2425+ VectorMask <Double > gt = aVec .compare (VectorOperators .GT , bVec );
2426+ DoubleVector out = zeros .blend (ones , gt );
24272427
2428- out .intoArray (c , i );
2429- }
2428+ out .intoArray (c , i );
2429+ }
24302430
24312431 //rest, not aligned to vLen-blocks
2432- for (; i < len ; i ++) {
2433- c [i ] = (a [ai + i ] > bval ) ? 1.0 : 0.0 ;
2432+ for (; i < len ; i ++) {
2433+ c [i ] = (a [ai + i ] > bval ) ? 1.0 : 0.0 ;
24342434 }
2435- return c ;
2436- }
2435+ return c ;
2436+ }
24372437
24382438 public static double [] vectGreaterWrite (double bval , double [] a , int ai , int len ) {
24392439 return vectLessWrite (a , bval , ai , len );
@@ -2447,7 +2447,7 @@ public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi,
24472447 }
24482448
24492449 // not in use: vector api implementation slower than scalar loop version
2450- public static double [] vectGreaterWrite_vector_api (double [] a , double [] b , int ai , int bi , int len ) {
2450+ public static double [] vectGreaterWriteVectorAPI (double [] a , double [] b , int ai , int bi , int len ) {
24512451 double [] c = allocVector (len , false );
24522452 final DoubleVector ones = DoubleVector .broadcast (SPECIES , 1.0 );
24532453 final DoubleVector zeros = DoubleVector .zero (SPECIES );
0 commit comments