Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Replace the x86 intrinsic headers with SIMD Everywhere (SIMDe) and build the SIMD code paths unconditionally
Forwarded: https://github.com/TimoLassmann/kalign/pull/20
--- a/src/alignment.c
+++ b/src/alignment.c
@@ -20,7 +20,8 @@
 
 */
 
-#include <xmmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
 #include "alignment.h"
 
 
--- a/src/bisectingKmeans.c
+++ b/src/bisectingKmeans.c
@@ -24,13 +24,16 @@
 #include <omp.h>
 #endif
 
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
-#include <mm_malloc.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
+#if !defined(SIMDE_X86_SSE_NATIVE) /* no native SSE headers: emulate _mm_malloc/_mm_free via C11 */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc(align, size)
+  #define _mm_free free
+#else
+  #include <mm_malloc.h>
 #endif
 
-
-
 #include "tlrng.h"
 #include "msa.h"
 
@@ -506,13 +509,8 @@ int split(float** dm,int* samples, int n
                         score = 0.0f;
                         for(i = 0; i < num_samples;i++){
                                 s = samples[i];
-#ifdef HAVE_AVX2
                                 edist_256(dm[s], cl, num_anchors, &dl);
                                 edist_256(dm[s], cr, num_anchors, &dr);
-#else
-                                edist_serial(dm[s], cl, num_anchors, &dl);
-                                edist_serial(dm[s], cr, num_anchors, &dr);
-#endif
                                 score += MACRO_MIN(dl,dr);
 
                                 if(dr < dl){
--- a/src/bpm.c
+++ b/src/bpm.c
@@ -25,10 +25,8 @@
 
 #include "tlrng.h"
 
-
-
-#ifdef HAVE_AVX2
-#include <immintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx2.h>
 
 __m256i BROADCAST_MASK[16];
 
@@ -37,7 +35,6 @@ __m256i bitShiftRight256ymm (__m256i *da
 
 /* taken from Alexander Yee: http://www.numberworld.org/y-cruncher/internals/addition.html#ks_add */
  __m256i add256(uint32_t carry, __m256i A, __m256i B);
-#endif
 
 /* Below are test functions  */
 #ifdef BPM_UTEST
@@ -51,11 +48,9 @@ uint8_t dyn_256(const uint8_t* t,const u
 uint8_t dyn_256_print(const uint8_t* t,const uint8_t* p,int n,int m);
 int  mutate_seq(uint8_t* s, int len,int k,int L, struct rng_state* rng);
 
-#ifdef HAVE_AVX2
 /* For debugging */
 void print_256(__m256i X);
 void print_256_all(__m256i X);
-#endif
 
 /* The actual test.  */
 int bpm_test(void);
@@ -64,9 +59,7 @@ int main(void)
 {
 
         /* Important set_broadcast_mask has to be called before using bpm_256!!! */
-#ifdef HAVE_AVX2
         set_broadcast_mask();
-#endif
         RUN(bpm_test());
         return EXIT_SUCCESS;
 ERROR:
@@ -149,11 +142,7 @@ int bpm_test(void)
                 for (j =0 ; j < test_iter; j++){
                         RUN(mutate_seq(b,len,i,alphabet->L,rng));
                         dyn_score = dyn_256(a,b,len,len);
-#ifdef HAVE_AVX2
                         bpm_score = bpm_256(a,b,len,len);
-#else
-                        bpm_score = dyn_score;
-#endif
                         if( abs( dyn_score - bpm_score) != 0){
                                 fprintf(stdout,"Scores differ: %d (dyn) %d (bpm) (%d out of %d)\n", dyn_score,bpm_score, calc_errors , total_calc);
                                 calc_errors++;
@@ -200,13 +189,9 @@ int bpm_test(void)
         for(i = 0; i < 100;i+=10){
                 RUN(mutate_seq(b,len,i,alphabet->L,rng));
 
-#ifdef HAVE_AVX2
                 for(j = 0; j < timing_iter;j++){
                         bpm_score = bpm_256(a,b,len,len);
                 }
-#else
-                bpm_score = dyn_score;
-#endif
 
 
                 //ASSERT(dyn_score == bpm_score, "Scores differ: %d %d.",dyn_score, bpm_score);
@@ -373,7 +358,6 @@ ERROR:
 
 }
 
-#ifdef HAVE_AVX2
 void print_256(__m256i X)
 {
         alignas(32) uint64_t debug[4];
@@ -394,7 +378,6 @@ void print_256_all(__m256i X)
 }
 
 #endif
-#endif
 
 
 
@@ -462,7 +445,6 @@ uint8_t bpm(const uint8_t* t,const uint8
 }
 
 
-#ifdef HAVE_AVX2
 uint8_t bpm_256(const uint8_t* t,const uint8_t* p,int n,int m)
 {
         __m256i VP,VN,D0,HN,HP,X,NOTONE;
@@ -658,4 +640,3 @@ __m256i bitShiftRight256ymm (__m256i *da
         carryOut   = _mm256_xor_si256 (innerCarry, rotate);                        //FIXME: not sure if this is correct!!!
         return carryOut;
 }
-#endif
--- a/src/euclidean_dist.c
+++ b/src/euclidean_dist.c
@@ -22,23 +22,24 @@
 
 #include "euclidean_dist.h"
 #include "tlrng.h"
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
-#include <immintrin.h>
-#include <mm_malloc.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx.h>
+#if !defined(SIMDE_X86_SSE_NATIVE) /* no native SSE headers: emulate _mm_malloc/_mm_free via C11 */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc(align, size)
+  #define _mm_free free
+#else
+  #include <mm_malloc.h>
 #endif
 
 
-
 #include "float.h"
 
 #include "esl_stopwatch.h"
 /* These functions were taken from:  */
 /* https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 */
-#ifdef HAVE_AVX2
 float hsum256_ps_avx(__m256 v);
 float hsum_ps_sse3(__m128 v);
-#endif
 
 #ifdef ITEST_EDIST
 int main(void)
@@ -74,7 +75,6 @@ int main(void)
                 }
         }
         LOG_MSG("Check for correctness.");
-#ifdef HAVE_AVX2
         for(i = 0; i < 100;i++){
                 for(j = 0; j <= i;j++){
                         edist_serial(mat[i], mat[j], num_element, &d1);
@@ -85,7 +85,6 @@ int main(void)
                         }
                 }
         }
-#endif
         DECLARE_TIMER(t);
 
         LOG_MSG("Timing serial");
@@ -101,7 +100,6 @@ int main(void)
         GET_TIMING(t);
         //LOG_MSG("%f\tsec.",GET_TIMING(t));
 
-#ifdef HAVE_AVX2
         LOG_MSG("Timing AVX");
         START_TIMER(t);
         for(c = 0; c < max_iter; c++){
@@ -117,7 +115,6 @@ int main(void)
         GET_TIMING(t);
         //LOG_MSG("%f\tsec.",GET_TIMING(t));
 
-#endif
         for(i = 0; i < 100;i++){
 #ifdef HAVE_AVX2
                 _mm_free(mat[i]);
@@ -169,7 +166,6 @@ int edist_serial_d(const double* a,const
         return OK;
 }
 
-#ifdef HAVE_AVX2
 
 int edist_256(const float* a,const float* b, const int len, float* ret)
 {
@@ -218,4 +214,3 @@ float hsum_ps_sse3(__m128 v)
         return        _mm_cvtss_f32(sums);
 }
 
-#endif
--- a/src/misc.c
+++ b/src/misc.c
@@ -23,9 +23,8 @@
 #include "tldevel.h"
 #include "tlrng.h"
 
-#ifdef HAVE_AVX2
-#include <immintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx.h>
 
 #include "misc.h"
 #include  <stdalign.h>
--- a/src/sequence_distance.c
+++ b/src/sequence_distance.c
@@ -22,9 +22,14 @@
 */
 
 #include "tldevel.h"
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
-#include <mm_malloc.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
+#if !defined(SIMDE_X86_SSE_NATIVE) /* no native SSE headers: emulate _mm_malloc/_mm_free via C11 */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc(align, size)
+  #define _mm_free free
+#else
+  #include <mm_malloc.h>
 #endif
 
 
@@ -66,9 +71,7 @@ float** d_estimation(struct msa* msa, in
         int len_b;
 
         int i,j;
-#if HAVE_AVX2
         set_broadcast_mask();
-#endif
 
         if(pair){
 
