c++ - 6-element double precision vector-matrix-vector multiply in AVX
I need to carry out the following operation in double precision:

    y^T * [QK] * [x]

where [x] and [y] are 6-element vectors and [QK] is a 6x6 matrix; the numbers in the figure represent how the values are stored in memory. I want to implement this in AVX. Would it be best if I padded the columns of [QK] to 8 elements first, and then carried out the matrix-vector multiplication of [x] and [QK], followed by a dot product?
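For the double case I was picturing something like the sketch below (untested, the function name is my own): each padded column of [QK] occupies two __m256d registers, the two padding rows of every column and y[6], y[7] are assumed to be zero, and all pointers are assumed to be 32-byte aligned.

    #include <immintrin.h>

    // y^T * (QK * x) with QK stored column-major, 8 doubles per column (rows 6,7 padded with zeros)
    double ytqkx_pd(const double *qk, const double *x, const double *y)
    {
        __m256d acc_lo = _mm256_setzero_pd();   // rows 0..3 of qk*x
        __m256d acc_hi = _mm256_setzero_pd();   // rows 4..7 (6,7 are padding)
        for (int c = 0; c < 6; c++) {
            __m256d xc = _mm256_broadcast_sd(x + c);
            acc_lo = _mm256_add_pd(acc_lo, _mm256_mul_pd(_mm256_load_pd(qk + 8*c    ), xc));
            acc_hi = _mm256_add_pd(acc_hi, _mm256_mul_pd(_mm256_load_pd(qk + 8*c + 4), xc));
        }
        // dot product with y (the padding lanes contribute zero)
        __m256d sum = _mm256_add_pd(_mm256_mul_pd(acc_lo, _mm256_load_pd(y)),
                                    _mm256_mul_pd(acc_hi, _mm256_load_pd(y + 4)));
        // horizontal sum of the remaining 4 doubles
        __m128d lo = _mm_add_pd(_mm256_castpd256_pd128(sum), _mm256_extractf128_pd(sum, 1));
        return _mm_cvtsd_f64(_mm_add_sd(lo, _mm_unpackhi_pd(lo, lo)));
    }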
EDIT: OK, I decided to implement a 32-bit float version with padded vectors, as shown below:
    // Perform the matrix-vector multiplication qk*x
    // (qk, x, y and ans are 32-byte aligned float*; ymm0..ymm7 are __m256 variables declared elsewhere)

    // load the first 4 columns of qk into 4 ymm registers
    ymm0 = _mm256_load_ps((float *)(qk));
    ymm1 = _mm256_load_ps((float *)(qk+8));
    ymm2 = _mm256_load_ps((float *)(qk+16));
    ymm3 = _mm256_load_ps((float *)(qk+24));

    // load the first 4 values of x
    ymm4 = _mm256_broadcast_ss((float *)(x));
    ymm5 = _mm256_broadcast_ss((float *)(x+1));
    ymm6 = _mm256_broadcast_ss((float *)(x+2));
    ymm7 = _mm256_broadcast_ss((float *)(x+3));

    // multiply in place - frees ymm4, ymm5, ymm6, ymm7
    ymm0 = _mm256_mul_ps(ymm0, ymm4);
    ymm1 = _mm256_mul_ps(ymm1, ymm5);
    ymm2 = _mm256_mul_ps(ymm2, ymm6);
    ymm3 = _mm256_mul_ps(ymm3, ymm7);

    // add together - frees ymm1, ymm2 and ymm3
    ymm0 = _mm256_add_ps(ymm0, ymm1);
    ymm2 = _mm256_add_ps(ymm2, ymm3);
    ymm0 = _mm256_add_ps(ymm0, ymm2);

    // load the last two columns of qk
    ymm1 = _mm256_load_ps((float *)(qk+32));
    ymm2 = _mm256_load_ps((float *)(qk+40));

    // load the last 2 values of x
    ymm6 = _mm256_broadcast_ss((float *)(x+4));
    ymm7 = _mm256_broadcast_ss((float *)(x+5));

    // multiply in place
    ymm1 = _mm256_mul_ps(ymm1, ymm6);
    ymm2 = _mm256_mul_ps(ymm2, ymm7);

    // add together - frees every register except ymm0
    ymm0 = _mm256_add_ps(ymm0, ymm1);
    ymm0 = _mm256_add_ps(ymm0, ymm2);

    // the result qk*x is now stored in ymm0

    // calculate the dot product y*(qk*x)
    // load y
    ymm1 = _mm256_load_ps((float *)(y));

    // dot product: elementwise multiply followed by a horizontal sum
    // multiply in place
    ymm0 = _mm256_mul_ps(ymm0, ymm1);

    // horizontal sum
    __m128 xmm1 = _mm256_extractf128_ps(ymm0, 1);
    __m128 xmm2 = _mm256_extractf128_ps(ymm0, 0);
    xmm2 = _mm_add_ps(xmm1, xmm2);
    xmm1 = _mm_movehl_ps(xmm1, xmm2);
    xmm2 = _mm_add_ps(xmm1, xmm2);
    xmm1 = _mm_shuffle_ps(xmm2, xmm2, 1);
    xmm2 = _mm_add_ss(xmm1, xmm2);
    ans[0] = _mm_cvtss_f32(xmm2);
As it stands, this runs about 3 times faster than:
    ans[0] = (qk[0]*x[0]+qk[8]*x[1]+qk[16]*x[2]+qk[24]*x[3]+qk[32]*x[4]+qk[40]*x[5])*y[0]
           + (qk[1]*x[0]+qk[9]*x[1]+qk[17]*x[2]+qk[25]*x[3]+qk[33]*x[4]+qk[41]*x[5])*y[1]
           + (qk[2]*x[0]+qk[10]*x[1]+qk[18]*x[2]+qk[26]*x[3]+qk[34]*x[4]+qk[42]*x[5])*y[2]
           + (qk[3]*x[0]+qk[11]*x[1]+qk[19]*x[2]+qk[27]*x[3]+qk[35]*x[4]+qk[43]*x[5])*y[3]
           + (qk[4]*x[0]+qk[12]*x[1]+qk[20]*x[2]+qk[28]*x[3]+qk[36]*x[4]+qk[44]*x[5])*y[4]
           + (qk[5]*x[0]+qk[13]*x[1]+qk[21]*x[2]+qk[29]*x[3]+qk[37]*x[4]+qk[45]*x[5])*y[5];
For 500 million iterations, the standard C version runs in about 9 seconds and the single-precision AVX version runs in about 3.5 seconds. If I comment out the horizontal sum at the end, it runs in 0.5 seconds. The horizontal sum kills the performance...
I created code to do this efficiently. I get a 4x speedup (the best you can expect for doubles with AVX) using AVX on a single thread. Here are the results running on 2000, 32000, and 4000000 x and y six-component vectors with one 6x6 matrix. These sizes correspond roughly to the L2, L3, and >>L3 cache sizes on my system (each vector uses 48 bytes).
EDIT: I added text (and code) at the end for float. The speedup is 8x with AVX on a single thread.
i5-3317U (2-core Ivy Bridge), compiled with: g++ m6.cpp -o m6 -O3 -mavx -fopenmp

    nvec 2000,    repeat 10000, 1 thread : time scalar/simd 3.95
    nvec 32000,   repeat 1000,  1 thread : time scalar/simd 3.53
    nvec 4000000, repeat 10,    1 thread : time scalar/simd 3.28
    (1 thread for both the SIMD and non-SIMD functions)

    nvec 2000,    repeat 10000, 2 threads: time scalar/simd 5.96
    nvec 32000,   repeat 1000,  2 threads: time scalar/simd 5.88
    nvec 4000000, repeat 10,    2 threads: time scalar/simd 4.52
    (2 threads on the SIMD function vs. 1 thread on the non-SIMD function)

Compiled with g++ m6.cpp -o m6 -O3 -msse4.2 -fopenmp:

    nvec 2000,    repeat 10000, 1 thread: time scalar/simd 2.15
    nvec 32000,   repeat 1000,  1 thread: time scalar/simd 2.06
    nvec 4000000, repeat 10,    1 thread: time scalar/simd 2.00
The basic algorithm does SIMD over the x and y vectors, not over the 6x6 matrix. One key point is that the x and y vectors have to be stored as arrays of blocks of struct of arrays. This is called an array of struct of arrays (AoSoA) or hybrid struct of arrays. You can read more about it in "Extending a C-like Language for Portable SIMD Programming": http://www.sci.utah.edu/~wald/publications/2012///ppopp/download//vecimp.pdf
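To make the layout concrete, here is a small illustration of my own (the indexing matches the aos2soa routine in the code below) of where component i of vector v ends up for the double version with blocks of 4:

    inline int aosoa_index(int v, int i) {
        // blocks of 4 vectors; inside a block the data is component-major,
        // so a single Vec4d load fetches component i of four consecutive vectors.
        // The plain AoS index of the same value would be 6*v + i.
        return 24*(v/4) + 4*i + (v%4);
    }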
Below is the code. The function aos2aosoa converts arrays of 6-component vectors into arrays of SoA blocks. Your application should generate the vectors in this form from the start (rather than doing the conversion) if you want the full benefit of SIMD. The code uses Agner Fog's vectorclass http://www.agner.org/optimize/#vectorclass . It's just header files. The code works with SSE as well (but then only the expected 2x boost).
One small caveat: the x and y arrays and the results array have to hold a multiple of 4 vectors. The number of vectors itself does not have to be a multiple of 4, though (see the padding sketch below).
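If your vector count is not a multiple of 4, one simple fix (my own suggestion, not part of the code below) is to round the allocations up and zero the extra input vectors, then ignore the extra results:

    int nvec_padded = (nvec + 3) & ~3;                                    // round up to a multiple of 4
    double *x_aos = (double*)_mm_malloc(nvec_padded*6*sizeof(double), 32);
    // ... fill x_aos[0 .. 6*nvec-1] as usual ...
    for (int i = 6*nvec; i < 6*nvec_padded; i++) x_aos[i] = 0.0;          // zero the padding vectors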
Compile like this:
    g++ m6.cpp -o m6 -O3 -mavx -fopenmp     // system with AVX
    g++ m6.cpp -o m6 -O3 -msse4.2 -fopenmp  // system with SSE
In Visual Studio, put /arch:AVX in the compiler command line options.
The code:
    #include <stdio.h>
    #include <omp.h>
    #include "vectorclass.h"
    #include <stdlib.h>

    double prod_scalar(double *x, double *m, double *y) {
        double sum = 0.0f;
        for(int i=0; i<6; i++) {
            for(int j=0; j<6; j++) {
                sum += x[i]*m[i*6 + j]*y[j];
            }
        }
        return sum;
    }

    void prod_block4(double *x, double *m, double *y, double *result) {
        Vec4d sum4 = 0.0f;
        for(int i=0; i<6; i++) {
            Vec4d x4 = Vec4d().load(&x[4*i]);
            for(int j=0; j<6; j++) {
                Vec4d y4 = Vec4d().load(&y[4*j]);
                sum4 += x4*m[i*6 + j]*y4;
            }
        }
        sum4.store(result);
    }

    void prod_block4_unroll2(double *x, double *m, double *y, double *result) {
        Vec4d sum4_1 = 0.0f;
        Vec4d sum4_2 = 0.0f;
        Vec4d yrow[6];
        for(int i=0; i<6; i++) {
            yrow[i] = Vec4d().load(&y[4*i]);
        }
        for(int i=0; i<6; i++) {
            Vec4d x4 = Vec4d().load(&x[4*i]);
            for(int j=0; j<6; j+=2) {
                sum4_1 += x4*m[i*6 + j]*yrow[j];
                sum4_2 += x4*m[i*6 + j+1]*yrow[j+1];
            }
        }
        sum4_1 += sum4_2;
        sum4_1.store(result);
    }

    void loop_scalar(double *x, double *m, double *y, double *result, const int nvec) {
        //#pragma omp parallel for
        for(int i=0; i<nvec; i++) {
            result[i] = prod_scalar(&x[6*i], m, &y[6*i]);
        }
    }

    void loop_block4(double *x, double *m, double *y, double *result, const int nvec) {
        //#pragma omp parallel for
        for(int i=0; i<(nvec/4); i++) {
            //prod_block4(&x[i*24], m, &y[i*24], &result[4*i]);
            prod_block4_unroll2(&x[i*24], m, &y[i*24], &result[4*i]);
        }
    }

    void aos2soa(double *in, double *out) {
        int cnt = 0;
        for(int i=0; i<6; i++) {
            for(int j=0; j<4; j++) {
                out[cnt++] = in[6*j + i];
            }
        }
    }

    void aos2aosoa(double *in, double *out, const int nvec) {
        for(int i=0; i<(nvec/4); i++) {
            aos2soa(&in[i*24], &out[i*24]);
        }
    }

    double compare_results(double *r1, double *r2, const int nvec) {
        double out = 0.0f;
        for(int i=0; i<nvec; i++) {
            out += r1[i] - r2[i];
            //if(out!=0) printf("%f %f\n", r1[i], r2[i]);
        }
        return out;
    }

    void loop(const int nvec, const int repeat) {
        double *m              = (double*)_mm_malloc(6*6*sizeof(double),32);
        double *x_aos          = (double*)_mm_malloc(nvec*6*sizeof(double),32);
        double *y_aos          = (double*)_mm_malloc(nvec*6*sizeof(double),32);
        double *x_aosoa        = (double*)_mm_malloc(nvec*6*sizeof(double),32);
        double *y_aosoa        = (double*)_mm_malloc(nvec*6*sizeof(double),32);
        double *results_scalar = (double*)_mm_malloc(nvec*sizeof(double),32);
        double *results_vector = (double*)_mm_malloc(nvec*sizeof(double),32);

        for(int i=0; i<6; i++) {
            for(int j=0; j<6; j++) {
                m[j*6 + i] = i*6 + j;
            }
        }
        for(int i=0; i<(6*nvec); i++) {
            double r1 = (double)rand()/RAND_MAX;
            double r2 = (double)rand()/RAND_MAX;
            //x_aos[i] = i;
            x_aos[i] = r1;
            //y_aos[i] = i;
            y_aos[i] = r2;
        }

        aos2aosoa(x_aos, x_aosoa, nvec);
        aos2aosoa(y_aos, y_aosoa, nvec);

        double dtime;
        dtime = omp_get_wtime();
        for(int i=0; i<repeat; i++) {
            loop_scalar(x_aos, m, y_aos, results_scalar, nvec);
        }
        dtime = omp_get_wtime() - dtime;
        printf("time scalar %f\n", dtime);

        double dtime_old = dtime;
        dtime = omp_get_wtime();
        for(int i=0; i<repeat; i++) {
            loop_block4(x_aosoa, m, y_aosoa, results_vector, nvec);
        }
        dtime = omp_get_wtime() - dtime;
        printf("time vector %f\n", dtime);
        printf("time scalar/vector %f\n", dtime_old/dtime);
        printf("difference %f\n", compare_results(results_scalar, results_vector, nvec));

        _mm_free(m);
        _mm_free(x_aos);
        _mm_free(y_aos);
        _mm_free(x_aosoa);
        _mm_free(y_aosoa);
        _mm_free(results_scalar);
        _mm_free(results_vector);
    }

    int main() {
        int nveca[3];
        nveca[0] = 2000;      // 2000*2*6*8 = 192KB    // L2
        nveca[1] = 32000;     // 32000*2*6*8 = 3MB     // L3
        nveca[2] = 4*1000000; // 4000000*2*6*8 = 366MB // >>L3
        int nrepeat[3];
        nrepeat[0] = 10000;
        nrepeat[1] = 1000;
        nrepeat[2] = 10;
        for(int i=0; i<3; i++) {
            printf("nvec %d, repeat %d\n", nveca[i], nrepeat[i]);
            loop(nveca[i], nrepeat[i]);
            printf("\n");
        }
    }
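A note on the unrolled kernel: prod_block4_unroll2 keeps two independent Vec4d accumulators (sum4_1 and sum4_2) and preloads the y block into registers, so successive multiply-adds do not all chain through a single register. That breaks up the floating-point add dependency chain that prod_block4 would otherwise have, which is why the unrolled version is the one called from loop_block4.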
Here are the results for float. The boost is 8x:
    nvec 2000,    repeat 10000, 1 thread: time scalar/simd 7.86
    nvec 32000,   repeat 1000,  1 thread: time scalar/simd 7.63
    nvec 5000000, repeat 100,   1 thread: time scalar/simd 6.63
Here is the code using float instead of double:
    #include <stdio.h>
    #include <omp.h>
    #include "vectorclass.h"
    #include <stdlib.h>

    #define round_down(x, s) ((x) & ~((s)-1))

    float prod_scalar(float *x, float *m, float *y) {
        float sum = 0.0f;
        for(int i=0; i<6; i++) {
            for(int j=0; j<6; j++) {
                sum += x[i]*m[i*6 + j]*y[j];
            }
        }
        return sum;
    }

    float prod_scalar_unroll2(float *x, float *m, float *y) {
        float sum_1 = 0.0f;
        float sum_2 = 0.0f;
        for(int i=0; i<6; i++) {
            for(int j=0; j<6; j+=2) {
                sum_1 += x[i]*m[i*6 + j]*y[j];
                sum_2 += x[i]*m[i*6 + j+1]*y[j+1];
            }
        }
        return sum_1 + sum_2;
    }

    void prod_block4(float *x, float *m, float *y, float *result) {
        Vec8f sum4 = 0.0f;
        for(int i=0; i<6; i++) {
            Vec8f x4 = Vec8f().load(&x[8*i]);
            for(int j=0; j<6; j++) {
                Vec8f y4 = Vec8f().load(&y[8*j]);
                sum4 += x4*m[i*6 + j]*y4;
            }
        }
        sum4.store(result);
    }

    void prod_block4_unroll2(float *x, float *m, float *y, float *result) {
        Vec8f sum4_1 = 0.0f;
        Vec8f sum4_2 = 0.0f;
        Vec8f yrow[6];
        for(int i=0; i<6; i++) {
            yrow[i] = Vec8f().load(&y[8*i]);
        }
        for(int i=0; i<6; i++) {
            Vec8f x4 = Vec8f().load(&x[8*i]);
            for(int j=0; j<6; j+=2) {
                sum4_1 += x4*m[i*6 + j]*yrow[j];
                sum4_2 += x4*m[i*6 + j+1]*yrow[j+1];
            }
        }
        sum4_1 += sum4_2;
        sum4_1.store(result);
    }

    void loop_scalar(float *x, float *m, float *y, float *result, const int nvec) {
        //#pragma omp parallel for
        for(int i=0; i<nvec; i++) {
            result[i] = prod_scalar(&x[6*i], m, &y[6*i]);
            //result[i] = prod_scalar_unroll2(&x[6*i], m, &y[6*i]);
        }
    }

    void loop_simd(float *x, float *m, float *y, float *result, const int nvec) {
        //#pragma omp parallel for schedule(static,256)
        //#pragma omp parallel for schedule(static)
        const int n = nvec/8;
        //printf("chunk %d\n", n/2);
        //omp_set_num_threads(2);
        //#pragma omp parallel
        {
            //int nthreads = omp_get_num_threads();
            //int ithread = omp_get_thread_num();
            //int start = (ithread*n)/nthreads;
            //int end = ((ithread+1)*n)/nthreads;
            //printf("ithread, start %d, end %d, chunk %d\n", start, end, end-start);
            //#pragma omp for
            for(int i=0; i<(nvec/8); i++) {
            //for(int i=start; i<end; i++) {
                //prod_block4(&x[i*24], m, &y[i*24], &result[4*i]);
                prod_block4_unroll2(&x[i*48], m, &y[i*48], &result[8*i]);
            }
        }
    }

    void aos2soa(float *in, float *out) {
        int cnt = 0;
        for(int i=0; i<6; i++) {
            for(int j=0; j<8; j++) {
                out[cnt++] = in[6*j + i];
            }
        }
    }

    void aos2aosoa(float *in, float *out, const int nvec) {
        for(int i=0; i<(nvec/8); i++) {
            aos2soa(&in[i*48], &out[i*48]);
        }
    }

    float compare_results(float *r1, float *r2, const int nvec) {
        float out = 0.0f;
        for(int i=0; i<nvec; i++) {
            out += r1[i] - r2[i];
            //if(out!=0) printf("%f %f\n", r1[i], r2[i]);
        }
        return out;
    }

    void loop(const int nvec, const int repeat) {
        float *m              = (float*)_mm_malloc(6*6*sizeof(float),64);
        float *x_aos          = (float*)_mm_malloc(nvec*6*sizeof(float),64);
        float *y_aos          = (float*)_mm_malloc(nvec*6*sizeof(float),64);
        float *x_aosoa        = (float*)_mm_malloc(nvec*6*sizeof(float),64);
        float *y_aosoa        = (float*)_mm_malloc(nvec*6*sizeof(float),64);
        float *results_scalar = (float*)_mm_malloc(nvec*sizeof(float),64);
        float *results_simd   = (float*)_mm_malloc(nvec*sizeof(float),64);

        for(int i=0; i<6; i++) {
            for(int j=0; j<6; j++) {
                m[j*6 + i] = i*6 + j;
            }
        }
        for(int i=0; i<(6*nvec); i++) {
            float r1 = (float)rand()/RAND_MAX;
            float r2 = (float)rand()/RAND_MAX;
            //x_aos[i] = i;
            x_aos[i] = r1;
            //y_aos[i] = i;
            y_aos[i] = r2;
        }

        aos2aosoa(x_aos, x_aosoa, nvec);
        aos2aosoa(y_aos, y_aosoa, nvec);

        double dtime;   // omp_get_wtime() returns double
        dtime = omp_get_wtime();
        for(int i=0; i<repeat; i++) {
            loop_scalar(x_aos, m, y_aos, results_scalar, nvec);
        }
        dtime = omp_get_wtime() - dtime;
        printf("time scalar %f\n", dtime);

        double dtime_old = dtime;
        dtime = omp_get_wtime();
        for(int i=0; i<repeat; i++) {
            loop_simd(x_aosoa, m, y_aosoa, results_simd, nvec);
        }
        dtime = omp_get_wtime() - dtime;
        printf("time simd %f\n", dtime);
        printf("time scalar/simd %f\n", dtime_old/dtime);
        printf("difference %f\n", compare_results(results_scalar, results_simd, nvec));

        _mm_free(m);
        _mm_free(x_aos);
        _mm_free(y_aos);
        _mm_free(x_aosoa);
        _mm_free(y_aosoa);
        _mm_free(results_scalar);
        _mm_free(results_simd);
    }

    int main() {
        int nveca[3];
        nveca[0] = 2000;      // 2000*2*6*4 = 96KB     // L2
        nveca[1] = 32000;     // 32000*2*6*4 = 1.5MB   // L3
        nveca[2] = 5*1000000; // 5000000*2*6*4 = 240MB // >>L3
        int nrepeat[3];
        nrepeat[0] = 10000;
        nrepeat[1] = 1000;
        nrepeat[2] = 100;
        for(int i=0; i<3; i++) {
            printf("nvec %d, repeat %d\n", nveca[i], nrepeat[i]);
            loop(nveca[i], nrepeat[i]);
            printf("\n");
        }
    }