Beispiel: likwid Marker API in dgemm
- dgemm Benchmark mit Marker API bauen und Anzahl Threads konfigurieren
svn cp mt-dgemm{,.likwid}.c
vim mt-dgemm.likwid.c
svn diff
-
mt-dgemm.likwid.c
  11  11    #include "mkl.h"
  12  12    #endif
  13  13
      14    // include header file of likwid API
      15    #include "likwid.h"
      16
  14  17    #define DGEMM_RESTRICT __restrict__
  15  18
  16  19    // ------------------------------------------------------- //
   …   …
  83  86        exit(-1);
  84  87    }
  85  88
      89    // initialize likwid marker API
      90    LIKWID_MARKER_INIT;
      91    #pragma omp parallel
      92    LIKWID_MARKER_THREADINIT;
      93
  86  94    printf("Allocating Matrices...\n");
  87  95
  88  96    double* DGEMM_RESTRICT matrixA = (double*) malloc(sizeof(double) * N * N);
   …   …
 119 127    // Repeat multiple times
 120 128    for(r = 0; r < repeats; r++) {
 121 129    #if defined( USE_MKL ) || defined (USE_CBLAS)
     130
     131    #pragma omp parallel
     132    LIKWID_MARKER_START( "cblas_dgemm" );
     133
 122 134        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
 123 135            N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
     136
     137    #pragma omp parallel
     138    LIKWID_MARKER_STOP( "cblas_dgemm" );
     139
 124 140    #elif defined( USE_NVBLAS )
 125 141        char transA = 'N';
 126 142        char transB = 'N';
   …   …
 158 174    double final_sum = 0;
 159 175    long long int count = 0;
 160 176
     177    #pragma omp parallel
     178    LIKWID_MARKER_START( "validate" );
     179
 161 180    #pragma omp parallel for reduction(+:final_sum, count)
 162 181    for(i = 0; i < N; i++) {
 163 182        for(j = 0; j < N; j++) {
   …   …
 166 185        }
 167 186    }
 168 187
     188    #pragma omp parallel
     189    LIKWID_MARKER_STOP( "validate" );
     190
 169 191    double N_dbl = (double) N;
 170 192    double matrix_memory = (3 * N_dbl * N_dbl) * ((double) sizeof(double));
 171 193
   …   …
 207 229    free(matrixB);
 208 230    free(matrixC);
 209 231
     232    // Close likwid marker API
     233    LIKWID_MARKER_CLOSE;
     234
 210 235    return 0;
 211 236    }
module purge
export MKLROOT="/opt/intel/compilers_and_libraries_2018/linux/mkl"
gcc -std=c11 -Ofast -march=native -flto -fopenmp \
    -DUSE_MKL \
    -DLIKWID_PERFMON \
    mt-dgemm.likwid.c -o dgemm \
    -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core \
    -llikwid
export OMP_NUM_THREADS=20
export OMP_DISPLAY_ENV=VERBOSE
-
- Messen
likwid-perfctr -m -g FLOPS_AVX -C 0-19 ./dgemm 6000
...
===============================================================
Final Sum is: 6000.033333 -> Solution check PASSED successfully.
Memory for Matrices: 823.974609 MB
Multiply time: 18.395941 seconds
FLOPs computed: 12962160000000.000000
GFLOP/s rate: 704.620654 GF/s
===============================================================
--------------------------------------------------------------------------------
Region cblas_dgemm, Group 1: FLOPS_AVX
...
+---------------------------+-------------+------------+------------+------------+
|           Metric          |     Sum     |     Min    |     Max    |     Avg    |
+---------------------------+-------------+------------+------------+------------+
| Runtime (RDTSC) [s] STAT  |    363.0182 |    18.0109 |    18.3517 |    18.1509 |
| Runtime unhalted [s] STAT |    357.0114 |    17.7417 |    19.3684 |    17.8506 |
| Clock [MHz] STAT          |  58023.1584 |  2899.9989 |  2922.7940 |  2901.1579 |
| CPI STAT                  |      6.0970 |     0.3036 |     0.3184 |     0.3049 |
| Packed SP MFLOP/s STAT    | 722124.9548 | 35710.2028 | 36385.8855 | 36106.2477 |
| Packed DP MFLOP/s STAT    | 361062.4777 | 17855.1014 | 18192.9427 | 18053.1239 |
+---------------------------+-------------+------------+------------+------------+

Region validate, Group 1: FLOPS_AVX
...
+---------------------------+------------+-----------+-----------+-----------+
|           Metric          |     Sum    |    Min    |    Max    |    Avg    |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT  |     0.0637 |    0.0031 |    0.0032 |    0.0032 |
| Runtime unhalted [s] STAT |     0.0700 |    0.0035 |    0.0035 |    0.0035 |
| Clock [MHz] STAT          | 58000.2892 | 2899.9895 | 2900.0537 | 2900.0145 |
| CPI STAT                  |    68.0285 |    3.2171 |    3.9582 |    3.4014 |
| Packed SP MFLOP/s STAT    |    45.6181 |    2.2688 |    2.3099 |    2.2809 |
| Packed DP MFLOP/s STAT    |    22.8091 |    1.1344 |    1.1549 |    1.1405 |
+---------------------------+------------+-----------+-----------+-----------+
- Eine AVX Fused-multiply-add-(FMA-)Operation wird im Benchmark als Addition und Multiplikation, also als zwei Operationen, gezählt.
Last modified 13 months ago
Last modified on Mar 10, 2018, 1:12:09 PM