wiki:Tools/likwid/example_marker_api_dgemm

Beispiel: likwid Marker API in dgemm

  • dgemm Benchmark mit Marker API bauen und Anzahl Threads konfigurieren
    svn cp mt-dgemm{,.likwid}.c
    vim mt-dgemm.likwid.c
    svn diff
    
    • mt-dgemm.likwid.c

       
      1111#include "mkl.h"
      1212#endif
      1313
       14// include header file of likwid API
       15#include "likwid.h"
       16
      1417#define DGEMM_RESTRICT __restrict__
      1518
      1619// ------------------------------------------------------- //
       
      8386               exit(-1);
      8487       }
      8588
       89        // initialize likwid marker API
       90        LIKWID_MARKER_INIT;
       91        #pragma omp parallel
       92        LIKWID_MARKER_THREADINIT;
       93
      8694       printf("Allocating Matrices...\n");
      8795
      8896       double* DGEMM_RESTRICT matrixA = (double*) malloc(sizeof(double) * N * N);
       
      119127       // Repeat multiple times
      120128       for(r = 0; r < repeats; r++) {
      121129#if defined( USE_MKL ) || defined (USE_CBLAS)
       130
       131        #pragma omp parallel
       132        LIKWID_MARKER_START( "cblas_dgemm" );
       133
      122134        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
      123135            N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
       136
       137        #pragma omp parallel
       138        LIKWID_MARKER_STOP( "cblas_dgemm" );
       139
      124140#elif defined( USE_NVBLAS )
      125141               char transA = 'N';
      126142               char transB = 'N';
       
      158174       double final_sum = 0;
      159175       long long int count     = 0;
      160176
       177        #pragma omp parallel
       178        LIKWID_MARKER_START( "validate" );
       179
      161180       #pragma omp parallel for reduction(+:final_sum, count)
      162181       for(i = 0; i < N; i++) {
      163182               for(j = 0; j < N; j++) {
       
      166185               }
      167186       }
      168187
       188        #pragma omp parallel
       189        LIKWID_MARKER_STOP( "validate" );
       190
      169191       double N_dbl = (double) N;
      170192       double matrix_memory = (3 * N_dbl * N_dbl) * ((double) sizeof(double));
      171193
       
      207229       free(matrixB);
      208230       free(matrixC);
      209231
       232        // Close likwid marker API
       233        LIKWID_MARKER_CLOSE;
       234
      210235       return 0;
      211236}
    module purge
    export MKLROOT="/opt/intel/compilers_and_libraries_2018/linux/mkl"
    gcc -std=c11 -Ofast -march=native -flto -fopenmp \
        -DUSE_MKL \
        -DLIKWID_PERFMON \
         mt-dgemm.likwid.c -o dgemm \
         -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core \
        -llikwid
    export OMP_NUM_THREADS=20
    export OMP_DISPLAY_ENV=VERBOSE
    
  • Messen
    likwid-perfctr -m -g FLOPS_AVX -C 0-19 ./dgemm 6000
    
    ...
    ===============================================================
    Final Sum is:         6000.033333
     -> Solution check PASSED successfully.
    Memory for Matrices:  823.974609 MB
    Multiply time:        18.395941 seconds
    FLOPs computed:       12962160000000.000000
    GFLOP/s rate:         704.620654 GF/s
    ===============================================================
    
    --------------------------------------------------------------------------------
    Region cblas_dgemm, Group 1: FLOPS_AVX
    ...
    +---------------------------+-------------+------------+------------+------------+
    |           Metric          |     Sum     |     Min    |     Max    |     Avg    |
    +---------------------------+-------------+------------+------------+------------+
    |  Runtime (RDTSC) [s] STAT |    363.0182 |    18.0109 |    18.3517 |    18.1509 |
    | Runtime unhalted [s] STAT |    357.0114 |    17.7417 |    19.3684 |    17.8506 |
    |      Clock [MHz] STAT     |  58023.1584 |  2899.9989 |  2922.7940 |  2901.1579 |
    |          CPI STAT         |      6.0970 |     0.3036 |     0.3184 |     0.3049 |
    |   Packed SP MFLOP/s STAT  | 722124.9548 | 35710.2028 | 36385.8855 | 36106.2477 |
    |   Packed DP MFLOP/s STAT  | 361062.4777 | 17855.1014 | 18192.9427 | 18053.1239 |
    +---------------------------+-------------+------------+------------+------------+
    
    Region validate, Group 1: FLOPS_AVX
    ...
    +---------------------------+------------+-----------+-----------+-----------+
    |           Metric          |     Sum    |    Min    |    Max    |    Avg    |
    +---------------------------+------------+-----------+-----------+-----------+
    |  Runtime (RDTSC) [s] STAT |     0.0637 |    0.0031 |    0.0032 |    0.0032 |
    | Runtime unhalted [s] STAT |     0.0700 |    0.0035 |    0.0035 |    0.0035 |
    |      Clock [MHz] STAT     | 58000.2892 | 2899.9895 | 2900.0537 | 2900.0145 |
    |          CPI STAT         |    68.0285 |    3.2171 |    3.9582 |    3.4014 |
    |   Packed SP MFLOP/s STAT  |    45.6181 |    2.2688 |    2.3099 |    2.2809 |
    |   Packed DP MFLOP/s STAT  |    22.8091 |    1.1344 |    1.1549 |    1.1405 |
    +---------------------------+------------+-----------+-----------+-----------+
    
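  • Eine AVX-Fused-Multiply-Add-Operation (FMA) wird im Benchmark als Addition und Multiplikation, also als zwei Operationen, gezählt. Deshalb ist die vom Benchmark gemeldete Rate (ca. 705 GF/s) etwa doppelt so hoch wie die von likwid für die Region cblas_dgemm gemessene Summe der Packed DP MFLOP/s (ca. 361 GFLOP/s).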
Last modified on Mar 10, 2018, 1:12:09 PM