Tools/likwid/example_marker_api_dgemm
Example: likwid Marker API in dgemm
Add likwid marker API to dgemm source code

cp -av dgemm.multithread{,.likwid}.c
vim dgemm.multithread.likwid.c
diff -u dgemm.multithread{,.likwid}.c
--- dgemm.multithread.c 2021-09-22 10:21:38.306881793 +0200
+++ dgemm.multithread.likwid.c 2022-06-03 10:10:38.552486534 +0200
@@ -28,6 +28,9 @@
 #include "stats.h"
 #include "timing.h"
 
+// include header file of likwid API
+#include "likwid.h"
+
 // Add compiler hint: no pointer aliasing
 // See: https://en.wikipedia.org/wiki/Restrict
 #define DGEMM_RESTRICT __restrict__
@@ -92,6 +95,15 @@
         printf("Number of repetitions set to %i. Overwrite with command line option -m.\n", repeats);
     }
 
+    // initialize likwid marker API
+    likwid_markerInit();
+    #pragma omp parallel
+    likwid_markerThreadInit();
+    #pragma omp parallel
+    likwid_markerRegisterRegion( "cblas_dgemm" );
+    #pragma omp parallel
+    likwid_markerRegisterRegion( "validate" );
+
     int divisor = 1;
 #if defined(USE_UNROLLED_4X) || defined(USE_UNROLLED_4X_SIMD_4X)
     divisor = 4;
@@ -159,8 +171,14 @@
     for (int r = 0; r < repeats; r++) {
         const double start = get_time_monotonic();
 #if defined(USE_MKL) || defined(USE_CBLAS)
+        #pragma omp parallel
+        likwid_markerStartRegion( "cblas_dgemm" );
+
         cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
             N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
+
+        #pragma omp parallel
+        likwid_markerStopRegion( "cblas_dgemm" );
 #elif defined(USE_NVBLAS)
         char transA = 'N';
         char transB = 'N';
@@ -524,6 +542,8 @@
         gigaFlops[r] = (flops_per_step / time_taken) / 1000000000.0;
     }
 
+    #pragma omp parallel
+    likwid_markerStartRegion( "validate" );
     double infNorm = matrix_check(
         N,
@@ -531,6 +551,9 @@
         alpha,
         beta,
         repeats);
 
+    #pragma omp parallel
+    likwid_markerStopRegion( "validate" );
+
     printf("\n");
     printf("===============================================================\n");
@@ -577,5 +600,8 @@
     free(matrixB);
     free(matrixC);
 
+    // Close likwid marker API
+    likwid_markerClose();
+
     return 0;
 }
Build dgemm benchmark with likwid marker API and configure number of OpenMP and MKL threads

module purge
module add \
    compiler/gnu \
    numlib/mkl/2022
gcc -std=c11 -Ofast -march=native -flto -fopenmp \
    -DUSE_MKL \
    -DLIKWID_PERFMON \
    -o dgemm \
    timing.c stats.c matrix_common.c dgemm.multithread.likwid.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core \
    -lm -llikwid
export OMP_NUM_THREADS=76
export MKL_NUM_THREADS=76
Measure
likwid-perfctr --marker --group FLOPS_AVX -C 0-75 ./dgemm -n 8000
--------------------------------------------------------------------------------
CPU name:  Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz
CPU type:  Intel Icelake SP processor
CPU clock: 2.39 GHz
--------------------------------------------------------------------------------
Number of repetitions set to 30. Overwrite with command line option -m.
Matrix size: 8000
Repeat multiply 30 times.
Alpha = 1.000000
Beta = 1.000000
Allocating Matrices...
Allocation complete, populating with values...
Performing multiplication...
Calculating matrix check...

===============================================================
|| E ||_∞: 0.000000E+00 -> Solution check PASSED successfully.
Memory for Matrices: 1464.843750 MB
Multiply time: 7.280696 seconds
FLOPs computed: 30723840000000.000000
Min GFLOP/s: 3698.979764 GF/s
Max GFLOP/s: 4607.866525 GF/s
Average GFLOP/s: 4229.993211 GF/s
Std. dev. GFLOP/s: 1105.144005 GF/s
Median GFLOP/s: 4238.615980 GF/s
MAD GFLOP/s: 103.226917 GF/s
===============================================================
--------------------------------------------------------------------------------
Region cblas_dgemm, Group 1: FLOPS_AVX
...
+---------------------------+--------------+------------+------------+------------+
|           Metric          |      Sum     |     Min    |     Max    |     Avg    |
+---------------------------+--------------+------------+------------+------------+
|  Runtime (RDTSC) [s] STAT |     551.8220 |     7.2602 |     7.2614 |     7.2608 |
| Runtime unhalted [s] STAT |     587.2250 |     7.6544 |     7.7978 |     7.7266 |
|          Clock [MHz] STAT |  196314.5096 |  2563.8646 |  2602.3425 |  2583.0857 |
|                  CPI STAT |      35.0827 |     0.4533 |     0.4712 |     0.4616 |
|  Packed SP [MFLOP/s] STAT |            0 |          0 |          0 |          0 |
|  Packed DP [MFLOP/s] STAT | 4.236481e+06 | 54175.4609 | 57207.6180 | 55743.1735 |
+---------------------------+--------------+------------+------------+------------+

Region validate, Group 1: FLOPS_AVX
...
+---------------------------+-------------+-----------+-----------+-----------+
|           Metric          |     Sum     |    Min    |    Max    |    Avg    |
+---------------------------+-------------+-----------+-----------+-----------+
|  Runtime (RDTSC) [s] STAT |      0.2468 |    0.0032 |    0.0033 |    0.0032 |
| Runtime unhalted [s] STAT |      0.3272 |    0.0042 |    0.0044 |    0.0043 |
|          Clock [MHz] STAT | 241927.7916 | 3116.2181 | 3192.6470 | 3183.2604 |
|                  CPI STAT |    429.1363 |    4.5038 |    6.7668 |    5.6465 |
|  Packed SP [MFLOP/s] STAT |           0 |         0 |         0 |         0 |
|  Packed DP [MFLOP/s] STAT |  39346.8911 |  507.5717 |  528.7213 |  517.7223 |
+---------------------------+-------------+-----------+-----------+-----------+