Tools/likwid/example_marker_api_stream
Example: likwid Marker API
in stream
Add likwid marker API to
stream
source code

```bash
cp -av stream.OpenMP{,",likwid"}.c
vim stream.OpenMP,likwid.c
diff -u stream.OpenMP{,",likwid"}.c
```
--- stream.OpenMP.c 2022-06-02 15:32:36.195137149 +0200 +++ stream.OpenMP,likwid.c 2022-06-03 11:06:44.695540812 +0200 @@ -62,6 +62,8 @@ #error "OpenMP support required" #endif +#include "likwid.h" + /*----------------------------------------------------------------------- * INSTRUCTIONS: *@@ -320,6 +322,7 @@ void static inline tuned_STREAM_Copy() { #pragma omp parallel default(none) shared(STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "copy" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -328,6 +331,7 @@ for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) { c[j] = a[j]; }+ likwid_markerStopRegion( "copy" ); } } @@ -340,6 +344,7 @@ void static inline tuned_STREAM_Scale(const STREAM_TYPE scalar) { #pragma omp parallel default(none) shared(scalar, STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "scale" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -348,6 +353,7 @@ for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) { b[j] = scalar * c[j]; }+ likwid_markerStopRegion( "scale" ); } } @@ -360,6 +366,7 @@ void static inline tuned_STREAM_Add() { #pragma omp parallel default(none) shared(STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "add" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -368,6 +375,7 @@ for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) { c[j] = a[j] + b[j]; }+ likwid_markerStopRegion( "add" ); } } @@ -380,6 +388,7 @@ void static inline tuned_STREAM_Triad(const STREAM_TYPE scalar) { #pragma omp parallel default(none) shared(scalar, STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "triad" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -388,6 +397,7 @@ for (long int j = 0; j < 
STREAM_ARRAY_SIZE_thread; j++) { a[j] = b[j] + scalar * c[j]; }+ likwid_markerStopRegion( "triad" ); } } @@ -556,6 +566,17 @@ // Explicitly turn off dynamic threads omp_set_dynamic(0); + // initalize likwid marker API + likwid_markerInit(); + #pragma omp parallel + { + likwid_markerThreadInit(); + likwid_markerRegisterRegion( "copy" ); + likwid_markerRegisterRegion( "scale" ); + likwid_markerRegisterRegion( "add" ); + likwid_markerRegisterRegion( "triad" ); + } + // Number of Threads requested #pragma omp parallel default(none) shared(omp_num_threads_req) #pragma omp master@@ -699,6 +720,9 @@ times[TRIAD][NTIMES_count] = mysecond() - times[TRIAD][NTIMES_count]; } + // Close likwid marker API + likwid_markerClose(); + for (int j = 0; j < NUM_BENCHMARKS; j++) { // Sort times qsort(times[j], NTIMES, sizeof(double), double_compare);
GNU Compiler
Build
stream
benchmark with likwid marker API and configure number of OpenMP threads

```bash
module purge
module add compiler/gnu/
gcc -std=c11 -Ofast -march=native -flto -fopenmp \
    -DLIKWID_PERFMON \
    -o stream \
    stream.OpenMP,likwid.c -llikwid
export OMP_NUM_THREADS=76
```
Measure
likwid-perfctr --marker --group MEM -C 0-75 ./stream -n 1000000000
-------------------------------------------------------------------------------- CPU name: Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz CPU type: Intel Icelake SP processor CPU clock: 2.39 GHz --------------------------------------------------------------------------------
------------------------------------------------------------- STREAM version $Revision: 5.10 $ ------------------------------------------------------------- This system uses 8 bytes per array element. ------------------------------------------------------------- Array size = 999999944 (elements) Memory per array = 7629.4 MiB (= 7.5 GiB). Total memory required = 22888.2 MiB (= 22.4 GiB). Each kernel will be executed 10 times. The *best* time for each kernel (excluding the first iteration) will be used to compute the reported bandwidth. ------------------------------------------------------------- OpenMP version (yyyymm): 201511 Number of Threads requested = 76 Number of Threads counted = 76 ------------------------------------------------------------- Your clock granularity appears to be 1000 ticks per microseconds. Each test below will take on the order of 55908 microseconds. (= 55908446 clock ticks) Increase the size of the arrays if this shows that you are not getting at least 20 clock ticks per test. ------------------------------------------------------------- WARNING -- The above is only a rough guideline. For best results, please be sure you know the precision of your system timer. ------------------------------------------------------------- Function Best Rate MB/s Med time Min time Max time Copy: 293309.4 0.054672 0.054550 0.056677 Scale: 299201.7 0.054348 0.053476 0.056038 Add: 308924.8 0.078704 0.077689 0.081481 Triad: 308298.5 0.077964 0.077847 0.080953 ------------------------------------------------------------- Solution Validates: avg error less than 1.000000e-13 on all three arrays -------------------------------------------------------------
-------------------------------------------------------------------------------- Region copy, Group 1: MEM ... +----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 44.7582 | 0.5775 | 0.5976 | 0.5889 | | Runtime unhalted [s] STAT | 59.3359 | 0.7663 | 0.7916 | 0.7807 | | Clock [MHz] STAT | 242588.4192 | 3191.1481 | 3192.4704 | 3191.9529 | | CPI STAT | 923.4100 | 11.9241 | 12.3188 | 12.1501 | | Memory read bandwidth [MBytes/s] STAT | 157984.9461 | 0 | 79160.1940 | 2078.7493 | | Memory read data volume [GBytes] STAT | 94.1370 | 0 | 47.1038 | 1.2386 | | Memory write bandwidth [MBytes/s] STAT | 151223.1620 | 0 | 75795.9096 | 1989.7784 | | Memory write data volume [GBytes] STAT | 90.1078 | 0 | 45.0735 | 1.1856 | | Memory bandwidth [MBytes/s] STAT | 309208.1082 | 0 | 154956.1037 | 4068.5277 | | Memory data volume [GBytes] STAT | 184.2448 | 0 | 92.1773 | 2.4243 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region scale, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 43.6729 | 0.5583 | 0.5861 | 0.5746 | | Runtime unhalted [s] STAT | 56.0438 | 0.7172 | 0.7520 | 0.7374 | | Clock [MHz] STAT | 235113.7291 | 3093.0513 | 3098.6806 | 3093.6017 | | CPI STAT | 741.4033 | 9.4874 | 9.9494 | 9.7553 | | Memory read bandwidth [MBytes/s] STAT | 155054.9603 | 0 | 77735.3586 | 2040.1968 | | Memory read data volume [GBytes] STAT | 90.1942 | 0 | 45.1921 | 1.1868 | | Memory write bandwidth [MBytes/s] STAT | 150473.8550 | 0 | 75490.0827 | 1979.9191 | | Memory write data volume [GBytes] STAT | 87.5291 | 0 | 43.8269 | 1.1517 | | Memory bandwidth [MBytes/s] STAT | 305528.8153 | 0 | 153225.4413 | 4020.1160 | | Memory data volume [GBytes] STAT | 177.7233 | 0 | 89.0190 | 2.3385 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region add, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 63.8526 | 0.8225 | 0.8575 | 0.8402 | | Runtime unhalted [s] STAT | 83.0766 | 1.0688 | 1.1200 | 1.0931 | | Clock [MHz] STAT | 238192.4262 | 3129.0514 | 3149.1971 | 3134.1109 | | CPI STAT | 915.9088 | 11.7839 | 12.3473 | 12.0514 | | Memory read bandwidth [MBytes/s] STAT | 208382.3277 | 0 | 104925.9668 | 2741.8727 | | Memory read data volume [GBytes] STAT | 177.2967 | 0 | 88.7102 | 2.3329 | | Memory write bandwidth [MBytes/s] STAT | 103585.0244 | 0 | 52163.7290 | 1362.9608 | | Memory write data volume [GBytes] STAT | 88.1325 | 0 | 44.0919 | 1.1596 | | Memory bandwidth [MBytes/s] STAT | 311967.3521 | 0 | 157089.6958 | 4104.8336 | | Memory data volume [GBytes] STAT | 265.4292 | 0 | 132.8021 | 3.4925 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region triad, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 63.9644 | 0.8263 | 0.8523 | 0.8416 | | Runtime unhalted [s] STAT | 83.2508 | 1.0757 | 1.1107 | 1.0954 | | Clock [MHz] STAT | 238205.2459 | 3130.4694 | 3147.2296 | 3134.2796 | | CPI STAT | 917.8332 | 11.8600 | 12.2454 | 12.0768 | | Memory read bandwidth [MBytes/s] STAT | 208229.8586 | 0 | 104257.0588 | 2739.8666 | | Memory read data volume [GBytes] STAT | 177.0683 | 0 | 88.6341 | 2.3298 | | Memory write bandwidth [MBytes/s] STAT | 103424.9941 | 0 | 51765.9677 | 1360.8552 | | Memory write data volume [GBytes] STAT | 87.9475 | 0 | 44.0088 | 1.1572 | | Memory bandwidth [MBytes/s] STAT | 311654.8526 | 0 | 156023.0264 | 4100.7217 | | Memory data volume [GBytes] STAT | 265.0158 | 0 | 132.6429 | 3.4870 | +----------------------------------------+-------------+-----------+-------------+-----------+
Region | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor | # Load Ops. | # Store Ops. | Factor |
---|---|---|---|---|---|---|
copy | 94.1 | 90.1 | 1.04 | 1 | 1 | 1 |
scale | 90.2 | 87.5 | 1.03 | 1 | 1 | 1 |
add | 177.3 | 88.1 | 2.01 | 2 | 1 | 2 |
triad | 177.1 | 87.9 | 2.01 | 2 | 1 | 2 |
-> GCC does not use non-temporal stores -> a cache line read for ownership (RFO) is needed
Intel Compiler
Build
stream
benchmark with likwid marker API and configure number of OpenMP threads

```bash
module add compiler/intel/2022
icc -std=c11 -Ofast -xHost -ipo -qopenmp \
    -DLIKWID_PERFMON \
    -o stream \
    stream.OpenMP,likwid.c -llikwid
export OMP_NUM_THREADS=76
```
Measure
likwid-perfctr --marker --group MEM -C 0-75 ./stream -n 1000000000
-------------------------------------------------------------------------------- CPU name: Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz CPU type: Intel Icelake SP processor CPU clock: 2.39 GHz --------------------------------------------------------------------------------
------------------------------------------------------------- STREAM version $Revision: 5.10 $ ------------------------------------------------------------- This system uses 8 bytes per array element. ------------------------------------------------------------- Array size = 999999944 (elements) Memory per array = 7629.4 MiB (= 7.5 GiB). Total memory required = 22888.2 MiB (= 22.4 GiB). Each kernel will be executed 10 times. The *best* time for each kernel (excluding the first iteration) will be used to compute the reported bandwidth. ------------------------------------------------------------- OpenMP version (yyyymm): 201611 Number of Threads requested = 76 Number of Threads counted = 76 ------------------------------------------------------------- Your clock granularity appears to be 1000 ticks per microseconds. Each test below will take on the order of 52335 microseconds. (= 52335268 clock ticks) Increase the size of the arrays if this shows that you are not getting at least 20 clock ticks per test. ------------------------------------------------------------- WARNING -- The above is only a rough guideline. For best results, please be sure you know the precision of your system timer. ------------------------------------------------------------- Function Best Rate MB/s Med time Min time Max time Copy: 315179.0 0.050808 0.050765 0.052163 Scale: 313017.1 0.051187 0.051115 0.052063 Add: 319321.8 0.075316 0.075159 0.077444 Triad: 318246.4 0.075510 0.075413 0.076306 ------------------------------------------------------------- Solution Validates: avg error less than 1.000000e-13 on all three arrays -------------------------------------------------------------
Region copy, Group 1: MEM ... +----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 41.8482 | 0.5382 | 0.5564 | 0.5506 | | Runtime unhalted [s] STAT | 55.4945 | 0.7139 | 0.7373 | 0.7302 | | Clock [MHz] STAT | 242600.9243 | 3190.8415 | 3192.4624 | 3192.1174 | | CPI STAT | 734.1395 | 9.4443 | 9.7546 | 9.6597 | | Memory read bandwidth [MBytes/s] STAT | 161530.0713 | 0 | 80918.6365 | 2125.3957 | | Memory read data volume [GBytes] STAT | 88.6124 | 0 | 44.3561 | 1.1660 | | Memory write bandwidth [MBytes/s] STAT | 159778.0010 | 0 | 80092.7669 | 2102.3421 | | Memory write data volume [GBytes] STAT | 87.6511 | 0 | 43.8465 | 1.1533 | | Memory bandwidth [MBytes/s] STAT | 321308.0723 | 0 | 161011.4034 | 4227.7378 | | Memory data volume [GBytes] STAT | 176.2634 | 0 | 88.2026 | 2.3193 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region scale, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 41.8878 | 0.5363 | 0.5592 | 0.5512 | | Runtime unhalted [s] STAT | 53.8308 | 0.6896 | 0.7182 | 0.7083 | | Clock [MHz] STAT | 235089.3170 | 3093.0067 | 3097.8999 | 3093.2805 | | CPI STAT | 712.1258 | 9.1233 | 9.5014 | 9.3701 | | Memory read bandwidth [MBytes/s] STAT | 160693.0404 | 0 | 80400.3782 | 2114.3821 | | Memory read data volume [GBytes] STAT | 89.1143 | 0 | 44.6628 | 1.1726 | | Memory write bandwidth [MBytes/s] STAT | 158364.8350 | 0 | 79465.7938 | 2083.7478 | | Memory write data volume [GBytes] STAT | 87.8224 | 0 | 43.9348 | 1.1556 | | Memory bandwidth [MBytes/s] STAT | 319057.8753 | 0 | 159866.1719 | 4198.1299 | | Memory data volume [GBytes] STAT | 176.9365 | 0 | 88.5503 | 2.3281 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region add, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 62.0432 | 0.8066 | 0.8242 | 0.8164 | | Runtime unhalted [s] STAT | 81.1006 | 1.0548 | 1.0785 | 1.0671 | | Clock [MHz] STAT | 239109.9365 | 3140.5410 | 3150.6727 | 3146.1834 | | CPI STAT | 894.1243 | 11.6297 | 11.8899 | 11.7648 | | Memory read bandwidth [MBytes/s] STAT | 215032.0937 | 0 | 107714.6641 | 2829.3697 | | Memory read data volume [GBytes] STAT | 176.1806 | 0 | 88.2026 | 2.3182 | | Memory write bandwidth [MBytes/s] STAT | 107543.0152 | 0 | 53933.9665 | 1415.0397 | | Memory write data volume [GBytes] STAT | 88.1121 | 0 | 44.0605 | 1.1594 | | Memory bandwidth [MBytes/s] STAT | 322575.1088 | 0 | 161648.6306 | 4244.4093 | | Memory data volume [GBytes] STAT | 264.2928 | 0 | 132.2631 | 3.4775 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region triad, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 62.0811 | 0.8065 | 0.8268 | 0.8169 | | Runtime unhalted [s] STAT | 81.1337 | 1.0546 | 1.0801 | 1.0675 | | Clock [MHz] STAT | 239105.4407 | 3139.8575 | 3151.7754 | 3146.1242 | | CPI STAT | 894.4848 | 11.6269 | 11.9085 | 11.7695 | | Memory read bandwidth [MBytes/s] STAT | 214584.7874 | 0 | 107525.1345 | 2823.4840 | | Memory read data volume [GBytes] STAT | 176.3537 | 0 | 88.2335 | 2.3204 | | Memory write bandwidth [MBytes/s] STAT | 107024.2402 | 0 | 53675.3900 | 1408.2137 | | Memory write data volume [GBytes] STAT | 87.9563 | 0 | 43.9887 | 1.1573 | | Memory bandwidth [MBytes/s] STAT | 321609.0276 | 0 | 161200.5245 | 4231.6977 | | Memory data volume [GBytes] STAT | 264.3099 | 0 | 132.2010 | 3.4778 | +----------------------------------------+-------------+-----------+-------------+-----------+
Region | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor | # Load Ops. | # Store Ops. | Factor |
---|---|---|---|---|---|---|
copy | 88.6 | 87.6 | 1.01 | 1 | 1 | 1 |
scale | 89.1 | 87.8 | 1.01 | 1 | 1 | 1 |
add | 176.2 | 88.1 | 2.00 | 2 | 1 | 2 |
triad | 176.3 | 88.0 | 2.00 | 2 | 1 | 2 |
-> Non-temporal stores / streaming stores
Generate assembler-code to validate non-temporal stores usage
```bash
module add compiler/intel/2022
icc -std=c11 -Ofast -xHost -ipo -qopenmp \
    -S -fverbose-asm -masm=intel \
    -DLIKWID_PERFMON \
    stream.OpenMP,likwid.c -llikwid
# -> stream.OpenMP,likwid.s
```
C-code for scale loop (b = scalar * c)

```c
#pragma omp simd aligned (b, c : alignment_bytes)
for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
    b[j] = scalar*c[j];
```
Assembler-code for scale loop
```asm
                                # LOE rax rdx rcx rbx rsi ymm0
..B7.20:                        # Preds ..B7.20 ..B7.19
                                # Execution count [5.00e+00]
        vmulpd    ymm1, ymm0, YMMWORD PTR [rcx+rdx*8]   #553.27
        vmovntpd  YMMWORD PTR [rbx+rdx*8], ymm1         #553.13
        add       rdx, 4                                #552.9
        cmp       rdx, rax                              #552.9
        jb        ..B7.20       # Prob 82%              #552.9
                                # LOE rax rdx rcx rbx rsi ymm0
```
vmulpd Multiply Packed Double-Precision Floating-Point Values
vmovntpd Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
C-code for add loop (c = a + b)

```c
#pragma omp simd aligned (a, b, c : alignment_bytes)
for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
    c[j] = a[j] + b[j];
```
Assembler-code for add loop
```asm
                                # LOE rax rdx rcx rsi rdi r8
..B7.24:                        # Preds ..B7.24 ..B7.23
                                # Execution count [5.00e+00]
                                # optimization report
                                # LOOP WAS VECTORIZED
                                # SIMD LOOP
                                # VECTORIZATION SPEEDUP COEFFECIENT 6.402344
                                # VECTOR TRIP COUNT IS ESTIMATED CONSTANT
                                # VECTOR LENGTH 4
                                # MAIN VECTOR TYPE: 64-bits floating point
                                # DEPENDENCY ANALYSIS WAS IGNORED
                                # COST MODEL DECISION WAS IGNORED
        vmovupd   ymm0, YMMWORD PTR [rdi+rdx*8]         #568.20
        vaddpd    ymm1, ymm0, YMMWORD PTR [rsi+rdx*8]   #568.27
        vmovntpd  YMMWORD PTR [rcx+rdx*8], ymm1         #568.13
        add       rdx, 4                                #567.9
        cmp       rdx, rax                              #567.9
        jb        ..B7.24       # Prob 82%              #567.9
                                # LOE rax rdx rcx rsi rdi r8
..B7.26:                        # Preds ..B7.24 ..B7.32
                                # Execution count [1.00e+00]
```
vmovupd Move Unaligned Packed Double-Precision Floating-Point Values
vaddpd Add Packed Double-Precision Floating-Point Values
vmovntpd Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint