Tools/likwid/example_marker_api_stream
Example: likwid Marker API in stream
Add likwid marker API to stream source code
cp -av stream.OpenMP{,",likwid"}.c
vim stream.OpenMP,likwid.c
diff -u stream.OpenMP{,",likwid"}.c
--- stream.OpenMP.c 2022-06-02 15:32:36.195137149 +0200 +++ stream.OpenMP,likwid.c 2022-06-03 11:06:44.695540812 +0200 @@ -62,6 +62,8 @@ #error "OpenMP support required" #endif +#include "likwid.h" + /*----------------------------------------------------------------------- * INSTRUCTIONS: *@@ -320,6 +322,7 @@ void static inline tuned_STREAM_Copy() { #pragma omp parallel default(none) shared(STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "copy" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -328,6 +331,7 @@ for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) { c[j] = a[j]; }+ likwid_markerStopRegion( "copy" ); } } @@ -340,6 +344,7 @@ void static inline tuned_STREAM_Scale(const STREAM_TYPE scalar) { #pragma omp parallel default(none) shared(scalar, STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "scale" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -348,6 +353,7 @@ for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) { b[j] = scalar * c[j]; }+ likwid_markerStopRegion( "scale" ); } } @@ -360,6 +366,7 @@ void static inline tuned_STREAM_Add() { #pragma omp parallel default(none) shared(STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "add" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -368,6 +375,7 @@ for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) { c[j] = a[j] + b[j]; }+ likwid_markerStopRegion( "add" ); } } @@ -380,6 +388,7 @@ void static inline tuned_STREAM_Triad(const STREAM_TYPE scalar) { #pragma omp parallel default(none) shared(scalar, STREAM_ARRAY_SIZE_thread) {+ likwid_markerStartRegion( "triad" ); #ifdef __INTEL_COMPILER // Instructs the compiler to use non-temporal (that is, streaming) stores #pragma vector nontemporal@@ -388,6 +397,7 @@ for (long int j = 0; j < 
STREAM_ARRAY_SIZE_thread; j++) { a[j] = b[j] + scalar * c[j]; }+ likwid_markerStopRegion( "triad" ); } } @@ -556,6 +566,17 @@ // Explicitly turn off dynamic threads omp_set_dynamic(0); + // initialize likwid marker API + likwid_markerInit(); + #pragma omp parallel + { + likwid_markerThreadInit(); + likwid_markerRegisterRegion( "copy" ); + likwid_markerRegisterRegion( "scale" ); + likwid_markerRegisterRegion( "add" ); + likwid_markerRegisterRegion( "triad" ); + } + // Number of Threads requested #pragma omp parallel default(none) shared(omp_num_threads_req) #pragma omp master@@ -699,6 +720,9 @@ times[TRIAD][NTIMES_count] = mysecond() - times[TRIAD][NTIMES_count]; } + // Close likwid marker API + likwid_markerClose(); + for (int j = 0; j < NUM_BENCHMARKS; j++) { // Sort times qsort(times[j], NTIMES, sizeof(double), double_compare);
Prepare environment
module purge
module add compiler/intel/2022
Build stream benchmark with likwid marker API
icc -std=c11 -Ofast -xHost -ipo -qopenmp \
    -DLIKWID_PERFMON \
    -o stream \
    stream.OpenMP,likwid.c -llikwid
Run benchmark with 76 threads
# Number of threads and thread binding are handled by likwid-perfctr
unset OMP_NUM_THREADS
likwid-perfctr --marker --group MEM -C 0-75 \
    -n 1000000000 ./stream
-------------------------------------------------------------------------------- CPU name: Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz CPU type: Intel Icelake SP processor CPU clock: 2.39 GHz --------------------------------------------------------------------------------
------------------------------------------------------------- STREAM version $Revision: 5.10 $ ------------------------------------------------------------- This system uses 8 bytes per array element. ------------------------------------------------------------- Array size = 999999944 (elements) Memory per array = 7629.4 MiB (= 7.5 GiB). Total memory required = 22888.2 MiB (= 22.4 GiB). Each kernel will be executed 10 times. The *best* time for each kernel (excluding the first iteration) will be used to compute the reported bandwidth. ------------------------------------------------------------- OpenMP version (yyyymm): 201611 Number of Threads requested = 76 Number of Threads counted = 76 ------------------------------------------------------------- Your clock granularity appears to be 1000 ticks per microseconds. Each test below will take on the order of 52335 microseconds. (= 52335268 clock ticks) Increase the size of the arrays if this shows that you are not getting at least 20 clock ticks per test. ------------------------------------------------------------- WARNING -- The above is only a rough guideline. For best results, please be sure you know the precision of your system timer. ------------------------------------------------------------- Function Best Rate MB/s Med time Min time Max time Copy: 315179.0 0.050808 0.050765 0.052163 Scale: 313017.1 0.051187 0.051115 0.052063 Add: 319321.8 0.075316 0.075159 0.077444 Triad: 318246.4 0.075510 0.075413 0.076306 ------------------------------------------------------------- Solution Validates: avg error less than 1.000000e-13 on all three arrays -------------------------------------------------------------
Region copy, Group 1: MEM ... +----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 41.8482 | 0.5382 | 0.5564 | 0.5506 | | Runtime unhalted [s] STAT | 55.4945 | 0.7139 | 0.7373 | 0.7302 | | Clock [MHz] STAT | 242600.9243 | 3190.8415 | 3192.4624 | 3192.1174 | | CPI STAT | 734.1395 | 9.4443 | 9.7546 | 9.6597 | | Memory read bandwidth [MBytes/s] STAT | 161530.0713 | 0 | 80918.6365 | 2125.3957 | | Memory read data volume [GBytes] STAT | 88.6124 | 0 | 44.3561 | 1.1660 | | Memory write bandwidth [MBytes/s] STAT | 159778.0010 | 0 | 80092.7669 | 2102.3421 | | Memory write data volume [GBytes] STAT | 87.6511 | 0 | 43.8465 | 1.1533 | | Memory bandwidth [MBytes/s] STAT | 321308.0723 | 0 | 161011.4034 | 4227.7378 | | Memory data volume [GBytes] STAT | 176.2634 | 0 | 88.2026 | 2.3193 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region scale, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 41.8878 | 0.5363 | 0.5592 | 0.5512 | | Runtime unhalted [s] STAT | 53.8308 | 0.6896 | 0.7182 | 0.7083 | | Clock [MHz] STAT | 235089.3170 | 3093.0067 | 3097.8999 | 3093.2805 | | CPI STAT | 712.1258 | 9.1233 | 9.5014 | 9.3701 | | Memory read bandwidth [MBytes/s] STAT | 160693.0404 | 0 | 80400.3782 | 2114.3821 | | Memory read data volume [GBytes] STAT | 89.1143 | 0 | 44.6628 | 1.1726 | | Memory write bandwidth [MBytes/s] STAT | 158364.8350 | 0 | 79465.7938 | 2083.7478 | | Memory write data volume [GBytes] STAT | 87.8224 | 0 | 43.9348 | 1.1556 | | Memory bandwidth [MBytes/s] STAT | 319057.8753 | 0 | 159866.1719 | 4198.1299 | | Memory data volume [GBytes] STAT | 176.9365 | 0 | 88.5503 | 2.3281 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region add, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 62.0432 | 0.8066 | 0.8242 | 0.8164 | | Runtime unhalted [s] STAT | 81.1006 | 1.0548 | 1.0785 | 1.0671 | | Clock [MHz] STAT | 239109.9365 | 3140.5410 | 3150.6727 | 3146.1834 | | CPI STAT | 894.1243 | 11.6297 | 11.8899 | 11.7648 | | Memory read bandwidth [MBytes/s] STAT | 215032.0937 | 0 | 107714.6641 | 2829.3697 | | Memory read data volume [GBytes] STAT | 176.1806 | 0 | 88.2026 | 2.3182 | | Memory write bandwidth [MBytes/s] STAT | 107543.0152 | 0 | 53933.9665 | 1415.0397 | | Memory write data volume [GBytes] STAT | 88.1121 | 0 | 44.0605 | 1.1594 | | Memory bandwidth [MBytes/s] STAT | 322575.1088 | 0 | 161648.6306 | 4244.4093 | | Memory data volume [GBytes] STAT | 264.2928 | 0 | 132.2631 | 3.4775 | +----------------------------------------+-------------+-----------+-------------+-----------+ Region triad, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+-------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+-------------+-----------+ | Runtime (RDTSC) [s] STAT | 62.0811 | 0.8065 | 0.8268 | 0.8169 | | Runtime unhalted [s] STAT | 81.1337 | 1.0546 | 1.0801 | 1.0675 | | Clock [MHz] STAT | 239105.4407 | 3139.8575 | 3151.7754 | 3146.1242 | | CPI STAT | 894.4848 | 11.6269 | 11.9085 | 11.7695 | | Memory read bandwidth [MBytes/s] STAT | 214584.7874 | 0 | 107525.1345 | 2823.4840 | | Memory read data volume [GBytes] STAT | 176.3537 | 0 | 88.2335 | 2.3204 | | Memory write bandwidth [MBytes/s] STAT | 107024.2402 | 0 | 53675.3900 | 1408.2137 | | Memory write data volume [GBytes] STAT | 87.9563 | 0 | 43.9887 | 1.1573 | | Memory bandwidth [MBytes/s] STAT | 321609.0276 | 0 | 161200.5245 | 4231.6977 | | Memory data volume [GBytes] STAT | 264.3099 | 0 | 132.2010 | 3.4778 | +----------------------------------------+-------------+-----------+-------------+-----------+
Region | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor | # Load Ops. | # Store Ops. | Factor |
---|---|---|---|---|---|---|
copy | 88.6 | 87.7 | 1.01 | 1 | 1 | 1 |
scale | 89.1 | 87.8 | 1.01 | 1 | 1 | 1 |
add | 176.2 | 88.1 | 2.00 | 2 | 1 | 2 |
triad | 176.3 | 88.0 | 2.00 | 2 | 1 | 2 |