Example: likwid Marker API in stream
- Add likwid marker API to
stream
source code:
cp -av stream_aligned_alloc_restrict_simd_threadprivate{,.likwid}.c
diff -u stream_aligned_alloc_restrict_simd_threadprivate{,.likwid}.c
-
stream_aligned_alloc_restrict_simd_threadprivate.
old new 62 62 #error "OpenMP support required" 63 63 #endif 64 64 65 #include "likwid.h" 66 65 67 /*----------------------------------------------------------------------- 66 68 * INSTRUCTIONS: 67 69 * … … 338 340 339 341 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 340 342 343 // initalize likwid marker API 344 likwid_markerInit(); 345 #pragma omp parallel 346 likwid_markerThreadInit(); 347 likwid_markerRegisterRegion( "copy" ); 348 likwid_markerRegisterRegion( "scale" ); 349 likwid_markerRegisterRegion( "add" ); 350 likwid_markerRegisterRegion( "triad" ); 351 341 352 scalar = 3.0; 342 353 for (int NTIMES_count = 0; NTIMES_count < NTIMES; NTIMES_count++) 343 354 { … … 358 369 times[3][NTIMES_count] = mysecond() - times[3][NTIMES_count]; 359 370 } 360 371 372 // Close likwid marker API 373 likwid_markerClose(); 374 361 375 /* --- SUMMARY --- */ 362 376 363 377 for (int NTIMES_count = 1; NTIMES_count < NTIMES; NTIMES_count++) { /* note -- skip first iteration */ … … 514 528 void inline tuned_STREAM_Copy() { 515 529 #pragma omp parallel 516 530 { 531 likwid_markerStartRegion( "copy" ); 517 532 #ifdef __INTEL_COMPILER 518 533 // Instructs the compiler to use non-temporal (that is, streaming) stores 519 534 #pragma vector nontemporal … … 521 536 #pragma omp simd aligned (a, c : alignment_bytes) 522 537 for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) 523 538 c[j] = a[j]; 539 likwid_markerStopRegion( "copy" ); 524 540 } 525 541 } 526 542 527 543 void inline tuned_STREAM_Scale(STREAM_TYPE scalar) { 528 544 #pragma omp parallel shared(scalar) 529 545 { 546 likwid_markerStartRegion( "scale" ); 530 547 #ifdef __INTEL_COMPILER 531 548 // Instructs the compiler to use non-temporal (that is, streaming) stores 532 549 #pragma vector nontemporal … … 534 551 #pragma omp simd aligned (b, c : alignment_bytes) 535 552 for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) 536 553 b[j] = scalar*c[j]; 554 likwid_markerStopRegion( "scale" ); 537 555 } 538 556 } 
539 557 540 558 void inline tuned_STREAM_Add() { 541 559 #pragma omp parallel 542 560 { 561 likwid_markerStartRegion( "add" ); 543 562 #ifdef __INTEL_COMPILER 544 563 // Instructs the compiler to use non-temporal (that is, streaming) stores 545 564 #pragma vector nontemporal … … 547 566 #pragma omp simd aligned (a, b, c : alignment_bytes) 548 567 for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) 549 568 c[j] = a[j] + b[j]; 569 likwid_markerStopRegion( "add" ); 550 570 } 551 571 } 552 572 553 573 void inline tuned_STREAM_Triad(STREAM_TYPE scalar) { 554 574 #pragma omp parallel shared(scalar) 555 575 { 576 likwid_markerStartRegion( "triad" ); 556 577 #ifdef __INTEL_COMPILER 557 578 // Instructs the compiler to use non-temporal (that is, streaming) stores 558 579 #pragma vector nontemporal … … 560 581 #pragma omp simd aligned (a, b, c : alignment_bytes) 561 582 for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) 562 583 a[j] = b[j] + scalar * c[j]; 584 likwid_markerStopRegion( "triad" ); 563 585 } 564 586 }
-
GNU Compiler
- Build
stream
benchmark with likwid marker API and configure number of OpenMP threads:
module purge
module add compiler/gnu/7
gcc -std=c11 -Ofast -march=native -flto -fopenmp \
    -DLIKWID_PERFMON \
    stream_aligned_alloc_restrict_simd_threadprivate.likwid.c -o stream \
    -llikwid
export OMP_NUM_THREADS=20
export OMP_DISPLAY_ENV=VERBOSE
- Measure
likwid-perfctr -m -g MEM -C 0-19 ./stream -n 1000000000 ------------------------------------------------------------- STREAM version $Revision: 5.10 $ ------------------------------------------------------------- This system uses 8 bytes per array element. ------------------------------------------------------------- Array size = 1000000000 (elements) (elements) Memory per array = 7629.4 MiB (= 7.5 GiB). Total memory required = 22888.2 MiB (= 22.4 GiB). Each kernel will be executed 10 times. The *best* time for each kernel (excluding the first iteration) will be used to compute the reported bandwidth. ------------------------------------------------------------- Number of Threads requested = 20 Number of Threads counted = 20 ------------------------------------------------------------- Your clock granularity/precision appears to be 1 microseconds. Each test below will take on the order of 155283 microseconds. (= 155283 clock ticks) Increase the size of the arrays if this shows that you are not getting at least 20 clock ticks per test. ------------------------------------------------------------- WARNING -- The above is only a rough guideline. For best results, please be sure you know the precision of your system timer. ------------------------------------------------------------- Function Best Rate MB/s Avg time Min time Max time Copy: 100297.3 0.194358 0.159526 0.231492 Scale: 71806.5 0.267166 0.222821 0.285324 Add: 76779.7 0.330398 0.312583 0.344842 Triad: 73913.4 0.337314 0.324704 0.348187 ------------------------------------------------------------- Solution Validates: avg error less than 1.000000e-13 on all three arrays ------------------------------------------------------------- -------------------------------------------------------------------------------- Region copy, Group 1: MEM ... 
+----------------------------------------+------------+-----------+------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+------------+-----------+------------+-----------+ | Runtime (RDTSC) [s] STAT | 105.2816 | 1.8112 | 8.0703 | 5.2641 | | Runtime unhalted [s] STAT | 33.4022 | 1.5716 | 1.7510 | 1.6701 | | Clock [MHz] STAT | 58047.0517 | 2900.0345 | 2909.6461 | 2902.3526 | | CPI STAT | 138.8288 | 6.5340 | 7.2795 | 6.9414 | | Memory read bandwidth [MBytes/s] STAT | 26949.1583 | 0 | 20430.6773 | 1347.4579 | | Memory read data volume [GBytes] STAT | 95.8998 | 0 | 52.6062 | 4.7950 | | Memory write bandwidth [MBytes/s] STAT | 24411.6227 | 0 | 18698.1870 | 1220.5811 | | Memory write data volume [GBytes] STAT | 85.7317 | 0 | 46.1093 | 4.2866 | | Memory bandwidth [MBytes/s] STAT | 51360.7810 | 0 | 39128.8643 | 2568.0391 | | Memory data volume [GBytes] STAT | 181.6315 | 0 | 98.7155 | 9.0816 | +----------------------------------------+------------+-----------+------------+-----------+ Region scale, Group 1: MEM ... 
+----------------------------------------+------------+-----------+------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+------------+-----------+------------+-----------+ | Runtime (RDTSC) [s] STAT | 62.1135 | 2.3494 | 4.1016 | 3.1057 | | Runtime unhalted [s] STAT | 42.7436 | 2.0872 | 2.1730 | 2.1372 | | Clock [MHz] STAT | 58042.6544 | 2900.0211 | 2906.4216 | 2902.1327 | | CPI STAT | 126.9545 | 6.1979 | 6.4542 | 6.3477 | | Memory read bandwidth [MBytes/s] STAT | 57228.4835 | 0 | 28629.5335 | 2861.4242 | | Memory read data volume [GBytes] STAT | 155.2007 | 0 | 77.9430 | 7.7600 | | Memory write bandwidth [MBytes/s] STAT | 28149.4728 | 0 | 14268.1504 | 1407.4736 | | Memory write data volume [GBytes] STAT | 76.3357 | 0 | 38.5442 | 3.8168 | | Memory bandwidth [MBytes/s] STAT | 85377.9562 | 0 | 42867.1003 | 4268.8978 | | Memory data volume [GBytes] STAT | 231.5363 | 0 | 115.8019 | 11.5768 | +----------------------------------------+------------+-----------+------------+-----------+ Region add, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+------------+-----------+ | Runtime (RDTSC) [s] STAT | 62.6522 | 2.5966 | 4.6608 | 3.1326 | | Runtime unhalted [s] STAT | 57.3206 | 2.7614 | 2.9583 | 2.8660 | | Clock [MHz] STAT | 58045.4757 | 2899.9997 | 2910.5732 | 2902.2738 | | CPI STAT | 132.4399 | 6.3805 | 6.8341 | 6.6220 | | Memory read bandwidth [MBytes/s] STAT | 79073.0496 | 0 | 39603.8543 | 3953.6525 | | Memory read data volume [GBytes] STAT | 231.6670 | 0 | 117.0292 | 11.5833 | | Memory write bandwidth [MBytes/s] STAT | 26372.1563 | 0 | 13249.9267 | 1318.6078 | | Memory write data volume [GBytes] STAT | 77.2709 | 0 | 39.2871 | 3.8635 | | Memory bandwidth [MBytes/s] STAT | 105445.2059 | 0 | 52726.0839 | 5272.2603 | | Memory data volume [GBytes] STAT | 308.9379 | 0 | 156.3163 | 15.4469 | +----------------------------------------+-------------+-----------+------------+-----------+ Region triad, Group 1: MEM ... 
+----------------------------------------+-------------+-----------+------------+-----------+ | Metric | Sum | Min | Max | Avg | +----------------------------------------+-------------+-----------+------------+-----------+ | Runtime (RDTSC) [s] STAT | 71.8622 | 2.9886 | 4.5887 | 3.5931 | | Runtime unhalted [s] STAT | 56.5096 | 2.7858 | 2.8638 | 2.8255 | | Clock [MHz] STAT | 58047.6112 | 2900.0235 | 2908.3383 | 2902.3806 | | CPI STAT | 130.5558 | 6.4351 | 6.6171 | 6.5278 | | Memory read bandwidth [MBytes/s] STAT | 75582.4062 | 0 | 38766.0249 | 3779.1203 | | Memory read data volume [GBytes] STAT | 228.0437 | 0 | 115.9692 | 11.4022 | | Memory write bandwidth [MBytes/s] STAT | 25026.1645 | 0 | 12821.9244 | 1251.3082 | | Memory write data volume [GBytes] STAT | 75.5085 | 0 | 38.3570 | 3.7754 | | Memory bandwidth [MBytes/s] STAT | 100608.5707 | 0 | 51587.9493 | 5030.4285 | | Memory data volume [GBytes] STAT | 303.5522 | 0 | 154.3262 | 15.1776 | +----------------------------------------+-------------+-----------+------------+-----------+
Kernel | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor | # Load Ops. | # Store Ops. | Factor |
---|---|---|---|---|---|---|
copy | 95.9 | 85.7 | 1.1 | 1 | 1 | 1 |
scale | 155.2 | 76.3 | 2.0 | 1 | 1 | 1 |
add | 231.7 | 77.3 | 3.0 | 2 | 1 | 2 |
triad | 228.0 | 75.5 | 3.0 | 2 | 1 | 2 |
-> GCC does not use non-temporal stores -> a cache line read for ownership (RFO) is needed
Intel Compiler
- Build
stream
benchmark with likwid marker API and configure number of OpenMP threads:
module add compiler/intel/18.0 devel/likwid
icc -std=c11 -Ofast -xHost -ipo -qopenmp \
    -DLIKWID_PERFMON \
    stream_aligned_alloc_restrict_simd_threadprivate.likwid.c -o stream \
    -llikwid
export OMP_NUM_THREADS=1 # Parallel execution hangs
export OMP_DISPLAY_ENV=VERBOSE
- Measure
likwid-perfctr -m -g MEM -C 0-19 ./stream -n 1000000000
------------------------------------------------------------- Function Best Rate MB/s Avg time Min time Max time Copy: 17196.8 1.041387 0.930408 1.159887 Scale: 17910.5 1.105390 0.893331 1.281133 Add: 16354.9 1.556997 1.467446 1.778082 Triad: 17146.9 1.540055 1.399667 1.629647 ------------------------------------------------------------- Region copy, Group 1: MEM ... +-----------------------------------+------------+ | Metric | Core 0 | +-----------------------------------+------------+ | Runtime (RDTSC) [s] | 10.4968 | | Runtime unhalted [s] | 12.5352 | | Clock [MHz] | 3299.5261 | | CPI | 2.6070 | | Memory read bandwidth [MBytes/s] | 7814.3566 | | Memory read data volume [GBytes] | 82.0255 | | Memory write bandwidth [MBytes/s] | 7687.9019 | | Memory write data volume [GBytes] | 80.6981 | | Memory bandwidth [MBytes/s] | 15502.2585 | | Memory data volume [GBytes] | 162.7236 | +-----------------------------------+------------+ Region scale, Group 1: MEM ... +-----------------------------------+------------+ | Metric | Core 0 | +-----------------------------------+------------+ | Runtime (RDTSC) [s] | 12.1564 | | Runtime unhalted [s] | 12.6785 | | Clock [MHz] | 3099.7527 | | CPI | 2.6364 | | Memory read bandwidth [MBytes/s] | 6749.1471 | | Memory read data volume [GBytes] | 82.0452 | | Memory write bandwidth [MBytes/s] | 6643.5625 | | Memory write data volume [GBytes] | 80.7617 | | Memory bandwidth [MBytes/s] | 13392.7096 | | Memory data volume [GBytes] | 162.8069 | +-----------------------------------+------------+ Region add, Group 1: MEM ... 
+-----------------------------------+------------+ | Metric | Core 0 | +-----------------------------------+------------+ | Runtime (RDTSC) [s] | 16.6954 | | Runtime unhalted [s] | 17.6440 | | Clock [MHz] | 3085.4745 | | CPI | 3.0575 | | Memory read bandwidth [MBytes/s] | 9767.7467 | | Memory read data volume [GBytes] | 163.0766 | | Memory write bandwidth [MBytes/s] | 4852.6880 | | Memory write data volume [GBytes] | 81.0177 | | Memory bandwidth [MBytes/s] | 14620.4347 | | Memory data volume [GBytes] | 244.0943 | +-----------------------------------+------------+ Region triad, Group 1: MEM ... +-----------------------------------+------------+ | Metric | Core 0 | +-----------------------------------+------------+ | Runtime (RDTSC) [s] | 15.2351 | | Runtime unhalted [s] | 16.9975 | | Clock [MHz] | 3099.4366 | | CPI | 2.9458 | | Memory read bandwidth [MBytes/s] | 10691.6703 | | Memory read data volume [GBytes] | 162.8891 | | Memory write bandwidth [MBytes/s] | 5308.4322 | | Memory write data volume [GBytes] | 80.8747 | | Memory bandwidth [MBytes/s] | 16000.1025 | | Memory data volume [GBytes] | 243.7638 | +-----------------------------------+------------+
Kernel | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor | # Load Ops. | # Store Ops. | Factor |
---|---|---|---|---|---|---|
copy | 82.0 | 80.7 | 1.0 | 1 | 1 | 1 |
scale | 82.0 | 80.8 | 1.0 | 1 | 1 | 1 |
add | 163.1 | 81.0 | 2.0 | 2 | 1 | 2 |
triad | 162.9 | 80.9 | 2.0 | 2 | 1 | 2 |
-> Non-temporal stores / streaming stores
- Generate assembler-code to validate non-temporal stores usage
module add compiler/intel
icc -std=c11 -Ofast -xHost -ipo -qopenmp \
    -S -fverbose-asm -masm=intel \
    -DLIKWID_PERFMON \
    stream_aligned_alloc_restrict_simd_threadprivate.likwid.c \
    -llikwid
- C-code for scale loop (
b = scalar * c
):
#pragma omp simd aligned (b, c : alignment_bytes)
for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
    b[j] = scalar*c[j];
- Assembler-code for scale loop
# LOE rax rdx rcx rbx rsi ymm0 ..B7.20: # Preds ..B7.20 ..B7.19 # Execution count [5.00e+00] vmulpd ymm1, ymm0, YMMWORD PTR [rcx+rdx*8] #553.27 vmovntpd YMMWORD PTR [rbx+rdx*8], ymm1 #553.13 add rdx, 4 #552.9 cmp rdx, rax #552.9 jb ..B7.20 # Prob 82% #552.9 # LOE rax rdx rcx rbx rsi ymm0
- vmulpd
- Multiply Packed Double-Precision Floating-Point Values
- vmovntpd
- Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
- C-code for add loop (
c = a + b
):
#pragma omp simd aligned (a, b, c : alignment_bytes)
for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
    c[j] = a[j] + b[j];
- Assembler-code for add loop
# LOE rax rdx rcx rsi rdi r8 ..B7.24: # Preds ..B7.24 ..B7.23 # Execution count [5.00e+00] # optimization report # LOOP WAS VECTORIZED # SIMD LOOP # VECTORIZATION SPEEDUP COEFFECIENT 6.402344 # VECTOR TRIP COUNT IS ESTIMATED CONSTANT # VECTOR LENGTH 4 # MAIN VECTOR TYPE: 64-bits floating point # DEPENDENCY ANALYSIS WAS IGNORED # COST MODEL DECISION WAS IGNORED vmovupd ymm0, YMMWORD PTR [rdi+rdx*8] #568.20 vaddpd ymm1, ymm0, YMMWORD PTR [rsi+rdx*8] #568.27 vmovntpd YMMWORD PTR [rcx+rdx*8], ymm1 #568.13 add rdx, 4 #567.9 cmp rdx, rax #567.9 jb ..B7.24 # Prob 82% #567.9 # LOE rax rdx rcx rsi rdi r8 ..B7.26: # Preds ..B7.24 ..B7.32 # Execution count [1.00e+00]
- vmovupd
- Move Unaligned Packed Double-Precision Floating-Point Values
- vaddpd
- Add Packed Double-Precision Floating-Point Values
- vmovntpd
- Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
Last modified 12 months ago
Last modified on Apr 9, 2018, 5:23:48 PM