wiki:Tools/likwid/example_marker_api_stream

Example: likwid Marker API in stream

  • Add likwid marker API to stream source code
    cp -av  stream_aligned_alloc_restrict_simd_threadprivate{,.likwid}.c
    diff -u stream_aligned_alloc_restrict_simd_threadprivate{,.likwid}.c
    
    • stream_aligned_alloc_restrict_simd_threadprivate.

      old new  
      6262    #error "OpenMP support required"
      6363#endif
      6464
       65#include "likwid.h"
       66
      6567/*-----------------------------------------------------------------------
      6668 * INSTRUCTIONS:
      6769 *
       
      338340   
      339341    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
      340342
       343    // initialize likwid marker API
       344    likwid_markerInit();
       345    #pragma omp parallel
       346    likwid_markerThreadInit();
       347    likwid_markerRegisterRegion( "copy" );
       348    likwid_markerRegisterRegion( "scale" );
       349    likwid_markerRegisterRegion( "add" );
       350    likwid_markerRegisterRegion( "triad" );
       351
      341352    scalar = 3.0;
      342353    for (int NTIMES_count = 0; NTIMES_count < NTIMES; NTIMES_count++)
      343354       {
       
      358369       times[3][NTIMES_count] = mysecond() - times[3][NTIMES_count];
      359370       }
      360371
       372    // Close likwid marker API
       373    likwid_markerClose();
       374
      361375    /* --- SUMMARY --- */
      362376
      363377    for (int NTIMES_count = 1; NTIMES_count < NTIMES; NTIMES_count++) { /* note -- skip first iteration */
       
      514528void inline tuned_STREAM_Copy() {
      515529    #pragma omp parallel
      516530    {
       531        likwid_markerStartRegion( "copy" );
      517532        #ifdef __INTEL_COMPILER
      518533            // Instructs the compiler to use non-temporal (that is, streaming) stores
      519534            #pragma vector nontemporal
       
      521536        #pragma omp simd aligned (a, c : alignment_bytes)
      522537        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
      523538           c[j] = a[j];
       539        likwid_markerStopRegion( "copy" );
      524540    }
      525541}
      526542
      527543void inline tuned_STREAM_Scale(STREAM_TYPE scalar) {
      528544    #pragma omp parallel shared(scalar)
      529545    {
       546        likwid_markerStartRegion( "scale" );
      530547        #ifdef __INTEL_COMPILER
      531548            // Instructs the compiler to use non-temporal (that is, streaming) stores
      532549            #pragma vector nontemporal
       
      534551        #pragma omp simd aligned (b, c : alignment_bytes)
      535552        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
      536553            b[j] = scalar*c[j];
       554        likwid_markerStopRegion( "scale" );
      537555    }
      538556}
      539557
      540558void inline tuned_STREAM_Add() {
      541559    #pragma omp parallel
      542560    {
       561        likwid_markerStartRegion( "add" );
      543562        #ifdef __INTEL_COMPILER
      544563            // Instructs the compiler to use non-temporal (that is, streaming) stores
      545564            #pragma vector nontemporal
       
      547566        #pragma omp simd aligned (a, b, c : alignment_bytes)
      548567        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
      549568            c[j] = a[j] + b[j];
       569        likwid_markerStopRegion( "add" );
      550570    }
      551571}
      552572
      553573void inline tuned_STREAM_Triad(STREAM_TYPE scalar) {
      554574    #pragma omp parallel shared(scalar)
      555575    {
       576        likwid_markerStartRegion( "triad" );
      556577        #ifdef __INTEL_COMPILER
      557578            // Instructs the compiler to use non-temporal (that is, streaming) stores
      558579            #pragma vector nontemporal
       
      560581        #pragma omp simd aligned (a, b, c : alignment_bytes)
      561582        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
      562583            a[j] = b[j] + scalar * c[j];
       584        likwid_markerStopRegion( "triad" );
      563585    }
      564586}

GNU Compiler

  • Build stream benchmark with likwid marker API and configure number of OpenMP threads
    module purge
    module add compiler/gnu/7
    
    gcc -std=c11 -Ofast -march=native -flto -fopenmp \
        -DLIKWID_PERFMON \
         stream_aligned_alloc_restrict_simd_threadprivate.likwid.c -o stream \
        -llikwid
    export OMP_NUM_THREADS=20
    export OMP_DISPLAY_ENV=VERBOSE
    
  • Measure
    likwid-perfctr -m -g MEM -C 0-19 ./stream -n 1000000000
    -------------------------------------------------------------
    STREAM version $Revision: 5.10 $
    -------------------------------------------------------------
    This system uses 8 bytes per array element.
    -------------------------------------------------------------
    Array size = 1000000000 (elements) (elements)
    Memory per array = 7629.4 MiB (= 7.5 GiB).
    Total memory required = 22888.2 MiB (= 22.4 GiB).
    Each kernel will be executed 10 times.
     The *best* time for each kernel (excluding the first iteration)
     will be used to compute the reported bandwidth.
    -------------------------------------------------------------
    Number of Threads requested = 20
    Number of Threads counted = 20
    -------------------------------------------------------------
    Your clock granularity/precision appears to be 1 microseconds.
    Each test below will take on the order of 155283 microseconds.
       (= 155283 clock ticks)
    Increase the size of the arrays if this shows that
    you are not getting at least 20 clock ticks per test.
    -------------------------------------------------------------
    WARNING -- The above is only a rough guideline.
    For best results, please be sure you know the
    precision of your system timer.
    -------------------------------------------------------------
    Function    Best Rate MB/s  Avg time     Min time     Max time
    Copy:          100297.3     0.194358     0.159526     0.231492
    Scale:          71806.5     0.267166     0.222821     0.285324
    Add:            76779.7     0.330398     0.312583     0.344842
    Triad:          73913.4     0.337314     0.324704     0.348187
    -------------------------------------------------------------
    Solution Validates: avg error less than 1.000000e-13 on all three arrays
    -------------------------------------------------------------
    --------------------------------------------------------------------------------
    Region copy, Group 1: MEM
    ...
    +----------------------------------------+------------+-----------+------------+-----------+
    |                 Metric                 |     Sum    |    Min    |     Max    |    Avg    |
    +----------------------------------------+------------+-----------+------------+-----------+
    |        Runtime (RDTSC) [s] STAT        |   105.2816 |    1.8112 |     8.0703 |    5.2641 |
    |        Runtime unhalted [s] STAT       |    33.4022 |    1.5716 |     1.7510 |    1.6701 |
    |            Clock [MHz] STAT            | 58047.0517 | 2900.0345 |  2909.6461 | 2902.3526 |
    |                CPI STAT                |   138.8288 |    6.5340 |     7.2795 |    6.9414 |
    |  Memory read bandwidth [MBytes/s] STAT | 26949.1583 |         0 | 20430.6773 | 1347.4579 |
    |  Memory read data volume [GBytes] STAT |    95.8998 |         0 |    52.6062 |    4.7950 |
    | Memory write bandwidth [MBytes/s] STAT | 24411.6227 |         0 | 18698.1870 | 1220.5811 |
    | Memory write data volume [GBytes] STAT |    85.7317 |         0 |    46.1093 |    4.2866 |
    |    Memory bandwidth [MBytes/s] STAT    | 51360.7810 |         0 | 39128.8643 | 2568.0391 |
    |    Memory data volume [GBytes] STAT    |   181.6315 |         0 |    98.7155 |    9.0816 |
    +----------------------------------------+------------+-----------+------------+-----------+
    
    Region scale, Group 1: MEM
    ...
    +----------------------------------------+------------+-----------+------------+-----------+
    |                 Metric                 |     Sum    |    Min    |     Max    |    Avg    |
    +----------------------------------------+------------+-----------+------------+-----------+
    |        Runtime (RDTSC) [s] STAT        |    62.1135 |    2.3494 |     4.1016 |    3.1057 |
    |        Runtime unhalted [s] STAT       |    42.7436 |    2.0872 |     2.1730 |    2.1372 |
    |            Clock [MHz] STAT            | 58042.6544 | 2900.0211 |  2906.4216 | 2902.1327 |
    |                CPI STAT                |   126.9545 |    6.1979 |     6.4542 |    6.3477 |
    |  Memory read bandwidth [MBytes/s] STAT | 57228.4835 |         0 | 28629.5335 | 2861.4242 |
    |  Memory read data volume [GBytes] STAT |   155.2007 |         0 |    77.9430 |    7.7600 |
    | Memory write bandwidth [MBytes/s] STAT | 28149.4728 |         0 | 14268.1504 | 1407.4736 |
    | Memory write data volume [GBytes] STAT |    76.3357 |         0 |    38.5442 |    3.8168 |
    |    Memory bandwidth [MBytes/s] STAT    | 85377.9562 |         0 | 42867.1003 | 4268.8978 |
    |    Memory data volume [GBytes] STAT    |   231.5363 |         0 |   115.8019 |   11.5768 |
    +----------------------------------------+------------+-----------+------------+-----------+
    
    Region add, Group 1: MEM
    ...
    +----------------------------------------+-------------+-----------+------------+-----------+
    |                 Metric                 |     Sum     |    Min    |     Max    |    Avg    |
    +----------------------------------------+-------------+-----------+------------+-----------+
    |        Runtime (RDTSC) [s] STAT        |     62.6522 |    2.5966 |     4.6608 |    3.1326 |
    |        Runtime unhalted [s] STAT       |     57.3206 |    2.7614 |     2.9583 |    2.8660 |
    |            Clock [MHz] STAT            |  58045.4757 | 2899.9997 |  2910.5732 | 2902.2738 |
    |                CPI STAT                |    132.4399 |    6.3805 |     6.8341 |    6.6220 |
    |  Memory read bandwidth [MBytes/s] STAT |  79073.0496 |         0 | 39603.8543 | 3953.6525 |
    |  Memory read data volume [GBytes] STAT |    231.6670 |         0 |   117.0292 |   11.5833 |
    | Memory write bandwidth [MBytes/s] STAT |  26372.1563 |         0 | 13249.9267 | 1318.6078 |
    | Memory write data volume [GBytes] STAT |     77.2709 |         0 |    39.2871 |    3.8635 |
    |    Memory bandwidth [MBytes/s] STAT    | 105445.2059 |         0 | 52726.0839 | 5272.2603 |
    |    Memory data volume [GBytes] STAT    |    308.9379 |         0 |   156.3163 |   15.4469 |
    +----------------------------------------+-------------+-----------+------------+-----------+
    
    Region triad, Group 1: MEM
    ...
    +----------------------------------------+-------------+-----------+------------+-----------+
    |                 Metric                 |     Sum     |    Min    |     Max    |    Avg    |
    +----------------------------------------+-------------+-----------+------------+-----------+
    |        Runtime (RDTSC) [s] STAT        |     71.8622 |    2.9886 |     4.5887 |    3.5931 |
    |        Runtime unhalted [s] STAT       |     56.5096 |    2.7858 |     2.8638 |    2.8255 |
    |            Clock [MHz] STAT            |  58047.6112 | 2900.0235 |  2908.3383 | 2902.3806 |
    |                CPI STAT                |    130.5558 |    6.4351 |     6.6171 |    6.5278 |
    |  Memory read bandwidth [MBytes/s] STAT |  75582.4062 |         0 | 38766.0249 | 3779.1203 |
    |  Memory read data volume [GBytes] STAT |    228.0437 |         0 |   115.9692 |   11.4022 |
    | Memory write bandwidth [MBytes/s] STAT |  25026.1645 |         0 | 12821.9244 | 1251.3082 |
    | Memory write data volume [GBytes] STAT |     75.5085 |         0 |    38.3570 |    3.7754 |
    |    Memory bandwidth [MBytes/s] STAT    | 100608.5707 |         0 | 51587.9493 | 5030.4285 |
    |    Memory data volume [GBytes] STAT    |    303.5522 |         0 |   154.3262 |   15.1776 |
    +----------------------------------------+-------------+-----------+------------+-----------+
    
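| Kernel | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor (read/write) | # Load Ops. | # Store Ops. | Factor (load/store) |
|--------|----------------------------------|-----------------------------------|---------------------|-------------|--------------|---------------------|
| copy   | 95.9                             | 85.7                              | 1.1                 | 1           | 1            | 1                   |
| scale  | 155.2                            | 76.3                              | 2.0                 | 1           | 1            | 1                   |
| add    | 231.7                            | 77.3                              | 3.0                 | 2           | 1            | 2                   |
| triad  | 228.0                            | 75.5                              | 3.0                 | 2           | 1            | 2                   |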

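-> GCC does not use non-temporal stores -> a cache line read for ownership (RFO) is needed for each store, so the measured read/write factor for scale, add and triad is one higher than the load/store ratio of the kernel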

Intel Compiler

  • Build stream benchmark with likwid marker API and configure number of OpenMP threads
    module add compiler/intel/18.0 devel/likwid
    icc -std=c11 -Ofast -xHost -ipo -qopenmp \
        -DLIKWID_PERFMON \
         stream_aligned_alloc_restrict_simd_threadprivate.likwid.c -o stream \
        -llikwid
    export OMP_NUM_THREADS=1        # Parallel execution hangs
    export OMP_DISPLAY_ENV=VERBOSE
    
  • Measure
    likwid-perfctr -m -g MEM -C 0-19 ./stream -n 1000000000
    
    -------------------------------------------------------------
    Function    Best Rate MB/s  Avg time     Min time     Max time
    Copy:           17196.8     1.041387     0.930408     1.159887
    Scale:          17910.5     1.105390     0.893331     1.281133
    Add:            16354.9     1.556997     1.467446     1.778082
    Triad:          17146.9     1.540055     1.399667     1.629647
    -------------------------------------------------------------
    
    Region copy, Group 1: MEM
    ...
    +-----------------------------------+------------+
    |               Metric              |   Core 0   |
    +-----------------------------------+------------+
    |        Runtime (RDTSC) [s]        |    10.4968 |
    |        Runtime unhalted [s]       |    12.5352 |
    |            Clock [MHz]            |  3299.5261 |
    |                CPI                |     2.6070 |
    |  Memory read bandwidth [MBytes/s] |  7814.3566 |
    |  Memory read data volume [GBytes] |    82.0255 |
    | Memory write bandwidth [MBytes/s] |  7687.9019 |
    | Memory write data volume [GBytes] |    80.6981 |
    |    Memory bandwidth [MBytes/s]    | 15502.2585 |
    |    Memory data volume [GBytes]    |   162.7236 |
    +-----------------------------------+------------+
    
    Region scale, Group 1: MEM
    ...
    +-----------------------------------+------------+
    |               Metric              |   Core 0   |
    +-----------------------------------+------------+
    |        Runtime (RDTSC) [s]        |    12.1564 |
    |        Runtime unhalted [s]       |    12.6785 |
    |            Clock [MHz]            |  3099.7527 |
    |                CPI                |     2.6364 |
    |  Memory read bandwidth [MBytes/s] |  6749.1471 |
    |  Memory read data volume [GBytes] |    82.0452 |
    | Memory write bandwidth [MBytes/s] |  6643.5625 |
    | Memory write data volume [GBytes] |    80.7617 |
    |    Memory bandwidth [MBytes/s]    | 13392.7096 |
    |    Memory data volume [GBytes]    |   162.8069 |
    +-----------------------------------+------------+
    
    Region add, Group 1: MEM
    ...
    +-----------------------------------+------------+
    |               Metric              |   Core 0   |
    +-----------------------------------+------------+
    |        Runtime (RDTSC) [s]        |    16.6954 |
    |        Runtime unhalted [s]       |    17.6440 |
    |            Clock [MHz]            |  3085.4745 |
    |                CPI                |     3.0575 |
    |  Memory read bandwidth [MBytes/s] |  9767.7467 |
    |  Memory read data volume [GBytes] |   163.0766 |
    | Memory write bandwidth [MBytes/s] |  4852.6880 |
    | Memory write data volume [GBytes] |    81.0177 |
    |    Memory bandwidth [MBytes/s]    | 14620.4347 |
    |    Memory data volume [GBytes]    |   244.0943 |
    +-----------------------------------+------------+
    
    Region triad, Group 1: MEM
    ...
    +-----------------------------------+------------+
    |               Metric              |   Core 0   |
    +-----------------------------------+------------+
    |        Runtime (RDTSC) [s]        |    15.2351 |
    |        Runtime unhalted [s]       |    16.9975 |
    |            Clock [MHz]            |  3099.4366 |
    |                CPI                |     2.9458 |
    |  Memory read bandwidth [MBytes/s] | 10691.6703 |
    |  Memory read data volume [GBytes] |   162.8891 |
    | Memory write bandwidth [MBytes/s] |  5308.4322 |
    | Memory write data volume [GBytes] |    80.8747 |
    |    Memory bandwidth [MBytes/s]    | 16000.1025 |
    |    Memory data volume [GBytes]    |   243.7638 |
    +-----------------------------------+------------+
    
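| Kernel | Memory read data volume [GBytes] | Memory write data volume [GBytes] | Factor (read/write) | # Load Ops. | # Store Ops. | Factor (load/store) |
|--------|----------------------------------|-----------------------------------|---------------------|-------------|--------------|---------------------|
| copy   | 82.0                             | 80.7                              | 1.0                 | 1           | 1            | 1                   |
| scale  | 82.0                             | 80.8                              | 1.0                 | 1           | 1            | 1                   |
| add    | 163.1                            | 81.0                              | 2.0                 | 2           | 1            | 2                   |
| triad  | 162.9                            | 80.9                              | 2.0                 | 2           | 1            | 2                   |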

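-> The Intel compiler uses non-temporal stores / streaming stores -> no read for ownership is needed, so the measured read/write factor matches the load/store ratio of each kernel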

  • Generate assembler-code to validate non-temporal stores usage
    module add compiler/intel
    icc -std=c11 -Ofast -xHost -ipo -qopenmp \
        -S -fverbose-asm -masm=intel \
        -DLIKWID_PERFMON  \
        stream_aligned_alloc_restrict_simd_threadprivate.likwid.c \
        -llikwid
    
  • C-code for scale loop (b = scalar * c)
            #pragma omp simd aligned (b, c : alignment_bytes)
            for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
                b[j] = scalar*c[j];
    
  • Assembler-code for scale loop
                                    # LOE rax rdx rcx rbx rsi ymm0
    ..B7.20:                        # Preds ..B7.20 ..B7.19
                                    # Execution count [5.00e+00]
            vmulpd    ymm1, ymm0, YMMWORD PTR [rcx+rdx*8]           #553.27
            vmovntpd  YMMWORD PTR [rbx+rdx*8], ymm1                 #553.13
            add       rdx, 4                                        #552.9
            cmp       rdx, rax                                      #552.9
            jb        ..B7.20       # Prob 82%                      #552.9
                                    # LOE rax rdx rcx rbx rsi ymm0
    
vmulpd
Multiply Packed Double-Precision Floating-Point Values
vmovntpd
Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
  • C-code for add loop (c = a + b)
            #pragma omp simd aligned (a, b, c : alignment_bytes)
            for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
                c[j] = a[j] + b[j];
    
  • Assembler-code for add loop
                                    # LOE rax rdx rcx rsi rdi r8
    ..B7.24:                        # Preds ..B7.24 ..B7.23
                                    # Execution count [5.00e+00]
                    # optimization report
                    # LOOP WAS VECTORIZED
                    # SIMD LOOP
                    # VECTORIZATION SPEEDUP COEFFECIENT 6.402344
                    # VECTOR TRIP COUNT IS ESTIMATED CONSTANT
                    # VECTOR LENGTH 4
                    # MAIN VECTOR TYPE: 64-bits floating point
                    # DEPENDENCY ANALYSIS WAS IGNORED
                    # COST MODEL DECISION WAS IGNORED
            vmovupd   ymm0,       YMMWORD PTR [rdi+rdx*8]           #568.20
            vaddpd    ymm1, ymm0, YMMWORD PTR [rsi+rdx*8]           #568.27
            vmovntpd  YMMWORD PTR [rcx+rdx*8], ymm1                 #568.13
            add       rdx, 4                                        #567.9
            cmp       rdx, rax                                      #567.9
            jb        ..B7.24       # Prob 82%                      #567.9
                                    # LOE rax rdx rcx rsi rdi r8
    ..B7.26:                        # Preds ..B7.24 ..B7.32
                                    # Execution count [1.00e+00]
    
vmovupd
Move Unaligned Packed Double-Precision Floating-Point Values
vaddpd
Add Packed Double-Precision Floating-Point Values
vmovntpd
Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
Last modified 12 months ago Last modified on Apr 9, 2018, 5:23:48 PM