wiki:performance/compiler_optionen/intel/example_vec_report_stream

Example: Intel compiler optimization report for the STREAM benchmark

  • Code fragments from the STREAM benchmark (stream.c); the trailing "// L.n" comments give the line numbers in stream.c
    // Scale kernel: stores b[j] = scalar*c[j] for every j in [0, STREAM_ARRAY_SIZE_thread).
    // The trailing "// L.n" comments give each line's number in stream.c; the optimization
    // report further down refers to these positions (e.g. the loop at stream.c(565,9)).
    // The aligned clause names b and c together with alignment_bytes, and the Intel-only
    // pragma requests streaming stores for the loop that follows.
    // NOTE(review): the parallel region contains no "omp for", so every thread runs the
    // whole loop; b and c are presumably per-thread arrays -- confirm in stream.c.
    void inline tuned_STREAM_Scale(STREAM_TYPE scalar) {                                   // L.557
        #pragma omp parallel shared(scalar)                                                // L.558
        {                                                                                  // L.559
            #ifdef __INTEL_COMPILER                                                        // L.560
                // Instructs the compiler to use non-temporal (that is, streaming) stores  // L.561
                #pragma vector nontemporal                                                 // L.562
            #endif                                                                         // L.563
            #pragma omp simd aligned (b, c : alignment_bytes)                              // L.564
            for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)                        // L.565
                b[j] = scalar*c[j];                                                        // L.566
        }                                                                                  // L.567
    }                                                                                      // L.568
                                                                                           // L.569
    // Add kernel: stores c[j] = a[j] + b[j] for every j in [0, STREAM_ARRAY_SIZE_thread).
    // The trailing "// L.n" comments give each line's number in stream.c; the optimization
    // report further down refers to these positions (e.g. the loop at stream.c(578,9)).
    // The aligned clause names a, b and c together with alignment_bytes, and the Intel-only
    // pragma requests streaming stores for the loop that follows.
    // NOTE(review): the parallel region contains no "omp for", so every thread runs the
    // whole loop; a, b and c are presumably per-thread arrays -- confirm in stream.c.
    void inline tuned_STREAM_Add() {                                                       // L.570
        #pragma omp parallel                                                               // L.571
        {                                                                                  // L.572
            #ifdef __INTEL_COMPILER                                                        // L.573
                // Instructs the compiler to use non-temporal (that is, streaming) stores  // L.574
                #pragma vector nontemporal                                                 // L.575
            #endif                                                                         // L.576
            #pragma omp simd aligned (a, b, c : alignment_bytes)                           // L.577
            for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)                        // L.578
                c[j] = a[j] + b[j];                                                        // L.579
        }                                                                                  // L.580
    }                                                                                      // L.581
    
  • Compile the benchmark with the optimization report enabled
    # Load the Intel compiler 18.0 environment module.
    module add compiler/intel/18.0
    # Build stream.c with OpenMP enabled. The three -qopt-report* options request the
    # optimization report at detail level 5, restricted to the vectorization phase (vec),
    # and printed to stdout.
    icc -std=c11 -Ofast -xHost -ipo -qopenmp \
        -qopt-report=5 \
        -qopt-report-phase=vec \
        -qopt-report-stdout \
        stream.c
    
  • Output
    ...
    LOOP BEGIN at stream.c(565,9) inlined into stream.c(362,5)
       remark #15388: vectorization support: reference *b[j] has aligned access   [ stream.c(566,13) ]
       remark #15388: vectorization support: reference *c[j] has aligned access   [ stream.c(566,27) ]
       remark #15412: vectorization support: streaming store was generated for b   [ stream.c(566,13) ]
       remark #15305: vectorization support: vector length 4
       remark #15309: vectorization support: normalized vectorization overhead 0.600
       remark #15301: OpenMP SIMD LOOP WAS VECTORIZED
       remark #15448: unmasked aligned unit stride loads: 1 
       remark #15449: unmasked aligned unit stride stores: 1 
       remark #15467: unmasked aligned streaming stores: 1 
       remark #15475: --- begin vector cost summary ---
       remark #15476: scalar cost: 7 
       remark #15477: vector cost: 1.250 
       remark #15478: estimated potential speedup: 5.550 
       remark #15488: --- end vector cost summary ---
    LOOP END
    
    LOOP BEGIN at stream.c(565,9) inlined into stream.c(362,5)
    <Remainder loop for vectorization>
    LOOP END
    
    ...
    LOOP BEGIN at stream.c(578,9) inlined into stream.c(363,5)
       remark #15388: vectorization support: reference *c[j] has aligned access   [ stream.c(579,13) ]
       remark #15388: vectorization support: reference *a[j] has aligned access   [ stream.c(579,20) ]
       remark #15388: vectorization support: reference *b[j] has aligned access   [ stream.c(579,27) ]
       remark #15412: vectorization support: streaming store was generated for c   [ stream.c(579,13) ]
       remark #15305: vectorization support: vector length 4
       remark #15301: OpenMP SIMD LOOP WAS VECTORIZED
       remark #15448: unmasked aligned unit stride loads: 2 
       remark #15449: unmasked aligned unit stride stores: 1 
       remark #15467: unmasked aligned streaming stores: 1 
       remark #15475: --- begin vector cost summary ---
       remark #15476: scalar cost: 8 
       remark #15477: vector cost: 1.250 
       remark #15478: estimated potential speedup: 6.400 
       remark #15488: --- end vector cost summary ---
    LOOP END
    
    LOOP BEGIN at stream.c(578,9) inlined into stream.c(363,5)
    <Remainder loop for vectorization>
    LOOP END
    
    • Report on data alignment (remark #15388)
    • Report on loads, stores and streaming stores (remarks #15448, #15449, #15467, #15412)
    • Report on successful vectorization (remark #15301)
Last modified on Apr 1, 2019, 1:40:53 PM