Example: GCC compiler optimization report for benchmark stream
- Code fragments from the benchmark
stream
void inline tuned_STREAM_Scale(STREAM_TYPE scalar) { // L.557 #pragma omp parallel shared(scalar) // L.558 { // L.559 #ifdef __INTEL_COMPILER // L.560 // Instructs the compiler to use non-temporal (that is, streaming) stores // L.561 #pragma vector nontemporal // L.562 #endif // L.563 #pragma omp simd aligned (b, c : alignment_bytes) // L.564 for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) // L.565 b[j] = scalar*c[j]; // L.566 } // L.567 } // L.568 // L.569 void inline tuned_STREAM_Add() { // L.570 #pragma omp parallel // L.571 { // L.572 #ifdef __INTEL_COMPILER // L.573 // Instructs the compiler to use non-temporal (that is, streaming) stores // L.574 #pragma vector nontemporal // L.575 #endif // L.576 #pragma omp simd aligned (a, b, c : alignment_bytes) // L.577 for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) // L.578 c[j] = a[j] + b[j]; // L.579 } // L.580 } // L.581
- Compile the benchmark with the vectorization report enabled
# Load the compiler/gnu environment module so that gcc is on the PATH.
module add compiler/gnu
# Build stream.c with OpenMP enabled; -fopt-info-vec makes GCC print a note
# for every loop it vectorized.
gcc -std=c11 -Ofast -march=native -flto -fopenmp \
    -fopt-info-vec \
    stream.c
- Output
...
stream.c:566:28: note: loop vectorized
stream.c:579:21: note: loop vectorized
...
Last modified 12 months ago
Last modified on Apr 9, 2018, 4:13:47 PM