#include <err.h>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <execution>
#include <iostream>
#include <ranges>

using std::cout, std::endl;

/**
 * Demo of the C++ standard parallel algorithms ("stdpar") workflow.
 *
 * Allocates an array of doubles on the host, fills it with 1.0, increments
 * every element twice with std::for_each_n under the par_unseq execution
 * policy (once through element references, once through indices), and then
 * verifies on the host that every element equals 3.0.
 *
 * Whether the parallel algorithms actually run on a GPU depends on the
 * compiler and its flags (e.g. an offloading stdpar toolchain); the code
 * itself is plain ISO C++ and also runs on the CPU.
 *
 * Exit status: 0 on success, 1 if the host allocation fails, 2 if the
 * verification finds an unexpected value (both reported through errx).
 *
 * The command-line arguments are not used.
 */
int main([[maybe_unused]] int arg, [[maybe_unused]] char *argv[]) {
    // Number of elements; computed in std::size_t rather than int.
    const std::size_t size = std::size_t{1} << 16;

    // std::endl is kept on purpose throughout: flushing each progress line
    // keeps stdout ordered relative to the errx() messages on stderr.
    cout << "* Allocate memory on the host" << endl;
    double *a = static_cast<double *>(std::malloc(size * sizeof(double)));
    if (a == nullptr) {
        errx(1, "malloc a[] failed");
    }

    cout << "* Pre-process / initialize data on the host" << endl;
    cout << "  e.g. read data from storage" << endl;
    for (std::size_t i = 0; i < size; i++) {
        a[i] = 1.;
    }

    cout << "* Automatically allocate memory on the device" << endl;
    cout << "* Automatically copy data from the host to the device" << endl;
    cout << "* Compute on the device" << endl;

    // Without access to vector index
    std::for_each_n(
        std::execution::par_unseq, // parallel, unsequenced order
        a, size,
        // kernel expressed as lambda expression
        [](double &a_i) {
            a_i++;
        });

    // With access to vector index
    std::for_each_n(
        std::execution::par_unseq, // parallel, unsequenced order
        std::views::iota(0).begin(), size,
        // kernel expressed as lambda expression
        // The pointer `a` is captured BY VALUE: a by-reference capture would
        // make the kernel dereference a reference to a variable on the host
        // stack, which offloading stdpar implementations document as not
        // accessible from device code. Copying the pointer is equally
        // correct when the algorithm runs on the CPU.
        [a](int i) {
            a[i]++;
        });

    cout << "* Automatically transfer data back from the device to the host" << endl;
    cout << "* Automatically delete data on the device" << endl;

    cout << "* Post-process data on the host" << endl;
    cout << "  e.g. write data to storage" << endl;
    // Each element started at 1.0 and was incremented twice, so the expected
    // value is exactly 3.0 (exactly representable, hence the == comparison).
    // The index is std::size_t to match the type of `size`.
    for (std::size_t i = 0; i < size; i++) {
        if (a[i] != 3.) {
            cout << "a[" << i << "] = " << a[i] << endl;
            errx(2, "Computation on GPU failed");
        }
    }

    cout << "* Free memory on the host" << endl;
    std::free(a);
}