#include <cstdlib>
#include <err.h>
#include <iostream>

using std::cout, std::endl;

// Increment each array element by one; each thread handles exactly one element.
// The kernel performs no bounds check, so the launch configuration in main()
// must cover exactly `size` elements.
__global__ void inc_kernel(double *device_a) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    device_a[i]++;
}

int main(int argc, char *argv[]) {
    const unsigned int size = 1 << 16;

    cudaError_t error_id;

    // Platform information
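    // Versions are encoded as 1000 * major + 10 * minor (e.g. 12040 -> 12.4).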
    int runtime_version = 0;
    error_id = cudaRuntimeGetVersion(&runtime_version);
    cout << "CUDA runtime version: "
         << runtime_version / 1000 << "." << (runtime_version % 100) / 10
         << endl;

    int driver_version = 0;
    error_id = cudaDriverGetVersion(&driver_version);
    cout << "CUDA driver version: "
         << driver_version / 1000 << "." << (driver_version % 100) / 10
         << endl;

    int num_devices = 0;
    error_id = cudaGetDeviceCount(&num_devices);
    if (error_id == cudaErrorNoDevice || num_devices == 0) {
        errx(1, "No CUDA device found");
    }
    cout << "Number of CUDA devices: " << num_devices << endl;

    int device_num = 0;
    error_id = cudaSetDevice(device_num);
    cout << "CUDA Device number: " << device_num << endl;

    size_t memory_free = 0, memory_total = 0;
    error_id = cudaMemGetInfo(&memory_free, &memory_total);
    cout << "Memory on CUDA device: "
         << memory_total / (1024. * 1024. * 1024.) << " GiB"
         << endl;
    cout << "Free Memory on CUDA device: "
         << memory_free / (1024. * 1024. * 1024.) << " GiB"
         << endl;

    cudaDeviceProp device_properties;
    error_id = cudaGetDeviceProperties(&device_properties, device_num);
    cout << "CUDA device name: "
         << device_properties.name
         << endl;
    cout << "CUDA capability: "
         << device_properties.major << "." << device_properties.minor
         << endl;
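    // clockRate and memoryClockRate are reported in kHz, hence the division to get GHz.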
    cout << "CUDA device max clock rate: "
         << device_properties.clockRate / 1000000. << " GHz"
         << endl;
    cout << "CUDA device max memory clock rate: "
         << device_properties.memoryClockRate / 1000000. << " GHz"
         << endl;
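    // computeMode is printed as its raw enum value (0 = cudaComputeModeDefault,
    // 3 = cudaComputeModeExclusiveProcess).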
    cout << "CUDA device compute mode: "
         << device_properties.computeMode
         << endl;

    cout << "* Allocate memory on the host" << endl;
    double *a = (double *) malloc(size * sizeof(double));
    if (a == NULL) {
        errx(1, "malloc a[] failed");
    }

    cout << "* Allocate memory on the device" << endl;
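    // cudaMalloc allocates GPU global memory; the returned pointer must not be
    // dereferenced on the host, only passed to kernels and cudaMemcpy.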
    double *device_a;
    if (cudaMalloc(&device_a, size * sizeof(double)) != cudaSuccess) {
        errx(1, "cudaMalloc device_a[] failed");
    }

    cout << "* Pre-process / initialize data on the host" << endl;
    cout << "  e.g. read data from storage" << endl;
    for (unsigned int i = 0; i < size; i++) {
        a[i] = 1.;
    }

    cout << "* Copy data from the host to the device" << endl;
    error_id = cudaMemcpy(device_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (error_id != cudaSuccess) {
        errx(1, "cudaMemcpy to device failed: %s", cudaGetErrorString(error_id));
    }

    cout << "* Compute on the device" << endl;
    // One thread per element: size (65536) is an exact multiple of the
    // 256-thread block size, so the grid covers the array exactly.
    inc_kernel<<<size / 256, 256>>>(device_a);
    error_id = cudaGetLastError();
    if (error_id != cudaSuccess) {
        errx(1, "Kernel launch failed: %s", cudaGetErrorString(error_id));
    }

    cout << "* Transfer data back from the device to the host" << endl;
    // cudaMemcpy on the default stream waits for the preceding kernel to finish
    // before copying, so no explicit cudaDeviceSynchronize() is needed here.
    error_id = cudaMemcpy(a, device_a, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (error_id != cudaSuccess) {
        errx(1, "cudaMemcpy from device failed: %s", cudaGetErrorString(error_id));
    }

    cout << "* Delete data on the device" << endl;
    error_id = cudaFree(device_a);

    cout << "* Post-process data on the host" << endl;
    cout << "  e.g. write data to storage" << endl;
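    // Each element was initialized to 1.0 on the host and incremented once on
    // the device, so every value should now be exactly 2.0.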
    for (unsigned int i = 0; i < size; i++) {
        if (a[i] != 2.) {
            errx(2, "Computation on GPU failed");
        }
    }

    cout << "* Free memory on the host" << endl;
    free(a);

    return 0;
}