#include <cstdlib>
#include <err.h>
#include <iostream>

using std::cout, std::endl;

// Evaluates a CUDA runtime call and terminates the program with exit code 1
// if it did not return cudaSuccess. The diagnostic names the source location,
// the failing expression and the runtime's description of the error.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    const cudaError_t cuda_check_status_ = (call);                        \
    if (cuda_check_status_ != cudaSuccess) {                              \
      errx(1, "%s:%d: %s failed: %s", __FILE__, __LINE__, #call,          \
           cudaGetErrorString(cuda_check_status_));                       \
    }                                                                     \
  } while (0)

// Increments one element of device_a per thread.
//
// Launch layout: 1D grid of 1D blocks; thread t of block b touches element
// blockDim.x * b + t.
// Precondition: the kernel has no bounds guard, so gridDim.x * blockDim.x must
// not exceed the number of elements in device_a. The caller is responsible for
// choosing a launch configuration that satisfies this.
__global__ void inc_kernel(double *device_a) {
  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  device_a[i]++;
}

// Prints information about the CUDA platform and device 0, then runs a minimal
// host -> device -> host round trip: fills an array with 1.0 on the host,
// increments every element on the GPU and verifies that all elements are 2.0.
//
// Returns 0 on success. Exits with code 1 if an allocation or any CUDA runtime
// call fails, and with code 2 if the data copied back is not the expected
// result.
int main(int arg, char *argv[]) {
  (void)arg;
  (void)argv;

  const unsigned int size = 1 << 16;
  const unsigned int threads_per_block = 256;
  // inc_kernel is unguarded: the grid must cover the array exactly.
  static_assert(size % threads_per_block == 0,
                "size must be a multiple of threads_per_block");
  const size_t num_bytes = size * sizeof(double);

  // Platform information
  int runtime_version = 0;
  CUDA_CHECK(cudaRuntimeGetVersion(&runtime_version));
  cout << "CUDA version: " << runtime_version / 1000 << "."
       << (runtime_version % 100) / 10 << endl;

  int driver_version = 0;
  CUDA_CHECK(cudaDriverGetVersion(&driver_version));
  cout << "CUDA driver version: " << driver_version / 1000 << "."
       << (driver_version % 100) / 10 << endl;

  int num_devices = 0;
  const cudaError_t count_status = cudaGetDeviceCount(&num_devices);
  if (count_status == cudaErrorNoDevice) {
    errx(1, "No CUDA device found");
  }
  if (count_status != cudaSuccess) {
    // Report the real cause instead of claiming that no device exists.
    errx(1, "cudaGetDeviceCount failed: %s", cudaGetErrorString(count_status));
  }
  if (num_devices == 0) {
    errx(1, "No CUDA device found");
  }
  cout << "Number of CUDA devices: " << num_devices << endl;

  int device_num = 0;
  CUDA_CHECK(cudaSetDevice(device_num));
  cout << "CUDA Device number: " << device_num << endl;

  size_t memory_free = 0, memory_total = 0;
  CUDA_CHECK(cudaMemGetInfo(&memory_free, &memory_total));
  cout << "Memory on CUDA device: " << memory_total / (1024. * 1024. * 1024.)
       << " GiB" << endl;
  cout << "Free Memory on CUDA device: "
       << memory_free / (1024. * 1024. * 1024.) << " GiB" << endl;

  cudaDeviceProp device_properties;
  CUDA_CHECK(cudaGetDeviceProperties(&device_properties, device_num));
  cout << "CUDA device name: " << device_properties.name << endl;
  cout << "CUDA capability: " << device_properties.major << "."
       << device_properties.minor << endl;
  cout << "CUDA device max clock rate: "
       << device_properties.clockRate / 1000000. << " GHz" << endl;
  cout << "CUDA device max memory clock rate: "
       << device_properties.memoryClockRate / 1000000. << " GHz" << endl;
  cout << "CUDA device compute mode: " << device_properties.computeMode
       << endl;

  cout << "* Allocate memory on the host" << endl;
  double *a = static_cast<double *>(malloc(num_bytes));
  if (a == NULL) {
    errx(1, "malloc a[] failed");
  }

  cout << "* Allocate memory on the device" << endl;
  double *device_a = NULL;
  const cudaError_t alloc_status = cudaMalloc(&device_a, num_bytes);
  if (alloc_status != cudaSuccess) {
    errx(1, "cudaMalloc device_a[] failed: %s",
         cudaGetErrorString(alloc_status));
  }

  cout << "* Pre-process / initialize data on the host" << endl;
  cout << " e.g. read data from storage" << endl;
  for (unsigned int i = 0; i < size; i++) {
    a[i] = 1.;
  }

  cout << "* Copy data from the host to the device" << endl;
  CUDA_CHECK(cudaMemcpy(device_a, a, num_bytes, cudaMemcpyHostToDevice));

  cout << "* Compute on the device" << endl;
  inc_kernel<<<size / threads_per_block, threads_per_block>>>(device_a);
  // A launch does not return a status; an invalid configuration is only
  // visible through cudaGetLastError().
  CUDA_CHECK(cudaGetLastError());

  cout << "* Transfer data back from the device to the host" << endl;
  // This copy is checked, so a fault raised while the kernel was executing is
  // reported here rather than being ignored.
  CUDA_CHECK(cudaMemcpy(a, device_a, num_bytes, cudaMemcpyDeviceToHost));

  cout << "* Delete data on the device" << endl;
  CUDA_CHECK(cudaFree(device_a));

  cout << "* Post-process data on the host" << endl;
  cout << " e.g. write data to storage" << endl;
  for (unsigned int i = 0; i < size; i++) {
    // 1.0 + 1.0 is exactly representable, so an exact comparison is intended.
    if (a[i] != 2.) {
      errx(2, "Computation on GPU failed");
    }
  }

  cout << "* Free memory on the host" << endl;
  free(a);

  return 0;
}