//драйвер NVIDIA (CUDA 8.0) пока не поддерживает OpenCL2.0, но выдает warning-и для OpenCL1.2 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #include #include #include #include #include #define checkError(func) \ if (errcode != CL_SUCCESS)\ {\ printf("Error in " #func "\nError code = %d\n", errcode);\ exit(1);\ } #define checkErrorEx(command) \ command; \ checkError(command); int main() { int device_index = 0, platform_index = 0; cl_int errcode; int N = 10*1000*1000; float *host_a, *host_b, *host_c, *host_c_check; host_a = malloc(N*sizeof(float)); host_b = malloc(N*sizeof(float)); host_c = malloc(N*sizeof(float)); host_c_check = malloc(N*sizeof(float)); for (int i = 0; i < N; i++) { host_a[i] = i; host_b[i] = 2*i; } clock_t startCPU = clock(); //#pragma omp parallel for for (int i = 0; i < N; i++) host_c_check[i] = host_a[i] + host_b[i]; double elapsedTimeCPU = (double)(clock()-startCPU)/CLOCKS_PER_SEC; //код kernel-функции char* source = "\n\ __kernel void sum(__global float *a, __global float *b, __global float *c, int N)\n\ {\n\ int id = get_global_id(0);\n\ int threadsNum = get_global_size(0);\n\ for (int i = id; i < N; i += threadsNum)\n\ c[i] = a[i]+b[i];\n\ }"; //получаем список доступных OpenCL-платформ (драйверов OpenCL) cl_platform_id platform[10]; //массив в который будут записываться идентификаторы платформ cl_uint num_platforms; errcode = clGetPlatformIDs(10, platform, &num_platforms); checkError(clGetPlatformIDs); printf("OpenCL platforms found: %d\n", num_platforms); //в полученном списке платформ находим устройство GPU (видеокарту) cl_device_id devices[10]; cl_uint num_devices; clGetDeviceIDs(platform[platform_index], CL_DEVICE_TYPE_GPU, 10, devices, &num_devices); printf("GPGPU devices found: %d\n", num_devices); if (num_devices == 0) { printf("Warning: YOU DON'T HAVE GPGPU. Then CPU will be used instead.\n"); errcode = clGetDeviceIDs( platform[0], CL_DEVICE_TYPE_CPU, 10, devices, &num_devices); checkError(clGetDeviceIDs); printf("CPU devices found: %d\n", num_devices); if (num_devices == 0) {printf("Error: CPU devices not found\n"); exit(-1);} } size_t valueSize; clGetDeviceInfo(devices[device_index], CL_DEVICE_NAME, 0, NULL, &valueSize); char* value = (char*) malloc(valueSize); clGetDeviceInfo(devices[device_index], CL_DEVICE_NAME, valueSize, value, NULL); printf("Use device #%d: %s\n", device_index, value); free(value); //создаем контекст на видеокарте cl_context context; context = clCreateContext(NULL, 1, &devices[device_index], NULL, NULL, &errcode); checkError(clCreateContext); //создаем очередь задач для контекста cl_command_queue queue = clCreateCommandQueue (context, devices[device_index], CL_QUEUE_PROFILING_ENABLE, &errcode); // третий параметр - свойства checkError(clCreateCommandQueue); //создаем обьект-программу с заданным текстом программы cl_program program = clCreateProgramWithSource(context, 1, (const char **) &source, NULL, &errcode); checkError(clCreateProgramWithSource); //компилируем и линкуем программу для видеокарты errcode = clBuildProgram(program, 1, &devices[device_index], "-cl-fast-relaxed-math -cl-no-signed-zeros -cl-mad-enable", NULL, NULL); if (errcode != CL_SUCCESS) { size_t len; char *build_log; printf("There were error during build kernel code. Please, check program code. Errcode = %d\n", errcode); clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, 0, NULL, &len); build_log = malloc(len+1); clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, len, build_log, NULL); build_log[len] = '\0'; printf("BUILD LOG: %s\n", build_log); return 1; } //создаем объект - точку входа GPU-программы cl_kernel kernel = clCreateKernel(program, "sum", &errcode); checkError(clCreateKernel); //создаем буфферы в видеопамяти cl_mem dev_a, dev_b, dev_c; checkErrorEx(dev_a = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N*sizeof(float), host_a, &errcode )); checkErrorEx(dev_b = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N*sizeof(float), host_b, &errcode )); checkErrorEx(dev_c = clCreateBuffer( context, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &errcode )); //инициализируем аргументы функции checkErrorEx(errcode = clSetKernelArg(kernel, 0, sizeof(dev_a), &dev_a);); checkErrorEx(errcode = clSetKernelArg(kernel, 1, sizeof(dev_a), &dev_b);); checkErrorEx(errcode = clSetKernelArg(kernel, 2, sizeof(dev_a), &dev_c);); checkErrorEx(errcode = clSetKernelArg(kernel, 3, sizeof(N), &N);); size_t globalSize = 12*1024; // ставим задачу в очередь. // 3й аргумент - размерность пространства рабочих, // 6й аргумент - размер work-group // пследние 3 аргумента для событий clock_t t0 = clock(); cl_event event; errcode = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, &event); checkError(clEnqueueNDRangeKernel); checkErrorEx(errcode = clWaitForEvents(1, &event)); clock_t t1 = clock(); cl_ulong time_start, time_end; checkErrorEx(errcode = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); checkErrorEx(errcode = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsedTimeGPU; if (errcode == CL_PROFILING_INFO_NOT_AVAILABLE) elapsedTimeGPU = (double)(t1-t0)/CLOCKS_PER_SEC; else elapsedTimeGPU = (double)(time_end - time_start)/1e9; clReleaseEvent(event); checkErrorEx(errcode = clEnqueueReadBuffer( queue, dev_c, CL_TRUE, 0, N*sizeof(float), host_c, 0, NULL, NULL)); checkErrorEx(errcode = clReleaseMemObject(dev_a)); checkErrorEx(errcode = clReleaseMemObject(dev_b)); checkErrorEx(errcode = clReleaseMemObject(dev_c)); // check for (int i = 0; i < N; i++) if (abs(host_c[i] - host_c_check[i]) > 1e-6) { printf("Error in element N %d: c[i] = %g c_check[i] = %g\n",i,host_c[i], host_c_check[i]); exit(1); } printf( "CPU sum time = %g ms\n", elapsedTimeCPU*1000); printf( "CPU memory throughput = %g Gb/s\n", 3*N*sizeof(float)/elapsedTimeCPU/1024/1024/1024 ); printf( "GPU sum time = %g ms\n", elapsedTimeGPU*1000); printf( "GPU memory throughput = %g Gb/s\n", 3*N*sizeof(float)/elapsedTimeGPU/1024/1024/1024 ); return 0; }