1. 程式人生 > >【Altera SoC體驗之旅】+ 正式開啟OpenCL模式

【Altera SoC體驗之旅】+ 正式開啟OpenCL模式

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <math.h>
  4. #include "CL/opencl.h"
  5. #include "AOCL_Utils.h"
  6. using namespace aocl_utils;
  7. // OpenCL runtime configuration
  8. cl_platform_id platform = NULL;
  9. unsigned num_devices = 0;
  10. scoped_array<cl_device_id> device; // num_devices elements
  11. cl_context context = NULL;
  12. scoped_array<cl_command_queue> queue; // num_devices elements
  13. cl_program program = NULL;
  14. scoped_array<cl_kernel> kernel; // num_devices elements
  15. scoped_array<cl_mem> input_a_buf; // num_devices elements
  16. scoped_array<cl_mem> input_b_buf; // num_devices elements
  17. scoped_array<cl_mem> output_buf; // num_devices elements
  18. // Problem data.
  19. const unsigned N = 1000000; // problem size
  20. scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
  21. scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
  22. scoped_array<scoped_array<float> > ref_output; // num_devices elements
  23. scoped_array<unsigned> n_per_device; // num_devices elements
  24. // Function prototypes
  25. float rand_float();
  26. bool init_opencl();
  27. void init_problem();
  28. void run();
  29. void cleanup();
  30. // Entry point.
  31. int main() {
  32.   // Initialize OpenCL.
  33.   if(!init_opencl()) {
  34.     return -1;
  35.   }
  36.   // Initialize the problem data.
  37.   // Requires the number of devices to be known.
  38.   init_problem();
  39.   // Run the kernel.
  40.   run();
  41.   // Free the resources allocated
  42.   cleanup();
  43.   return 0;
  44. }
  45. /////// HELPER FUNCTIONS ///////
  46. // Randomly generate a floating-point number between -10 and 10.
  47. float rand_float() {
  48.   return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
  49. }
  50. // Initializes the OpenCL objects.
  51. bool init_opencl() {
  52.   cl_int status;
  53.   printf("Initializing OpenCL\n");
  54.   if(!setCwdToExeDir()) {
  55.     return false;
  56.   }
  57.   // Get the OpenCL platform.
  58.   platform = findPlatform("Altera");
  59.   if(platform == NULL) {
  60.     printf("ERROR: Unable to find Altera OpenCL platform.\n");
  61.     return false;
  62.   }
  63.   // Query the available OpenCL device.
  64.   device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
  65.   printf("Platform: %s\n", getPlatformName(platform).c_str());
  66.   printf("Using %d device(s)\n", num_devices);
  67.   for(unsigned i = 0; i < num_devices; ++i) {
  68.     printf("  %s\n", getDeviceName(device[i]).c_str());
  69.   }
  70.   // Create the context.
  71.   context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
  72.   checkError(status, "Failed to create context");
  73.   // Create the program for all device. Use the first device as the
  74.   // representative device (assuming all device are of the same type).
  75.   std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
  76.   printf("Using AOCX: %s\n", binary_file.c_str());
  77.   program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
  78.   // Build the program that was just created.
  79.   status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
  80.   checkError(status, "Failed to build program");
  81.   // Create per-device objects.
  82.   queue.reset(num_devices);
  83.   kernel.reset(num_devices);
  84.   n_per_device.reset(num_devices);
  85.   input_a_buf.reset(num_devices);
  86.   input_b_buf.reset(num_devices);
  87.   output_buf.reset(num_devices);
  88.   for(unsigned i = 0; i < num_devices; ++i) {
  89.     // Command queue.
  90.     queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
  91.     checkError(status, "Failed to create command queue");
  92.     // Kernel.
  93.     const char *kernel_name = "vectorAdd";
  94.     kernel[i] = clCreateKernel(program, kernel_name, &status);
  95.     checkError(status, "Failed to create kernel");
  96.     // Determine the number of elements processed by this device.
  97.     n_per_device[i] = N / num_devices; // number of elements handled by this device
  98.     // Spread out the remainder of the elements over the first
  99.     // N % num_devices.
  100.     if(i < (N % num_devices)) {
  101.       n_per_device[i]++;
  102.     }
  103.     // Input buffers.
  104.     input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
  105.         n_per_device[i] * sizeof(float), NULL, &status);
  106.     checkError(status, "Failed to create buffer for input A");
  107.     input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
  108.         n_per_device[i] * sizeof(float), NULL, &status);
  109.     checkError(status, "Failed to create buffer for input B");
  110.     // Output buffer.
  111.     output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
  112.         n_per_device[i] * sizeof(float), NULL, &status);
  113.     checkError(status, "Failed to create buffer for output");
  114.   }
  115.   return true;
  116. }
  117. // Initialize the data for the problem. Requires num_devices to be known.
  118. void init_problem() {
  119.   if(num_devices == 0) {
  120.     checkError(-1, "No devices");
  121.   }
  122.   input_a.reset(num_devices);
  123.   input_b.reset(num_devices);
  124.   output.reset(num_devices);
  125.   ref_output.reset(num_devices);
  126.   // Generate input vectors A and B and the reference output consisting
  127.   // of a total of N elements.
  128.   // We create separate arrays for each device so that each device has an
  129.   // aligned buffer.
  130.   for(unsigned i = 0; i < num_devices; ++i) {
  131.     input_a[i].reset(n_per_device[i]);
  132.     input_b[i].reset(n_per_device[i]);
  133.     output[i].reset(n_per_device[i]);
  134.     ref_output[i].reset(n_per_device[i]);
  135.     for(unsigned j = 0; j < n_per_device[i]; ++j) {
  136.       input_a[i][j] = rand_float();
  137.       input_b[i][j] = rand_float();
  138.       ref_output[i][j] = input_a[i][j] + input_b[i][j];
  139.     }
  140.   }
  141. }
  142. void run() {
  143.   cl_int status;
  144.   const double start_time = getCurrentTimestamp();
  145.   // Launch the problem for each device.
  146.   scoped_array<cl_event> kernel_event(num_devices);
  147.   scoped_array<cl_event> finish_event(num_devices);
  148.   for(unsigned i = 0; i < num_devices; ++i) {
  149.     // Transfer inputs to each device. Each of the host buffers supplied to
  150.     // clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
  151.     // for the host-to-device transfer.
  152.     cl_event write_event[2];
  153.     status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
  154.         0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
  155.     checkError(status, "Failed to transfer input A");
  156.     status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
  157.         0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
  158.     checkError(status, "Failed to transfer input B");
  159.     // Set kernel arguments.
  160.     unsigned argi = 0;
  161.     status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
  162.     checkError(status, "Failed to set argument %d", argi - 1);
  163.     status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
  164.     checkError(status, "Failed to set argument %d", argi - 1);
  165.     status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
  166.     checkError(status, "Failed to set argument %d", argi - 1);
  167.     // Enqueue kernel.
  168.     // Use a global work size corresponding to the number of elements to add
  169.     // for this device.
  170.     //
  171.     // We don't specify a local work size and let the runtime choose
  172.     // (it'll choose to use one work-group with the same size as the global
  173.     // work-size).
  174.     //
  175.     // Events are used to ensure that the kernel is not launched until
  176.     // the writes to the input buffers have completed.
  177.     const size_t global_work_size = n_per_device[i];
  178.     printf("Launching for device %d (%d elements)\n", i, global_work_size);
  179.     status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
  180.         &global_work_size, NULL, 2, write_event, &kernel_event[i]);
  181.     checkError(status, "Failed to launch kernel");
  182.     // Read the result. This the final operation.
  183.     status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
  184.         0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
  185.     // Release local events.
  186.     clReleaseEvent(write_event[0]);
  187.     clReleaseEvent(write_event[1]);
  188.   }
  189.   // Wait for all devices to finish.
  190.   clWaitForEvents(num_devices, finish_event);
  191.   const double end_time = getCurrentTimestamp();
  192.   // Wall-clock time taken.
  193.   printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
  194.   // Get kernel times using the OpenCL event profiling API.
  195.   for(unsigned i = 0; i < num_devices; ++i) {
  196.     cl_ulong time_ns = getStartEndTime(kernel_event[i]);
  197.     printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
  198.   }
  199.   // Release all events.
  200.   for(unsigned i = 0; i < num_devices; ++i) {
  201.     clReleaseEvent(kernel_event[i]);
  202.     clReleaseEvent(finish_event[i]);
  203.   }
  204.   // Verify results.
  205.   bool pass = true;
  206.   for(unsigned i = 0; i < num_devices && pass; ++i) {
  207.     for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
  208.       if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
  209.         printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n",
  210.             i, j, output[i][j], ref_output[i][j]);
  211.         pass = false;
  212.       }
  213.     }
  214.   }
  215.   printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
  216. }
  217. // Free the resources allocated during initialization
  218. void cleanup() {
  219.   for(unsigned i = 0; i < num_devices; ++i) {
  220.     if(kernel && kernel[i]) {
  221.       clReleaseKernel(kernel[i]);
  222.     }
  223.     if(queue && queue[i]) {
  224.       clReleaseCommandQueue(queue[i]);
  225.     }
  226.     if(input_a_buf && input_a_buf[i]) {
  227.       clReleaseMemObject(input_a_buf[i]);
  228.     }
  229.     if(input_b_buf && input_b_buf[i]) {
  230.       clReleaseMemObject(input_b_buf[i]);
  231.     }
  232.     if(output_buf && output_buf[i]) {
  233.       clReleaseMemObject(output_buf[i]);
  234.     }
  235.   }
  236.   if(program) {
  237.     clReleaseProgram(program);
  238.   }
  239.   if(context) {
  240.     clReleaseContext(context);
  241.   }
  242. }