【Altera SoC體驗之旅】+ 正式開啟OpenCL模式

阿新 • • 發佈：2018-12-10

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "CL/opencl.h"
#include "AOCL_Utils.h"
using namespace aocl_utils;
// OpenCL runtime configuration
// All of the handles below are file-scope state: init_opencl() fills them in,
// run() uses them, and cleanup() releases them.

// Platform handle; assigned from findPlatform("Altera") in init_opencl().
cl_platform_id platform = NULL;
// Number of devices; written by getDevices() in init_opencl().
unsigned num_devices = 0;
scoped_array<cl_device_id> device; // num_devices elements

// Single context created over all entries of `device`.
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
// Program created from the "vectorAdd" board binary.
cl_program program = NULL;
scoped_array<cl_kernel> kernel; // num_devices elements
// Device-side buffers for the two inputs, one pair per device.
scoped_array<cl_mem> input_a_buf; // num_devices elements
scoped_array<cl_mem> input_b_buf; // num_devices elements

// Device-side buffer that receives the result, one per device.
scoped_array<cl_mem> output_buf; // num_devices elements
// Problem data.
// N is the total element count; it is split across devices in init_opencl().
const unsigned N = 1000000; // problem size
// Host-side copies of the inputs and of the result read back from the device.
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
// Host-computed expected result (input_a + input_b) that run() verifies against.
scoped_array<scoped_array<float> > ref_output; // num_devices elements

// Element count assigned to each device; the entries sum to N.
scoped_array<unsigned> n_per_device; // num_devices elements
// Function prototypes
// Returns a pseudo-random float between -10 and 10.
float rand_float();
// Creates all OpenCL objects; returns false if setup cannot proceed.
bool init_opencl();
// Allocates and fills the host-side input and reference arrays.
void init_problem();
// Runs the kernel on every device, prints timings, and verifies the output.
void run();
// Releases the OpenCL objects created by init_opencl().
void cleanup();
// Program entry point.
//
// Sets up OpenCL, builds the host data, runs the kernel, and releases the
// OpenCL resources. Returns -1 when OpenCL initialization fails and 0
// otherwise.
int main() {
  // Nothing else can proceed without a working OpenCL setup.
  const bool opencl_ready = init_opencl();
  if(!opencl_ready) {
    return -1;
  }

  // The host data is sized per device, so this must follow init_opencl().
  init_problem();

  // Execute the kernel and check its output.
  run();

  // Release everything that init_opencl() created.
  cleanup();

  return 0;
}
/////// HELPER FUNCTIONS ///////
// Returns a pseudo-random float between -10 and 10, drawn from rand().
float rand_float() {
  // Scale rand() into [0, 1], then stretch and shift it onto [-10, 10].
  const float unit = float(rand()) / float(RAND_MAX);
  return unit * 20.0f - 10.0f;
}
// Initializes the OpenCL objects.
//
// Finds the "Altera" platform, queries its devices, creates one context over
// all of them, creates and builds the "vectorAdd" program from the board
// binary, and then creates per-device objects: a profiling-enabled command
// queue, a kernel, and the two input buffers and one output buffer sized for
// that device's share of the N elements.
//
// Returns false if the working directory cannot be set, the platform is not
// found, or no device is reported; returns true otherwise. Failing OpenCL
// calls are reported through checkError().
bool init_opencl() {
  cl_int status;

  printf("Initializing OpenCL\n");

  if(!setCwdToExeDir()) {
    return false;
  }

  // Get the OpenCL platform.
  platform = findPlatform("Altera");
  if(platform == NULL) {
    printf("ERROR: Unable to find Altera OpenCL platform.\n");
    return false;
  }

  // Query the available OpenCL devices.
  device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
  printf("Platform: %s\n", getPlatformName(platform).c_str());
  // num_devices is unsigned, so it is printed with %u rather than %d.
  printf("Using %u device(s)\n", num_devices);
  for(unsigned i = 0; i < num_devices; ++i) {
    printf(" %s\n", getDeviceName(device[i]).c_str());
  }

  // device[0] is read below, so stop here if nothing was reported.
  if(num_devices == 0) {
    printf("ERROR: No OpenCL devices found.\n");
    return false;
  }

  // Create the context.
  context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
  checkError(status, "Failed to create context");

  // Create the program for all devices. Use the first device as the
  // representative device (assuming all devices are of the same type).
  std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
  printf("Using AOCX: %s\n", binary_file.c_str());
  program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);

  // Build the program that was just created.
  status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
  checkError(status, "Failed to build program");

  // Create per-device objects.
  queue.reset(num_devices);
  kernel.reset(num_devices);
  n_per_device.reset(num_devices);
  input_a_buf.reset(num_devices);
  input_b_buf.reset(num_devices);
  output_buf.reset(num_devices);

  for(unsigned i = 0; i < num_devices; ++i) {
    // Command queue. Profiling is enabled so run() can report kernel times.
    queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
    checkError(status, "Failed to create command queue");

    // Kernel.
    const char *kernel_name = "vectorAdd";
    kernel[i] = clCreateKernel(program, kernel_name, &status);
    checkError(status, "Failed to create kernel");

    // Determine the number of elements processed by this device.
    n_per_device[i] = N / num_devices;

    // Spread the remainder over the first N % num_devices devices so that
    // the per-device counts add up to exactly N.
    if(i < (N % num_devices)) {
      n_per_device[i]++;
    }

    // All three buffers for this device have the same size in bytes.
    const size_t buf_bytes = n_per_device[i] * sizeof(float);

    // Input buffers.
    input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        buf_bytes, NULL, &status);
    checkError(status, "Failed to create buffer for input A");

    input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        buf_bytes, NULL, &status);
    checkError(status, "Failed to create buffer for input B");

    // Output buffer.
    output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        buf_bytes, NULL, &status);
    checkError(status, "Failed to create buffer for output");
  }

  return true;
}
// Builds the host-side problem data. Requires num_devices and n_per_device to
// have been set already.
//
// Every device gets its own input A, input B, output and reference arrays of
// n_per_device[d] elements. The inputs are filled from rand_float() and the
// reference holds their element-wise sum. Reports an error through
// checkError() when there are no devices.
void init_problem() {
  if(num_devices == 0) {
    checkError(-1, "No devices");
  }

  input_a.reset(num_devices);
  input_b.reset(num_devices);
  output.reset(num_devices);
  ref_output.reset(num_devices);

  // Separate arrays are created per device so that each device has its own
  // aligned buffer.
  for(unsigned d = 0; d < num_devices; ++d) {
    const unsigned count = n_per_device[d];

    input_a[d].reset(count);
    input_b[d].reset(count);
    output[d].reset(count);
    ref_output[d].reset(count);

    for(unsigned k = 0; k < count; ++k) {
      // A is drawn before B for every element, keeping the rand() sequence
      // in the same order.
      input_a[d][k] = rand_float();
      input_b[d][k] = rand_float();
      ref_output[d][k] = input_a[d][k] + input_b[d][k];
    }
  }
}
// Executes the vector addition on every device, reports timing, and verifies
// the device results against the host-computed reference.
//
// For each device i this enqueues on queue[i]: two non-blocking writes of the
// host inputs, the kernel (gated on both writes), and a non-blocking read of
// the output (gated on the kernel). It then waits for every read to finish,
// prints the wall-clock time and the per-kernel time obtained through
// getStartEndTime(), and compares output against ref_output with an absolute
// tolerance of 1.0e-5, stopping at the first mismatch.
//
// Every OpenCL call that returns a status is checked through checkError().
void run() {
  cl_int status;

  const double start_time = getCurrentTimestamp();

  // Launch the problem for each device.
  scoped_array<cl_event> kernel_event(num_devices);
  scoped_array<cl_event> finish_event(num_devices);

  for(unsigned i = 0; i < num_devices; ++i) {
    // Transfer inputs to each device. Each of the host buffers supplied to
    // clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
    // for the host-to-device transfer.
    cl_event write_event[2];
    status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
        0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
    checkError(status, "Failed to transfer input A");

    status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
        0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
    checkError(status, "Failed to transfer input B");

    // Set kernel arguments.
    unsigned argi = 0;

    status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
    checkError(status, "Failed to set argument %d", argi - 1);

    status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
    checkError(status, "Failed to set argument %d", argi - 1);

    status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
    checkError(status, "Failed to set argument %d", argi - 1);

    // Enqueue kernel.
    // Use a global work size corresponding to the number of elements to add
    // for this device.
    //
    // No local work size is specified, so the runtime chooses one.
    //
    // Events are used to ensure that the kernel is not launched until
    // the writes to the input buffers have completed.
    const size_t global_work_size = n_per_device[i];
    // Print the unsigned count itself with %u: handing the size_t to %d is a
    // format/argument mismatch where size_t is wider than int.
    printf("Launching for device %u (%u elements)\n", i, n_per_device[i]);

    status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
        &global_work_size, NULL, 2, write_event, &kernel_event[i]);
    checkError(status, "Failed to launch kernel");

    // Read the result. This is the final operation.
    status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
        0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
    // Without this check a failed read would leave finish_event[i] unset and
    // the verification below would compare against stale host memory.
    checkError(status, "Failed to read output");

    // Release local events.
    clReleaseEvent(write_event[0]);
    clReleaseEvent(write_event[1]);
  }

  // Wait for all devices to finish.
  status = clWaitForEvents(num_devices, finish_event);
  checkError(status, "Failed to wait for device completion");

  const double end_time = getCurrentTimestamp();

  // Wall-clock time taken.
  printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);

  // Get kernel times using the OpenCL event profiling API.
  for(unsigned i = 0; i < num_devices; ++i) {
    cl_ulong time_ns = getStartEndTime(kernel_event[i]);
    printf("Kernel time (device %u): %0.3f ms\n", i, double(time_ns) * 1e-6);
  }

  // Release all events.
  for(unsigned i = 0; i < num_devices; ++i) {
    clReleaseEvent(kernel_event[i]);
    clReleaseEvent(finish_event[i]);
  }

  // Verify results. Both loops stop as soon as the first mismatch is found.
  bool pass = true;
  for(unsigned i = 0; i < num_devices && pass; ++i) {
    for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
      if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
        printf("Failed verification @ device %u, index %u\nOutput: %f\nReference: %f\n",
            i, j, output[i][j], ref_output[i][j]);
        pass = false;
      }
    }
  }

  printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
}
// Releases the OpenCL objects created during initialization.
//
// For every device the kernel, command queue, both input buffers and the
// output buffer are released, in that order; the program and the context are
// released afterwards. Each array and each handle is tested before use, so
// objects that were never created are skipped.
void cleanup() {
  for(unsigned dev = 0; dev < num_devices; ++dev) {
    if(kernel) {
      if(kernel[dev]) {
        clReleaseKernel(kernel[dev]);
      }
    }

    if(queue) {
      if(queue[dev]) {
        clReleaseCommandQueue(queue[dev]);
      }
    }

    if(input_a_buf) {
      if(input_a_buf[dev]) {
        clReleaseMemObject(input_a_buf[dev]);
      }
    }

    if(input_b_buf) {
      if(input_b_buf[dev]) {
        clReleaseMemObject(input_b_buf[dev]);
      }
    }

    if(output_buf) {
      if(output_buf[dev]) {
        clReleaseMemObject(output_buf[dev]);
      }
    }
  }

  // Objects shared by all devices go last.
  if(program) {
    clReleaseProgram(program);
  }

  if(context) {
    clReleaseContext(context);
  }
}

【Altera SoC體驗之旅】+ 正式開啟OpenCL模式

#include <stdio.h> #include <stdlib.h> #include <math.h> #include "CL/opencl.h" #include "AOCL_Utils.h" using namespace aocl_utils

【CTF刷題之旅】XCTF嘉年華體驗賽逆向題re2的writeup

這道題採用動靜結合的方法嘗試了一下。動態除錯的時候想加快進度、跳過 sleep()，遇到那兩個 jle 跳轉時直接修改 SF 標誌位為 0 來改變執行流程，可以看到左側箭頭會變成虛線。對了，第三個 jle 在s…

【CTF刷題之旅】XCTF嘉年華體驗賽逆向題re1的最詳細writeup

看了xctf訓練平臺發現了這道題可以用兩種方法最簡單的就是用angr跑一下過程不再列舉(我試過了可以成功) 具體方法可以看安裝使用Angr符號執行來求解CTF逆向題還有就是用指令碼跑一下載入IDA x32(雖然後綴是.p

【CTF刷題之旅】XCTF嘉年華體驗賽逆向題re1的writeup

看了 xctf 訓練平臺，發現這道題可以用兩種方法：最簡單的就是用 angr 跑一下，過程不再列舉（我試過了，可以成功）；還有就是用指令碼跑一下。載入 IDA x32（雖然後綴是 .ppp，但是一猜就知道是 ELF；不知道位數就先用 32 位 IDA 試一下唄）。

【輕松前端之旅】HTML的塊元素、行內元素和空元素

mod charts 內聯元素學習編程 https -s 網址 tip htm 塊(block)元素顯示成一塊，前後有換行。塊元素常用於web頁面的主要構造模塊。例如:<div>，<p>，<h1>~<h6>,<bl

【輕松前端之旅】CSS盒子模型

webp 技術分享 activity 屬性概念 type title border eight 盒子模型，也叫框模型，在CSS裏是很重要的概念。每個元素都可以看做一個盒子。盒子包含四個部分：外邊距(margin)、邊框(border)、內邊距(padding)

【jarvisoj刷題之旅】pwn題目Tell Me Something的writeup

題目資訊： file一下發現是64位的ELF checksec檢查下安全性 objdump -t 檔名可以檢視符號表 [email protected]:~/Desktop/jarvisOJ$ objdump -

【reversing.kr逆向之旅】ImagePrc的writeup

看到這道題有點懵執行後只有一個Check按鈕中間是空白的不知道什麼鬼(最後才知道是個繪圖板) PEiD載入發現還是沒殼 Vc++6.0編寫的程式 IDA載入進行分析首先還是Shif

【reversing.kr逆向之旅】Music Player的writeup

VB寫的程式且沒有加殼執行看看還真是一個音樂播放器但是隻能播放一分鐘下面三個按鈕據我分析分別就是播放，暫停，重新開始播放當滑動到1分鐘時就會停止並彈窗1？ ?????

【reversing.kr逆向之旅】Replace的writeup

無殼的 VC++ 程式，載入 OllyDbg 動態除錯分析就好。可以通過下 API 斷點 GetWindowTextW 來找到關鍵程式碼，因為它就是用來捕獲我們的輸入的。隨意輸入（這裡可以發現只可以輸入數字，長度什麼的倒是沒有限制），就像下面一樣…

【reversing.kr逆向之旅】Easy ELF的writeup

這道題直接IDA Pro靜態分析就可以 shift+f12就可以找到關鍵字串圖形檢視下也可以看清楚反彙編main() int __cdecl main() { write(1, "Reversing.Kr Easy ELF\n\

【jarvisoj刷題之旅】逆向題目DDCTF

Android Normal 下載後輸入解壓密碼進行解壓得到Readme.txt與DDCTF-Normal.apk 將apk載入模擬器執行（順便吐槽下藍疊咋不能豎屏。。。）輸入123456789 出現Wrong 載入jeb 反編譯成ja

【reversing.kr逆向之旅】Easy Unpack的writeup

查殼工具貌似不好使啊不過OD和IDA都提示還有題目名字明顯告訴我們加了殼下載的壓縮包解壓後可以看到ReadMe.txt 提示我們只要找到像00401000這樣的OEP即可

【reversing.kr逆向之旅】Easy Keygen的writeup

先看ReadMe.txt 意思是讓我們找到當序列號是5B134977135E7D13時對應的名字查殼無殼 vc++程式執行發現當我們Name與Serial不對應時程式直接退出檢視關鍵字串雙擊字串再雙擊引用 F5 然後進行分析

【reversing.kr逆向之旅】Direct3D FPS的writeup

Direct3D FPS 看到這個名字就感覺會不會是個射擊遊戲然後執行果然是玩了一會兒看到有個幾個小胖人打不死走過去就Game Over了 PEiD看到使用C++寫的直接載入IDA 分析字串很明顯與Game Clear！就是與失敗相對

【reversing.kr逆向之旅】Ransomware的writeup

Exeinfope查到有UPX殼使用脫殼機進行脫殼載入IDA 發現直接顯示太大無法展示空格轉為文字檢視可以很明顯知道下面紅框中的就是一段段花指令檢視最後結束的位置就在0x0044A775 直接IDC指令碼將他們都NOP掉

【reversing.kr逆向之旅】Position的writeup

有提示是說flag就是當Serial為76876-77776時的Name 有多解提示有四位且最後一位是p ReversingKr KeygenMe Find the Name when the Serial is 76876-77776 This pr

【Kylin 踩坑之旅】kylin sum() avg() 無法返回預期的結果

在使用kylin 的時候遇到了avg()函式無法求出預期值的情況，通過檢查發現sum()函式也無法得出預期值，所以通過查詢多方資料找到了問題所在 sum() 函式與avg() 函式無法返回正確的結果之前使用select語句求和的時候遇到了

【Linux探索之旅】第一部分第三課：測試並安裝Ubuntu

u盤 nco 過程 sans ubunt windows u盤啟動盤系統 .com 內容簡單介紹 1、第一部分第三課：測試並安裝Ubuntu 2、第一部分第四課預告：磁盤分區測試並安裝Ubuntu 大家好，經過前兩個比較偏理論（是否

【SSH之旅】一步步學習Hibernate框架（一）：關於持久化

stc localhost 對象 schema hbm.xml java let pass [] 在不引用不論什麽框架下，我們會通過平庸的代碼不停的對數據庫進行操作，產生了非常多冗余的可是又有規律的底層代碼，這樣頻繁的操作數據庫和大量的底層代碼的反復

【Altera SoC體驗之旅】+ 正式開啟OpenCL模式

相關推薦