GPUとCPUプログラミング：処理時間の不一致

現在、画像トラッキングに取り組んでいます。カメラを使って、Androidシステムを操作する指先を追跡しています。画像処理はOpenCLを使ってGPU上で行っており、カメラの出力を白黒のフレームに変換して白い部分を取得します。この方法での処理時間は65msです。プログラムをより滑らかに動かすことが目的なので、同じ処理をOpenCVのメソッドを使ってCPU上でも実行してみました。こちらの処理時間は115msです。問題は、OpenCV（CPU）版の方が反応が良く、プログラムが速く感じられることです。それなのに、計測上の処理時間はCPU版の方が長くなっており、その理由が理解できません。私には矛盾しているように思えます。測定は次のように行っています。

start= clock(); 
finish = clock(); 
double time =((double)finish -start)/CLOCKS_PER_SEC; 
std::cout<<"process time : "<< time<<std::endl;

以下が私のコードです：

// Frame buffers: the captured colour frame and the processed (binary) image
// for each camera. Declared static, so the same cv::Mat objects are reused.
static cv::Mat original_Right;
static cv::Mat binary_Right;
static cv::Mat original_Left;
static cv::Mat binary_Left;

// State for the per-iteration time measurement.
clock_t start;
clock_t finish;
double time = 0.0;

//--------------------------- Camera 1 ---------------------------------
// Size the right-hand buffers from the resolution reported by the capture device.
int width = static_cast<int>(this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_WIDTH));
int height = static_cast<int>(this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT));
original_Right.create(height, width, CV_8UC3);
binary_Right.create(height, width, CV_32F); // FOR GPU
//binary_Right.create(height, width, CV_8UC1); // FOR CPU

//--------------------------- Camera 2 ---------------------------------
// Same for the left-hand camera, which reports its own resolution.
int width_2 = static_cast<int>(this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_WIDTH));
int height_2 = static_cast<int>(this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT));
original_Left.create(height_2, width_2, CV_8UC3);
binary_Left.create(height_2, width_2, CV_32F); // FOR GPU
//binary_Left.create(height_2, width_2, CV_8UC1); // FOR CPU

Core::running_ = true;


//------------------------------------ SET UP THE GPU ----------------------------------------- 
cl_context    context; 
cl_context_properties properties [3]; 
cl_kernel    kernel; 
cl_command_queue  command_queue; 
cl_program    program; 
cl_int     err; 
cl_uint     num_of_platforms=0; 
cl_platform_id   platform_id; 
cl_device_id   device_id; 
cl_uint     num_of_devices=0; 
cl_mem     input, output; 

size_t     global; 

int      data_size =height*width*3; 


//load opencl source 
FILE *fp; 
char fileName[] = "./helloTedKrissV2.cl"; 
char *source_str; 

//Load the source code containing the kernel 
fp = fopen(fileName, "r"); 
if (!fp) { 
fprintf(stderr, "Failed to load kernel.\n"); 
exit(1); 
} 
source_str = (char*)malloc(MAX_SOURCE_SIZE); 
global = fread(source_str, 1, MAX_SOURCE_SIZE, fp); 
fclose(fp); 


//retreives a list of platforms available 
if(clGetPlatformIDs(1,&platform_id, &num_of_platforms)!=CL_SUCCESS){ 
    std::cout<<"unable to get a platform_id"<<std::endl; 
}; 

// to get a supported GPU device 
if(clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id, &num_of_devices)!= CL_SUCCESS){ 
    std::cout<<"unable to get a device_id"<<std::endl;  
}; 

//context properties list - must be terminated with 0 
properties[0]=CL_CONTEXT_PLATFORM; 
properties[1]=(cl_context_properties) platform_id; 
properties[2]=0; 

// create a context with the gpu device 
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err); 

//create command queue using the context and device 
command_queue = clCreateCommandQueue(context,device_id,0,&err); 

//create a program from the kernel source code 
program= clCreateProgramWithSource(context,1,(const char **) &source_str, NULL,&err); 

// compile the program 
if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){ 
    size_t length; 
    std::cout<<"Error building program"<<std::endl; 
    char buffer[4096]; 
    clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG, sizeof(buffer),buffer,&length); 
    std::cout<< buffer <<std::endl; 
} 

//specify which kernel from the program to execute 
kernel = clCreateKernel(program,"imageProcessing",&err); 




while (this->isRunning() == true) { 

    start= clock(); //--------------------- START---------------------- 

    //----------------------FRAME--------------------- 
    this->camera_Right.readFrame(original_Right); 
    if (original_Right.empty() == true) { 
     std::cerr << "[Core/Error] Original frame is empty." << std::endl; 
     break; 
    } 

    this->camera_Left.readFrame(original_Left); 
    if (original_Left.empty() == true) { 
     std::cerr << "[Core/Error] Original 2 frame is empty." << std::endl; 
     break; 
    } 
    //----------------------FRAME--------------------- 



    //------------------------------------------------IMP GPU ------------------------------------------------------ 

    input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR , sizeof(unsigned char)*data_size,NULL,NULL); 
    output =clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*data_size/3,NULL,NULL); 

    if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Right.data ,0,NULL,NULL)!= CL_SUCCESS){}; 

    //set the argument list for the kernel command 
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input); 
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output); 
    global = data_size ; 
    //enqueue the kernel command for execution 
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL); 
    clFinish(command_queue); 
    //copy the results from out of the output buffer 
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(float)*data_size/3,binary_Right.data,0,NULL,NULL)!= CL_SUCCESS){}; 

    clReleaseMemObject(input); 
    clReleaseMemObject(output); 

    //------------------------------------------------IMP GPU ------------------------------------------------------ 

    input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR , sizeof(unsigned char)*data_size,NULL,NULL); 
    output =clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*data_size/3,NULL,NULL); 

    if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Left.data ,0,NULL,NULL)!= CL_SUCCESS){}; 

    //set the argument list for the kernel command 
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input); 
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output); 
    global = data_size ; 
    //enqueue the kernel command for execution 
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL); 
    clFinish(command_queue); 
    //copy the results from out of the output buffer 
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(float)*data_size/3,binary_Left.data,0,NULL,NULL)!= CL_SUCCESS){}; 

    clReleaseMemObject(input); 
    clReleaseMemObject(output); 

    //------------------------------------------------IMP GPU ------------------------------------------------------ 

    // CPU METHOD 
    // adok::processing::doImageProcessing(original_Right, binary_Right); 
    // adok::processing::doImageProcessing(original_Left, binary_Left); 

    //-------------------------------------------------------------- TRACKING ------------------------------------------------------ 

adok::tracking::doFingerContoursTracking(binary_Right,binary_Left, this->fingerContours, this->perspective_Right,this->perspective_Left, this->distortion_Right,this->distortion_Left, this); 

    //------------------------------------------- TRACKING ----------------------------------------- 

//------------------------------SEND COORDINATES TO ANDROID BOARD-------------------- 
if (getSideRight() && !getSideLeft()) { 
     std::cout<<"RIGHT : "<<std::endl; 
     this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), RIGHT); 
    }else if (!getSideRight() && getSideLeft()){ 
     std::cout<<"LEFT : "<<std::endl; 
     this->uart_.sendAll(this->fingerContours, this->perspective_Left.getPerspectiveMatrix(), LEFT); 
    }else if (getSideRight() && getSideLeft()){ 
     std::cout<<"RIGHT & LEFT : "<<std::endl; 
     this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), this->perspective_Left.getPerspectiveMatrix()); 

    } 

this->setSideRight(0); 
this->setSideLeft(0); 

finish = clock(); 
time =(double)(finish - start)/CLOCKS_PER_SEC; 
std::cout << "Time: " << time << std::endl; // ------------END----------- 

} 
clReleaseCommandQueue(command_queue); 
clReleaseProgram(program); 
clReleaseKernel(kernel); 
clReleaseContext(context); 
this->stop();

}

もう一つ奇妙な点があります。フレームを取得するのにかかる時間が、CPU版では5msなのに対し、GPU版では15msになります。なぜ増えるのか分かりません。

Androidを搭載したXU4ボード上で作業しています。

出典

2016-10-26 A. Kriss

ありがとうございました！なぜ動作が引っかかるように感じられるのか分かりました。フレームを取得する時間が5msから15msに増えているためです。バッファの作成によって帯域幅が減少しているのかもしれません。GPUでの処理自体はCPUより高速ですが、1秒あたりのフレーム数に影響します。しかも、これを（カメラごとに）2回行っているので、それが原因です。私はOpenCLを使って作業しています。

if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Right.data ,0,NULL,NULL)!= CL_SUCCESS){}; 
    //set the argument list for the kernel command 
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input); 
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output); 
    global = data_size ; 
    //enqueue the kernel command for execution 
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL); 
    clFinish(command_queue); 
    //copy the results from out of the output buffer 
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(unsigned char)*data_size,original_Right.data,0,NULL,NULL)!= CL_SUCCESS){};

出典

2016-10-28 07:56:18

GPUの計算では、計算にはCPUよりも時間がかかることがあります。 GPUの計算のために、メインプロセスはGPUメモリにデータを送り、数学的計算の後にGPUはCPUにデータを送り返します。したがって、データ転送とCPUに戻って受信するには時間がかかります。計算されたバッファサイズが大きく転送時間が長い場合は、GPU計算に時間がかかることがあります。 CUDNNライブラリと一緒にGPUプロセッサを使用すると、何倍も高速になります。したがって、プログラムがCUDNNを使用していない場合、速度が遅くなる可能性があります。

出典

2016-10-26 10:32:10

OpenCLを使っているので、CUDNNライブラリは使用できません。それに、これは計測された時間がなぜこうなるのかの説明にはなっていません。 –

フレームの寸法は？ –

これを参照してくださいhttp://opencv-users.1802565.n2.nabble.com/Poor-OpenCL-performance-td7584466.html –

イベントを使用して、データの書き込みにかかる時間と処理に要する時間を確認できます。
一般的に、clFinishを使うのは良い考えではありません。カーネルのEnqueueコマンドからイベントを取得し、それをデータ読み出し（Read Buffer）の待機イベントとして渡せば、処理が終わり次第すぐに読み出しが行われます。もう一つの問題として、データサイズが同じである限り、バッファオブジェクトを毎回作り直す必要はありません。一度作成して再利用してください。

出典

2016-10-26 21:53:14

GPUとCPUプログラミング：処理時間の不一致

答えて

関連する問題