CUDAで非常に大きな配列を使用すると結果としてゴミが返されるが、エラーは報告されない

私は、サイズnのデバイス配列とホスト配列を作成し、n個のスレッドを生成するカーネルを起動して、デバイス配列の各位置に定数値0.95fを代入するテストプログラムを作成しています。完了後、デバイス配列がホスト配列にコピーされ、すべての要素が合計されて、最終的な合計が表示されます。

以下のプログラムは、floatで約6,000万要素までの配列サイズではうまく動作し、正しい結果を非常に速く返します。しかし、7,000万要素に達すると、プログラムはしばらくハングしたようになり、最終的に合計としてNANを返します。6,000万要素での実行後にホスト配列を調べると0.95fが正しく格納されていますが、7,000万要素での実行後に調べるとNANで埋まっています。私の知る限り、どのCUDA呼び出しもエラーを返していません。

私は2GBのGT640m（Compute Capability 3.0）を使用しており、最大ブロックサイズは1024、最大グリッド次元は2147483647です。

同様のことを達成するためのより良い方法があると確信していますし、提案があればぜひ聞きたいです。しかし、ここで何が間違っているのかも理解して、そこから学びたいと思っています。

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
#include <fstream> 

/**
 * Reports a failed CUDA runtime call.
 *
 * Does nothing when status is cudaSuccess. For any other status it writes the
 * numeric error code and the runtime's description of it to stderr. The
 * handler only reports: it does not abort, so the caller keeps running.
 *
 * @param status  Return value of a CUDA runtime API call.
 */
void cudaErrorHandler(cudaError_t status)
{
    if (status == cudaSuccess)
    {
        return;
    }

    // A bare "Error" gives no hint about which failure occurred; include the
    // code and the runtime's text for it, terminated by a newline.
    fprintf(stderr, "CUDA error %d: %s\n",
            static_cast<int>(status), cudaGetErrorString(status));
}

/**
 * Writes the constant 0.95f into the first _TotalCombinations elements of
 * _Results, one element per thread.
 *
 * Expected launch layout: a 1D grid (only blockIdx.x is used) of 2D blocks.
 * Each thread's flat index is
 *   blockIdx.x * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x
 * and threads whose index is not below _TotalCombinations do nothing.
 *
 * @param _Results            Array the kernel writes to; must hold at least
 *                            _TotalCombinations floats.
 * @param _TotalCombinations  Number of elements to fill. Zero or a negative
 *                            value results in no writes.
 */
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // A negative count compared against an unsigned index would convert to a
    // huge unsigned value and let every thread write; reject it up front.
    if (_TotalCombinations <= 0)
    {
        return;
    }

    // Build the flat index in size_t so the product with blockIdx.x cannot
    // wrap around 32 bits when the grid is very large.
    const size_t threadsPerBlock = static_cast<size_t>(blockDim.x) * blockDim.y;
    const size_t id = threadsPerBlock * blockIdx.x
                    + static_cast<size_t>(blockDim.x) * threadIdx.y
                    + threadIdx.x;

    // Only indices inside the requested range are written
    if (id < static_cast<size_t>(_TotalCombinations))
    {
        _Results[id] = 0.95f;
    }
}

// Block dimensions used for the kernel launch
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Static block size of 32*32 (1024). The product is parenthesized so it stays
// a single operand wherever the macro is expanded (e.g. `n / BLOCK_SIZE`).
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y)
// Passes the status returned by a CUDA runtime call to cudaErrorHandler
#define CUDA_CALL(x) cudaErrorHandler(x)

/**
 * Test driver: fills a device array of totalCombinations floats with 0.95f
 * using addKernel, copies it back to the host, sums the elements and prints
 * the total.
 *
 * Every CUDA runtime status, including the status of the kernel launch, is
 * routed through CUDA_CALL. CUDA_CALL only reports, so the program keeps
 * running after a reported error.
 *
 * @return 0 always.
 */
int main()
{
    // The number of simulations to run
    const unsigned int totalCombinations = 45000000;

    // Threads per block (32 * 32 = 1024). Stored in a variable so the macro's
    // expansion cannot interact with the division and modulo below.
    const unsigned int threadsPerBlock = BLOCK_SIZE;

    // Number of blocks needed to cover totalCombinations: ceiling division,
    // with a minimum of one block.
    int gridsize = static_cast<int>(totalCombinations / threadsPerBlock);
    if (totalCombinations % threadsPerBlock != 0 || gridsize == 0)
    {
        gridsize++;
    }

    // Allocate host memory; the trailing () zero-initializes every element
    float* host_results = new float[totalCombinations]();
    float* dev_results = 0;

    CUDA_CALL(cudaSetDevice(0));

    // Allocate device memory
    const size_t bytes = static_cast<size_t>(totalCombinations) * sizeof(float);
    CUDA_CALL(cudaMalloc((void**)&dev_results, bytes));

    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(gridsize);

    // Launch kernel
    addKernel<<<grid, block>>>(dev_results, totalCombinations);

    // A kernel launch returns no status of its own; an invalid launch
    // configuration is only visible through cudaGetLastError().
    CUDA_CALL(cudaGetLastError());

    // Wait for the kernel to finish; errors raised while it ran surface here
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy device data back to host
    CUDA_CALL(cudaMemcpy(host_results, dev_results, bytes, cudaMemcpyDeviceToHost));

    // Total the results in the host array
    double total = 0.0;
    for (unsigned int i = 0; i < totalCombinations; i++)
    {
        total += host_results[i];
    }

    // Print results to screen
    printf("Total %f\n", total);

    // Release the device buffer as well as the host buffer
    CUDA_CALL(cudaFree(dev_results));
    delete[] host_results;

    return 0;
}

出典

2012-11-23 TVOHM

あなたのエラー処理方法は機能していません。これを確認するには、ブロック次元のxとyを50に変更してみてください（2500スレッドという不正な値になります）。それでもエラーは報告されないはずです。エラー処理を修正すれば、問題が見つかります。障害が発生するサイズで動作しない理由は、グリッドサイズ（1Dグリッドを起動しています）がX次元の最大グリッドサイズ（デフォルトでは65535）を超えているためです。より大きなグリッドサイズを利用するには、'-arch=sm_30'スイッチを付けてコンパイルする必要があります。別の注意点として、ブロックのX次元を22にすることは推奨されません。 –

@RobertCrovella エラーチェックを修正し（ブロックXは私のタイプミスでした）、コマンドラインにスイッチを追加しました。これで問題は解決し、すべてが正しく動作しています。これを回答として投稿していただければ、承認します。 – TVOHM

あなたが既に発見したように、あなたのエラー処理方法は機能していませんでした。以下に、私が頻繁に使用するエラーチェック方法を使ったあなたのコードのバージョンを貼り付けます。障害が発生するサイズで動作しない理由は、グリッドサイズ（1Dグリッドを起動しています）がX次元の最大グリッドサイズ（デフォルトでは65535、つまりCompute Capability 2.xまでの値）を超えているためです。より大きなグリッドサイズ（Compute Capability 3.0では2^31-1が上限）を利用したい場合は、-arch=sm_30スイッチを付けてコンパイルする必要があります。

参考までに、私が頻繁に使用するエラーチェック方法を示すコードのバージョンを以下に示します。

#include <stdio.h> 
#include <fstream> 


// Checks the most recent error recorded by the CUDA runtime (cudaGetLastError).
// If it is anything other than cudaSuccess, prints msg, the runtime's
// description of the error and the file/line of the check to stderr, then
// terminates the program with exit status 1.
//
// msg: string describing the operation that was just attempted.
//
// Each continuation backslash must be the last character on its line;
// whitespace after it stops the line from being spliced onto the next one.
// The local variable avoids a double-underscore name, which is reserved.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cudaCheckErrors_err = cudaGetLastError(); \
        if (cudaCheckErrors_err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cudaCheckErrors_err), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/**
 * Writes the constant 0.95f into the first _TotalCombinations elements of
 * _Results, one element per thread.
 *
 * Expected launch layout: a 1D grid (only blockIdx.x is used) of 2D blocks.
 * Each thread's flat index is
 *   blockIdx.x * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x
 * and threads whose index is not below _TotalCombinations do nothing.
 *
 * @param _Results            Array the kernel writes to; must hold at least
 *                            _TotalCombinations floats.
 * @param _TotalCombinations  Number of elements to fill. Zero or a negative
 *                            value results in no writes.
 */
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // A negative count compared against an unsigned index would convert to a
    // huge unsigned value and let every thread write; reject it up front.
    if (_TotalCombinations <= 0)
    {
        return;
    }

    // Build the flat index in size_t so the product with blockIdx.x cannot
    // wrap around 32 bits when the grid is very large.
    const size_t threadsPerBlock = static_cast<size_t>(blockDim.x) * blockDim.y;
    const size_t id = threadsPerBlock * blockIdx.x
                    + static_cast<size_t>(blockDim.x) * threadIdx.y
                    + threadIdx.x;

    // Only indices inside the requested range are written
    if (id < static_cast<size_t>(_TotalCombinations))
    {
        _Results[id] = 0.95f;
    }
}

// Block dimensions used for the kernel launch
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Static block size of 32*32 (1024). The product is parenthesized so it stays
// a single operand wherever the macro is expanded (e.g. `n / BLOCK_SIZE`).
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y)

/**
 * Test driver: fills a device array of totalCombinations floats with 0.95f
 * using addKernel, copies it back to the host, sums the elements and prints
 * the grid/block sizes and the total.
 *
 * Each CUDA runtime call and the kernel launch is followed by
 * cudaCheckErrors, which prints a diagnostic and exits the program on failure.
 *
 * @return 0 when every step succeeded.
 */
int main()
{
    // The number of simulations to run
    const unsigned int totalCombinations = 65000000;

    // Threads per block (32 * 32 = 1024). Stored in a variable so the macro's
    // expansion cannot interact with the division and modulo below.
    const unsigned int threadsPerBlock = BLOCK_SIZE;

    // Number of blocks needed to cover totalCombinations: ceiling division,
    // with a minimum of one block.
    int gridsize = static_cast<int>(totalCombinations / threadsPerBlock);
    if (totalCombinations % threadsPerBlock != 0 || gridsize == 0)
    {
        gridsize++;
    }
    printf("gridsize = %d, blocksize = %d\n", gridsize, BLOCK_SIZE);

    // Allocate host memory; the trailing () zero-initializes every element
    float* host_results = new float[totalCombinations]();
    float* dev_results = 0;

    cudaSetDevice(0);
    cudaCheckErrors("cudaSetDevice fail");

    // Allocate device memory
    const size_t bytes = static_cast<size_t>(totalCombinations) * sizeof(float);
    cudaMalloc((void**)&dev_results, bytes);
    cudaCheckErrors("cudaMalloc fail");

    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(gridsize);

    // Launch kernel; the launch returns no status of its own, so the check
    // that follows is what reports an invalid launch configuration.
    addKernel<<<grid, block>>>(dev_results, totalCombinations);
    cudaCheckErrors("kernel fail");

    // Wait for the kernel to finish; errors raised while it ran surface here
    cudaDeviceSynchronize();
    cudaCheckErrors("sync fail");

    // Copy device data back to host
    cudaMemcpy(host_results, dev_results, bytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Total the results in the host array
    double total = 0.0;
    for (unsigned int i = 0; i < totalCombinations; i++)
    {
        total += host_results[i];
    }

    // Print results to screen
    printf("Total %f\n", total);

    // Release the device buffer as well as the host buffer
    cudaFree(dev_results);
    cudaCheckErrors("cudaFree fail");
    delete[] host_results;

    return 0;
}

出典

2012-11-23 16:36:55

お時間を割いていただきありがとうございます。今後はこのエラーチェック方法を使用します。 – TVOHM

すばらしい答え。今日でも助けてくれました。ありがとうございました！ –

答えて

関連する問題