CUDA BFS Giant Graph（seg.fault）

私はCUDAのBFSアルゴリズムをテストしています（これはいくつかの同期問題があるとわかっていますが、それは私の仕事でテストしています）。 1M +サイズのグラフの使用（または作成）に関する問題ここでCUDA BFS Giant Graph（seg.fault）

は、私はそれらを作成するために使用するコードです：

#include <stdio.h> 
#include <stdlib.h> 
#include <time.h> 


#define GRAPHSIZE 1000000 

struct Node 
{ 
    int begin;  // comeco da sub-string de vizinhos 
    int num; // tamanho da sub-string de vizinhos 
}; 



int getSize() 
{ 
int size,arcs; 

    printf("Size of the graph: \nNodes:\n>>"); 
    scanf ("%d", &size); 

return size; 
} 



void createEdges(int graphSize, int* Edges) 
{ 

int j,value, aux, rndIdx; 

int edgesSize = 2*GRAPHSIZE; 

srand(time(NULL)); 



printf ("\nGS : %d\n", graphSize); 

j = 1; 

    for (int i=0; i < edgesSize; i++) //first it creates an ordered array of edges 
    { 

     if (j < GRAPHSIZE) 
     { 
     Edges [i] = j; 
     j++; 
     } 
      else 
      { 
      j=1;   
      Edges [i] = j; 
      j++; 
      } 

    } 



    for (int i=0; i < edgesSize; i++) //now, it randomly swaps the edges array 
    { 

    rndIdx = rand()%graphSize; 

    aux = Edges[rndIdx]; 
    Edges[rndIdx] = Edges [i]; 
    Edges [i] = aux; 

    } 

} 


int main() 
{ 

int size,graphAtts[2]; 

int edgesSize = 2*GRAPHSIZE; 

int Edges[edgesSize]; 

struct Node node[GRAPHSIZE]; 

FILE *file; 



printf("____________________________\nRandom graph generator in compact format, optmized for CUDA algorithms by Ianuarivs Severvs.\nFor details about this format read the README. \n"); 

//size = getSize(graphAtts); 

//printf ("%d,%d",size,arcs); 

createEdges(GRAPHSIZE,Edges); // or size? 

/* 
    for (int i = 0; i < edgesSize ; i ++) 
    { 
    printf ("-- %d --", Edges[i]); 
    } 

*/ 

printf("\nEdges:\n"); 
for (int i=0; i < edgesSize; i++) 
printf("%d,",Edges[i]); 


    for (int i=0,j=0 ; i < GRAPHSIZE; i++,j+=2) // now, completes the graph 
    { 
    node[i].begin=j; 
    node[i].num=2; 
    printf ("\n node %d : begin = %d, num = 2",i,j); 
    } 

printf("\n"); 

//writes file: 
file = fopen ("graph1M.g","wb"); 
fwrite (&Edges, edgesSize * sizeof(int),1,file); 
fwrite (&node, GRAPHSIZE * sizeof(struct Node),1,file); 
fclose(file); 


    for (int i = 0; i < edgesSize ; i ++) 
    { 
    printf ("-- %d --", Edges[i]); 
    } 


    for (int i = 0; i < GRAPHSIZE ; i ++) 
    { 
    printf ("* %d *", i); 
    } 


}

そして、ここでは（CUDA上の）私のBFSのコードは次のとおりです。

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda.h> 
#include <cutil.h> 

#define GRAPHSIZE 1000000 

struct Node 
{ 
    int begin;  // begining of the substring 
    int num; // size of the sub-string 
}; 

__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada) // memory races on both Xa and Ca 
{ 

    int tid = threadIdx.x + blockIdx.x * blockDim.x; 
    if (tid > GRAPHSIZE) 
      *parada=true; 


     if (Fa[tid] == true && Xa[tid] == false) 
     { 
     Fa[tid] = false; 
     Xa[tid] = true; 
     //__syncthreads(); // this solves the memrace problem as long as the threads are all on the same block 
        for (int i = Va[tid].begin; i < (Va[tid].begin + Va[tid].num); i++) // Va begin is where it's edges' subarray begins, Va is it's                  number of elements 
      {    
       int nid = Ea[i]; 

       if (Xa[nid] == false) 
       { 
       Ca[nid] = Ca[tid] + 1;    
       Fa[nid] = true;    
       *parada = true; 
       } 

      }     

     } 

} 

// The BFS frontier corresponds to all the nodes being processed at the current level. 


int main() 
{ 

    // for the time couting: 
    cudaEvent_t start, stop; 
    float time; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 


    FILE * file; 

    printf("\nLoading graph file...\n"); 


    struct Node node[GRAPHSIZE]; 
    int edgesSize = 2*GRAPHSIZE; 
    int edges[edgesSize]; 


    file = fopen ("graph1M.g","rb"); 
    printf("abriu");  
    fread (&edges, edgesSize * sizeof(int),1,file); 
    fread (&node, GRAPHSIZE * sizeof(struct Node),1,file); 
    fclose(file); 

//For file read test propouses only: 

/* 
    for (int i = 0; i < edgesSize ; i ++) 
    { 
    printf ("-- %d --", edges[i]); 
    } 


    for (int i = 0; i < GRAPHSIZE ; i ++) 
    { 
    printf ("* %d *", i); 
    } 
*/ 



    bool frontier[GRAPHSIZE]={false}; 
    bool visited[GRAPHSIZE]={false}; 
    int custo[GRAPHSIZE]={0}; 

    int source=0; 
    frontier[source]=true; 

    Node* Va; 
    cudaMalloc((void**)&Va,sizeof(Node)*GRAPHSIZE); 
    cudaMemcpy(Va,node,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    int* Ea; 
    cudaMalloc((void**)&Ea,sizeof(Node)*GRAPHSIZE); 
    cudaMemcpy(Ea,edges,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    bool* Fa; 
    cudaMalloc((void**)&Fa,sizeof(bool)*GRAPHSIZE); 
    cudaMemcpy(Fa,frontier,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    bool* Xa; 
    cudaMalloc((void**)&Xa,sizeof(bool)*GRAPHSIZE); 
    cudaMemcpy(Xa,visited,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice); 

    int* Ca; 
    cudaMalloc((void**)&Ca,sizeof(int)*GRAPHSIZE); 
    cudaMemcpy(Ca,custo,sizeof(int)*GRAPHSIZE,cudaMemcpyHostToDevice); 


    dim3 grid(100,100,1); //blocks per grid 
    dim3 threads(100,1,1); // threads per block 




    bool para; 
    bool* parada; 
    cudaMalloc((void**)&parada,sizeof(bool)); 
    printf("_____________________________________________\n"); 
    int count=0; 

    cudaEventRecord(start, 0); 
    do{ 
     count ++; 
     para=false; 
     cudaMemcpy(parada,&para,sizeof(bool),cudaMemcpyHostToDevice);  
     BFS <<<grid,threads,0>>>(Va,Ea,Fa,Xa,Ca,parada);  
     CUT_CHECK_ERROR("kernel1 execution failed"); 
     cudaMemcpy(&para,parada,sizeof(bool),cudaMemcpyDeviceToHost); 


    }while(para); 

    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 


    //printf("\nFinal:\n"); 
    cudaMemcpy(custo,Ca,sizeof(int)*GRAPHSIZE,cudaMemcpyDeviceToHost); 
/* 
    printf("\n_____________________________________________\n"); 
    for(int i=0;i<GRAPHSIZE;i++) 
     printf("%d ",custo[i]); 
    printf("\n"); 

    printf("_____________________________________________\n"); 
*/ 

    cudaEventElapsedTime(&time, start, stop); 
    printf ("\nTime for the kernel: %lf s \n", time/1000); 
    printf ("Number of kernel calls : %d \n", count); 


    file = fopen ("graph125MPar","w"); 


    for(int i=0;i<GRAPHSIZE;i++) 
     fprintf(file,"%d ",custo[i]); 
    fprintf(file,"\n"); 
    fclose(file); 



}

のためにそれを実行しようとしているとき、私はsegmantation障害を抱えています1M +グラフ（私はLinuxの 'ulimit -s 16384'コマンドでシステムのスタックサイズを変更したことに注意してください）

誰かが助けることができますか？

出典

2011-11-07 Imperian

グラフに静的に割り当てられたホスト配列を使用しないでください。動的メモリ割り当てを代わりに使用してください。 ulimitコマンドはスタックサイズを16384 kbに設定していますが、グラフエントリあたり5*sizeof(int) + 2*sizeof(bool)のようなものが必要です。おそらくエントリあたり22バイトです。 100万エントリのスタックスペースを使い果たす場所はかなりわかりやすいです。

出典

2011-11-07 17:11:54 talonmies

ええと、私はその解決策から離れていました（難しい実装です）が、バグの唯一の方法だと思います。 – Imperian

どのように実装するのが難しいですか？現在のコードでは3行だけ正確に変更しなければならず、後で3行を追加してストレージの割り当てを解除する必要があります。 – talonmies

私のポイントは、主に並列algorthmsでポインタとhandligは、通常私の頭痛です.. – Imperian

CUDA BFS Giant Graph（seg.fault）

答えて

関連する問題