cudaMemcpy によるデバイスからホストへのコピーが動作しない

デバイスからホストへ配列をコピーして戻そうとすると、Node の m[0] の値がゼロになってしまいます。カーネル内でノードを出力すると、値は正しく設定されています。残念ながら自分では間違いを見つけられないので、助けていただけると幸いです。コードは Visual Studio のコンパイラを使い、compute capability 3 を指定してコンパイルしています。なお、別の回答（元のリンク先）のコードは私の環境で正しく動作します。

コード全体を貼り付けますが、意味のある部分は次のカーネル

__global__ void divideLeft(Node * nodes,float * leftSide){...}

と

divideLeft<<<1,1>>>(dNodes,dLeftSide); 
ERRCHECK(cudaDeviceSynchronize()); 
ERRCHECK(cudaGetLastError()); 
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost)); 
printNode(nodes[3]);

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
#include <conio.h> 
#include <new> 
#include <cmath> 

// Checks the status returned by a CUDA runtime call and reports a failure
// together with the file and line of the call site.
// The body is wrapped in do { } while (0) so that `ERRCHECK(call);` expands to
// exactly one statement and can be used safely inside an unbraced if/else.
#define ERRCHECK(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call; does nothing when code == cudaSuccess.
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site (supplied by ERRCHECK)
//   line  - source line of the call site (supplied by ERRCHECK)
//   abort - when true, terminate the process using `code` as the exit status
//   wait  - when true, block on getch() first so the message can be read
//           before the console closes
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true) 
{ 
    if (code != cudaSuccess) 
    { 
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
        if (wait) getch(); 
        if (abort) exit(code); 
    } 
} 

// Size and index helpers.
// Every function-like macro is fully parenthesized (arguments and whole body)
// so that it expands correctly when given a compound argument such as `i+1`
// or when it is used inside a larger expression such as `2*HEAP_SIZE(n)`.

// Number of floats in Node::m (6 rows * 6 columns, see XY).
#define MSIZE 36 
#define INPUT_SIZE(N) ((N)*5 - 3*2) 
// Array-based binary heap navigation: parent / left child / right child of node i.
#define PARENT(i) (((i)-1)/2) 
#define LEFT(i) (2*(i) + 1) 
#define RIGHT(i) (2*(i) + 2) 
#define BOTTOM_HEAP_NODES_COUNT(N) (((N)-2)/3) //size of input must be 2+3n,n>1 
// For valid inputs (N = 2+3n) this equals the previous unparenthesized
// expansion 2*(N-2)/3-1, because (N-2) is then divisible by 3.
#define HEAP_SIZE(N) (2*BOTTOM_HEAP_NODES_COUNT(N)-1) 
#define FIRST_LEVEL_SIZE 19 
#define ROW_LENGTH 5 
#define FIRST_LVL_MAT_SIZE 5 
// Flat index of element (row x, column y) in a matrix stored with 6 columns per row.
#define XY(x,y) ((x)*6+(y)) 

// Launch-wide parameters held in constant memory. main() fills some of them
// with cudaMemcpyToSymbol before the kernel launch; the rest are declared but
// never written or read in the code shown here.
__constant__ int dHigherTreeLevelThreshold; // set from host `greatestPowerOfTwo`; read by divideLeft
__constant__ int dNodesCount; // not set or read in the code shown here
__constant__ int dLeftSize; // not set or read in the code shown here
__constant__ int dHeapSize; // set from host `heapSize`
__constant__ int dBottomNodes; // set from host `bottomNodes`; thread bound in divideLeft
__constant__ int dRemainingNodes; // set from host `remainingNodes`; node index offset in divideLeft
__constant__ int dRightCols; // set from host `rightSize`
__constant__ int dInputCount; // not set or read in the code shown here

// One heap node. Arrays of Node are allocated on the host (`nodes`) and on the
// device (`dNodes`) in main() and copied between them as raw bytes.
struct Node 
{ 
    float m[MSIZE]; // MSIZE (36) floats, addressed through XY(row,col) = row*6+col
    float *x; // pointer member; never read or written in the code shown here
}; 

__device__ __host__ void printNode(Node node); 
// Kernel: for each global thread index below dBottomNodes, takes a copy of one
// heap node, fills selected entries of its m[] matrix from `leftSide`, and
// prints the result with printNode.
//   nodes    - device array of heap nodes, indexed by nodeIdx below
//   leftSide - device array of input coefficients; the thread reads elements
//              idx*15+2 .. idx*15+22 (idx being the thread index)
// NOTE: `node` below is a by-value copy of nodes[nodeIdx]. No statement in this
// kernel stores it back into nodes[], so the assignments are visible only to
// the printNode(node) call at the end and the array in global memory is left
// unchanged.
__global__ void divideLeft(Node * nodes,float * leftSide) 
{ 
    int idx = blockIdx.x*blockDim.x + threadIdx.x; 
    // Threads beyond the number of bottom-level nodes have nothing to do.
    if(idx>=dBottomNodes) 
     return; 
    // Heap slot for this thread: idx shifted by dRemainingNodes; threads at or
    // above dHigherTreeLevelThreshold are additionally shifted back by dBottomNodes.
    int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes; 
// printf("%d %d\n",idx,nodeIdx); 
    // Local (per-thread) copy of the node; see NOTE above.
    Node node = nodes[nodeIdx]; 
    // From here on idx is the base offset into leftSide (15 floats per thread index).
    idx*=5*3; 
    // Row 3 of m.
    node.m[XY(3,3)] = leftSide[idx+2]/3; 
    node.m[XY(3,2)] = leftSide[idx+3]/2; 
    node.m[XY(3,1)] = leftSide[idx+4]; 

    // Row 2 of m.
    node.m[XY(2,3)] = leftSide[idx+6]/2; 
    node.m[XY(2,2)] = leftSide[idx+7]*2/3; 
    node.m[XY(2,1)] = leftSide[idx+8]; 
    node.m[XY(2,4)] = leftSide[idx+9]; 

    // Row 1 of m.
    node.m[XY(1,3)] = leftSide[idx+10]; 
    node.m[XY(1,2)] = leftSide[idx+11]; 
    node.m[XY(1,1)] = leftSide[idx+12]; 
    node.m[XY(1,4)] = leftSide[idx+13]; 
    node.m[XY(1,5)] = leftSide[idx+14]; 

    // Row 4 of m.
    node.m[XY(4,2)] = leftSide[idx+15]; 
    node.m[XY(4,1)] = leftSide[idx+16]; 
    node.m[XY(4,4)] = leftSide[idx+17]*2/3; 
    node.m[XY(4,5)] = leftSide[idx+18]/2; 

    // Row 5 of m.
    node.m[XY(5,1)] = leftSide[idx+20]; 
    node.m[XY(5,4)] = leftSide[idx+21]/2; 
    node.m[XY(5,5)] = leftSide[idx+22]/3; 
    // Prints the local copy (device-side printf), not nodes[nodeIdx].
    printNode(node); 
} 

// Fills the first `size` entries of `leftSide` with 1.0f.
// (An alternative test pattern, (i+1)%26, is intentionally not used.)
// Does nothing when size <= 0.
void leftSideInit(float * leftSide,int size) 
{ 
    float *cursor = leftSide; 
    for(int remaining = size; remaining > 0; --remaining) 
        *cursor++ = 1.0f; 
} 

// Host driver: derives the heap dimensions from leftCount, uploads them to
// constant memory, runs divideLeft once on a zero-initialized device heap,
// copies the heap back and prints node 3.
// Every CUDA call is checked with ERRCHECK, and every host and device
// allocation made here is released before the device is reset.
int main(){ 
    ERRCHECK(cudaSetDevice(0)); 

    int leftCount = 11; 
    int leftSize = leftCount*5; 
    int rightSize = 10; 
    int heapSize = HEAP_SIZE(leftCount); 
    int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount); 
    // Largest power of two not exceeding bottomNodes, computed in integer
    // arithmetic so the result cannot be affected by floating-point rounding.
    int greatestPowerOfTwo = 1; 
    while (greatestPowerOfTwo <= bottomNodes/2) 
        greatestPowerOfTwo *= 2; 
    int remainingNodes = heapSize - greatestPowerOfTwo; 

    // Upload the launch-wide parameters read by the kernel.
    ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int))); 

    float * leftSide = new float[leftSize]; 
    float * rightSide = new float[rightSize]; 
    Node * nodes = new Node[heapSize]; 
    Node * dNodes = nullptr; 
    float * dLeftSide =nullptr; 
    leftSideInit(leftSide,leftSize); 

    // Device heap starts out as all-zero bytes.
    ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize)); 
    ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize)); 
    ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float))); 
    ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice)); 

    // One block of one thread: only thread index 0 runs.
    divideLeft<<<1,1>>>(dNodes,dLeftSide); 
    ERRCHECK(cudaDeviceSynchronize()); 
    ERRCHECK(cudaGetLastError()); 
    ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost)); 
    // With the values above (remainingNodes == 3) thread 0 works on heap node 3.
    printNode(nodes[3]); 

    // Release everything allocated above (host buffers and both device buffers).
    delete [] nodes; 
    delete [] leftSide; 
    delete [] rightSide; 
    ERRCHECK(cudaFree(dNodes)); 
    ERRCHECK(cudaFree(dLeftSide)); 

    ERRCHECK(cudaDeviceReset()); 

    getch(); 
    return 0; 
} 

// Prints node.m as six lines of six values each ("%.3f" format), one line per
// matrix row. Callable from both host and device code.
__device__ __host__ void printNode(Node node) 
{ 
    for(int row = 0; row < 6; ++row) 
    { 
        // First element of the current row; the row's six values are contiguous.
        const float *r = &node.m[XY(row,0)]; 
        printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",r[0],r[1],r[2],r[3],r[4],r[5]); 
    } 

}

出典

2017-02-11 quirell

あなたのコードだけでは何が問題なのか判断できません。Node クラス／構造体をどのように定義しているかも投稿していただけますか。 –

@ Tae-SungShin 'struct node'は投稿されたコードで定義されています。 –

あなたのカーネルでは、作業対象の Node のローカルコピーを作成しています：

Node node = nodes[nodeIdx];

カーネルの残りの部分は、ローカルコピーのnodeという要素を変更します。

すべての変更が完了した後は、ローカルコピーをグローバルコピーにコピーしないので、グローバルコピーは変更されません。

これを修正する方法の一つは、カーネルの最後に次の行を追加することです：

nodes[nodeIdx] = node;

余談ですが、あなたの struct Node にはポインタ変数が含まれている点に注意してください：

struct Node 
{ 
    float m[MSIZE]; 
    float *x; 
};

ポインタを埋め込んだ構造体の配列を使う場合、特有の複雑さが生じることがある点に注意してください。今回のコードではその変数（x）を実際には使用していないようなので、ここではコメントとして触れるにとどめます。この件に関する代表的な質問については、cuda タグの情報ページ（「CUDA でのポインタの配列の使用」）を参照してください。

出典

2017-02-11 19:30:37

ありがとうございます。長い間 C++ を使っていなかったので、この単純な問題に気付きませんでした。 – quirell

cudaMemcpy によるデバイスからホストへのコピーが動作しない

答えて

関連する問題