
Finding the minimum value in an array and its index using the CUDA __shfl_down function

I am writing a function that uses CUDA to find the minimum value in a 1D array together with the index at which that value occurs.

To start with, I modified the reduction code that computes the sum of the values in a 1D array. That code works fine as a sum function, but I cannot get it to work for finding the minimum. The code is attached to this message; I would appreciate it if you could point out the mistake I am making.

The actual function is below. In the test example the array size is 1024, so the shuffle-reduction path is used. The problem is that the per-block output values of g_oIdxs (which holds the indices) and of g_odata (which holds the minimum values) are wrong compared with the ordinary sequential CPU code.

Also, the values of g_odata are all zero when printed on the host.

Thank you!

#include <stdio.h> 
#include <stdlib.h> 
#include <cuda.h> 
#include <math.h> 
#include <unistd.h> 
#include <sys/time.h> 

#if __DEVICE_EMULATION__ 
#define DEBUG_SYNC __syncthreads(); 
#else 
#define DEBUG_SYNC 
#endif 

#ifndef MIN 
#define MIN(x,y) ((x < y) ? x : y) 
#endif 

#ifndef MIN_IDX 
#define MIN_IDX(x,y, idx_x, idx_y) ((x < y) ? idx_x : idx_y) 
#endif 

#if (__CUDA_ARCH__ < 200) 
#define int_mult(x,y) __mul24(x,y) 
#else 
#define int_mult(x,y) x*y 
#endif 

#define inf 0x7f800000 

bool isPow2(unsigned int x) 
{ 
    return ((x&(x-1))==0); 
} 

unsigned int nextPow2(unsigned int x) 
{ 
    --x; 
    x |= x >> 1; 
    x |= x >> 2; 
    x |= x >> 4; 
    x |= x >> 8; 
    x |= x >> 16; 
    return ++x; 
} 

// Utility class used to avoid linker errors with extern 
// unsized shared memory arrays with templated type 
template<class T> 
struct SharedMemory { 
    __device__ inline operator T *() { 
     extern __shared__ int __smem[]; 
     return (T *) __smem; 
    } 

    __device__ inline operator const T *() const { 
     extern __shared__ int __smem[]; 
     return (T *) __smem; 
    } 
}; 

// specialize for double to avoid unaligned memory 
// access compile errors 
template<> 
struct SharedMemory<double> { 
    __device__ inline operator double *() { 
     extern __shared__ double __smem_d[]; 
     return (double *) __smem_d; 
    } 

    __device__ inline operator const double *() const { 
     extern __shared__ double __smem_d[]; 
     return (double *) __smem_d; 
    } 
}; 

/* 
This version adds multiple elements per thread sequentially. This reduces the overall 
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). 
(Brent's Theorem optimization) 

Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory. 
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. 
If blockSize > 32, allocate blockSize*sizeof(T) bytes. 
*/ 
template<class T, unsigned int blockSize, bool nIsPow2> 
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { 
    T *sdata = SharedMemory<T>(); 

    // perform first level of reduction, 
    // reading from global memory, writing to shared memory 
    unsigned int tid = threadIdx.x; 
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; 
    unsigned int gridSize = blockSize * 2 * gridDim.x; 

    T mySum = 0; 

    // we reduce multiple elements per thread. The number is determined by the 
    // number of active thread blocks (via gridDim). More blocks will result 
    // in a larger gridSize and therefore fewer elements per thread 
    while (i < n) { 
     mySum += g_idata[i]; 

     // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays 
     if (nIsPow2 || i + blockSize < n) 
      mySum += g_idata[i + blockSize]; 

     i += gridSize; 
    } 

    // each thread puts its local sum into shared memory 
    sdata[tid] = mySum; 
    __syncthreads(); 

    // do reduction in shared mem 
    if ((blockSize >= 512) && (tid < 256)) { 
     sdata[tid] = mySum = mySum + sdata[tid + 256]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 256) && (tid < 128)) { 
     sdata[tid] = mySum = mySum + sdata[tid + 128]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 128) && (tid < 64)) { 
     sdata[tid] = mySum = mySum + sdata[tid + 64]; 
    } 

    __syncthreads(); 

#if (__CUDA_ARCH__ >= 300) 
    if (tid < 32) { 
     // Fetch final intermediate sum from 2nd warp 
     if (blockSize >= 64) 
      mySum += sdata[tid + 32]; 
     // Reduce final warp using shuffle 
     for (int offset = warpSize/2; offset > 0; offset /= 2) { 
      mySum += __shfl_down(mySum, offset); 
     } 
    } 
#else 
    // fully unroll reduction within a single warp 
    if ((blockSize >= 64) && (tid < 32)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 32]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 32) && (tid < 16)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 16]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 16) && (tid < 8)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 8]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 8) && (tid < 4)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 4]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 4) && (tid < 2)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 2]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 2) && (tid < 1)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 1]; 
    } 

    __syncthreads(); 
#endif 

    // write result for this block to global mem 
    if (tid == 0) 
     g_odata[blockIdx.x] = mySum; 
} 


/* 
This version adds multiple elements per thread sequentially. This reduces the overall 
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). 
(Brent's Theorem optimization) 

Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory. 
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. 
If blockSize > 32, allocate blockSize*sizeof(T) bytes. 
*/ 
template<class T, unsigned int blockSize, bool nIsPow2> 
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata, int *g_oIdxs, unsigned int n) { 
    T *sdata = SharedMemory<T>(); 

    int *sdataIdx = SharedMemory<int>(); 

    // perform first level of reduction, 
    // reading from global memory, writing to shared memory 
    unsigned int tid = threadIdx.x; 
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; 
    unsigned int gridSize = blockSize * 2 * gridDim.x; 


    T myMin = 99999; 
    int myMinIdx = -1; 
    // we reduce multiple elements per thread. The number is determined by the 
    // number of active thread blocks (via gridDim). More blocks will result 
    // in a larger gridSize and therefore fewer elements per thread 
    while (i < n) { 
     myMinIdx = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx); 
     myMin = MIN(g_idata[i], myMin); 

     // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays 
     if (nIsPow2 || i + blockSize < n){ 
      //myMin += g_idata[i + blockSize]; 
      myMinIdx = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx); 
      myMin = MIN(g_idata[i + blockSize], myMin); 
     } 

     i += gridSize; 
    } 

    // each thread puts its local min and its index into shared memory 
    sdata[tid] = myMin; 
    sdataIdx[tid] = myMinIdx; 
    __syncthreads(); 

    // do reduction in shared mem 
    if ((blockSize >= 512) && (tid < 256)) { 
     //sdata[tid] = mySum = mySum + sdata[tid + 256]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 256], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 256) && (tid < 128)) { 
     //sdata[tid] = myMin = myMin + sdata[tid + 128]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 128], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 128) && (tid < 64)) { 
     //sdata[tid] = myMin = myMin + sdata[tid + 64]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 64], myMin); 
    } 

    __syncthreads(); 

#if (__CUDA_ARCH__ >= 300) 
    if (tid < 32) { 
     // Fetch final intermediate sum from 2nd warp 
     if (blockSize >= 64){ 
     //myMin += sdata[tid + 32]; 
      myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx); 
      myMin = MIN(sdata[tid + 32], myMin); 
     } 
     // Reduce final warp using shuffle 
     for (int offset = warpSize/2; offset > 0; offset /= 2) { 
      //myMin += __shfl_down(myMin, offset); 
      int tempMyMinIdx = __shfl_down(myMinIdx, offset); 
      float tempMyMin = __shfl_down(myMin, offset); 

      myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx); 
      myMin = MIN(tempMyMin, myMin); 
     } 

    } 
#else 
    // fully unroll reduction within a single warp 
    if ((blockSize >= 64) && (tid < 32)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 32]; 
     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 32], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 32) && (tid < 16)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 16]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 16], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 16) && (tid < 8)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 8]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 8], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 8) && (tid < 4)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 4]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 4], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 4) && (tid < 2)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 2]; 
     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 2], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 2) && (tid < 1)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 1]; 
     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 1], myMin); 
    } 

    __syncthreads(); 
#endif 

    __syncthreads(); 
    // write result for this block to global mem 
    if (tid == 0){ 
     g_odata[blockIdx.x] = myMin; 
     g_oIdxs[blockIdx.x] = myMinIdx; 
    } 
} 

//////////////////////////////////////////////////////////////////////////////// 
// Compute the number of threads and blocks to use for the given reduction kernel 
// For the kernels >= 3, we set threads/block to the minimum of maxThreads and 
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel 
// 6, we observe the maximum specified number of blocks, because each thread in 
// that kernel can process a variable number of elements. 
//////////////////////////////////////////////////////////////////////////////// 
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, 
     int maxThreads, int &blocks, int &threads) { 

    //get device capability, to avoid block/grid size exceed the upper bound 
    cudaDeviceProp prop; 
    int device; 
    cudaGetDevice(&device); 
    cudaGetDeviceProperties(&prop, device); 

    if (whichKernel < 3) { 
     threads = (n < maxThreads) ? nextPow2(n) : maxThreads; 
     blocks = (n + threads - 1)/threads; 
    } else { 
     threads = (n < maxThreads * 2) ? nextPow2((n + 1)/2) : maxThreads; 
     blocks = (n + (threads * 2 - 1))/(threads * 2); 
    } 

    if ((float) threads * blocks 
      > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) { 
     printf("n is too large, please choose a smaller number!\n"); 
    } 

    if (blocks > prop.maxGridSize[0]) { 
     printf(
       "Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n", 
       blocks, prop.maxGridSize[0], threads * 2, threads); 

     blocks /= 2; 
     threads *= 2; 
    } 

    if (whichKernel == 6) { 
     blocks = MIN(maxBlocks, blocks); 
    } 
} 

//////////////////////////////////////////////////////////////////////////////// 
// Wrapper function for kernel launch 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, 
     T *d_odata) { 
    dim3 dimBlock(threads, 1, 1); 
    dim3 dimGrid(blocks, 1, 1); 

    // when there is only one warp per block, we need to allocate two warps 
    // worth of shared memory so that we don't index shared memory out of bounds 
    int smemSize = 
      (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 

    if (isPow2(size)) { 
     switch (threads) { 
     case 512: 
      reduce6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 256: 
      reduce6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 128: 
      reduce6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 64: 
      reduce6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 32: 
      reduce6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 16: 
      reduce6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 8: 
      reduce6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 4: 
      reduce6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 2: 
      reduce6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 1: 
      reduce6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 
     } 
    } else { 
     switch (threads) { 
     case 512: 
      reduce6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 256: 
      reduce6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 128: 
      reduce6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 64: 
      reduce6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 32: 
      reduce6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 16: 
      reduce6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 8: 
      reduce6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 4: 
      reduce6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 2: 
      reduce6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 1: 
      reduce6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 
     } 
    } 

} 


//////////////////////////////////////////////////////////////////////////////// 
// Wrapper function for kernel launch 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
void reduceMin(int size, int threads, int blocks, int whichKernel, T *d_idata, 
     T *d_odata, int *idxs, int *oIdxs) { 
    dim3 dimBlock(threads, 1, 1); 
    dim3 dimGrid(blocks, 1, 1); 

    // when there is only one warp per block, we need to allocate two warps 
    // worth of shared memory so that we don't index shared memory out of bounds 
    int smemSize = 
      (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 

    if (isPow2(size)) { 
     switch (threads) { 
     case 512: 
      reduceMin6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 256: 
      reduceMin6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 128: 
      reduceMin6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 64: 
      reduceMin6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 32: 
      reduceMin6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 16: 
      reduceMin6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 8: 
      reduceMin6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 4: 
      reduceMin6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 2: 
      reduceMin6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 1: 
      reduceMin6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 
     } 
    } else { 
     switch (threads) { 
     case 512: 
      reduceMin6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 256: 
      reduceMin6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 128: 
      reduceMin6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 64: 
      reduceMin6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 32: 
      reduceMin6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 16: 
      reduceMin6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 8: 
      reduceMin6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 4: 
      reduceMin6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 2: 
      reduceMin6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 1: 
      reduceMin6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 
     } 
    } 

} 

//////////////////////////////////////////////////////////////////////////////// 
//! Compute min reduction on CPU, returning the minimum value and its index 
//! 
//! @param data  pointer to input data 
//! @param size  number of input data elements 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
void reduceMINCPU(T *data, int size, T *min, int *idx) 
{ 
    *min = data[0]; 
    int min_idx = 0; 
    T c = (T)0.0; 

    for (int i = 1; i < size; i++) 
    { 
     T y = data[i]; 
     T t = MIN(*min, y); 
     min_idx = MIN_IDX(*min, y, min_idx, i); 
     (*min) = t; 
    } 

    *idx = min_idx; 

    return; 
} 

//////////////////////////////////////////////////////////////////////////////// 
//! Compute sum reduction on CPU 
//! We use Kahan summation for an accurate sum of large arrays. 
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm 
//! 
//! @param data  pointer to input data 
//! @param size  number of input data elements 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
T reduceCPU(T *data, int size) 
{ 
    T sum = data[0]; 
    T c = (T)0.0; 

    for (int i = 1; i < size; i++) 
    { 
     T y = data[i] - c; 
     T t = sum + y; 
     c = (t - sum) - y; 
     sum = t; 
    } 

    return sum; 
} 

// Instantiate the reduction function for 3 types 
template void 
reduce<int>(int size, int threads, int blocks, int whichKernel, int *d_idata, 
     int *d_odata); 

template void 
reduce<float>(int size, int threads, int blocks, int whichKernel, 
     float *d_idata, float *d_odata); 

template void 
reduce<double>(int size, int threads, int blocks, int whichKernel, 
     double *d_idata, double *d_odata); 

// Instantiate the reduction function for 3 types 
template void 
reduceMin<int>(int size, int threads, int blocks, int whichKernel, int *d_idata, 
     int *d_odata, int *idxs, int *oIdxs); 

template void 
reduceMin<float>(int size, int threads, int blocks, int whichKernel, float *d_idata, 
     float *d_odata, int *idxs, int *oIdxs); 

template void 
reduceMin<double>(int size, int threads, int blocks, int whichKernel, double *d_idata, 
     double *d_odata, int *idxs, int *oIdxs); 

unsigned long long int my_min_max_test(int num_els) { 

    // timers 

    unsigned long long int start; 
    unsigned long long int delta; 

    int maxThreads = 256; // number of threads per block 
    int whichKernel = 6; 
    int maxBlocks = 64; 

    int testIterations = 100; 



    float* d_in = NULL; 
    float* d_out = NULL; 
    int *d_idxs = NULL; 
    int *d_oIdxs = NULL; 

    printf("%d elements\n", num_els); 
    printf("%d threads (max)\n", maxThreads); 

    int numBlocks = 0; 
    int numThreads = 0; 
    getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks, 
      numThreads); 



    // in[1024] = 34.0f; 
    // in[333] = 55.0f; 
    // in[23523] = -42.0f; 

// cudaMalloc((void**) &d_in, size); 
// cudaMalloc((void**) &d_out, size); 
// cudaMalloc((void**) &d_idxs, num_els * sizeof(int)); 

    cudaMalloc((void **) &d_in, num_els * sizeof(float)); 
    cudaMalloc((void **) &d_idxs, num_els * sizeof(int)); 
    cudaMalloc((void **) &d_out, numBlocks * sizeof(float)); 
    cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int)); 

    float* in = (float*) malloc(num_els * sizeof(float)); 
    int *idxs = (int*) malloc(num_els * sizeof(int)); 
    float* out = (float*) malloc(numBlocks * sizeof(float)); 
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int)); 

    for (int i = 0; i < num_els; i++) { 
     in[i] = (double) rand()/(double) RAND_MAX; 
     idxs[i] = i; 
    } 

    for (int i = 0; i < num_els; i++) { 

     printf("\n [%d] = %.4f", idxs[i], in[i]); 
    } 

    // copy data directly to device memory 
    cudaMemcpy(d_in, in, num_els * sizeof(float), cudaMemcpyHostToDevice); 
    cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice); 
    cudaMemcpy(d_out, out, numBlocks * sizeof(float),cudaMemcpyHostToDevice); 
    cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice); 

    // warm-up 
// reduce<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out); 
// 
// cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); 
// 
// for(int i=0; i< numBlocks; i++) 
// printf("\nFinal Result[BLK:%d]: %f", i, out[i]); 

// printf("\n Reduce CPU : %f", reduceCPU<float>(in, num_els)); 

    reduceMin<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs); 

    cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); 
    cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost); 

    for(int i=0; i< numBlocks; i++) 
     printf("\n Reduce MIN GPU idx: %d value: %f", oIdxs[i], out[i]); 


    int min_idx; 
    float min; 
    reduceMINCPU<float>(in, num_els, &min, &min_idx); 

    printf("\n\n Reduce MIN CPU idx: %d value: %f", min_idx, min); 

    cudaFree(d_in); 
    cudaFree(d_out); 
    cudaFree(d_idxs); 

    free(in); 
    free(out); 

    //system("pause"); 

    return delta; 

} 

int main(int argc, char* argv[]) { 

    printf(" GTS250 @ 70.6 GB/s - Finding min and max"); 
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n"); 

//#pragma unroll 
//for(int i = 1024*1024; i <= 32*1024*1024; i=i*2) 
//{ 
// my_min_max_test(i); 
//} 

    printf("\n Non-base 2 tests! \n"); 
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n"); 

    my_min_max_test(1024); 



// just some large numbers.... 
//my_min_max_test(14*1024*1024+38); 
//my_min_max_test(14*1024*1024+55); 
//my_min_max_test(18*1024*1024+1232); 
//my_min_max_test(7*1024*1024+94854); 

//for(int i = 0; i < 4; i++) 
//{ 
// 
// float ratio = float(rand())/float(RAND_MAX); 
// ratio = ratio >= 0 ? ratio : -ratio; 
// int big_num = ratio*18*1e6; 
// 
// my_min_max_test(big_num); 
//} 

    return 0; 
} 

Please post an [MCVE]. Do you have to use "raw CUDA", or could you use an alternative approach (such as Thrust)? –


This code will become part of a larger algorithm that I am parallelizing, so fine-grained parallelism is important here. The code is from the CUDA 7.5 code samples and was changed from a sum to a min. – Vinay
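
For reference, a minimal sketch of the Thrust alternative mentioned above might look like the following (this assumes the input values are copied into a hypothetical thrust::device_vector<float> named d_vec; it is not part of the code in the question):

 #include <thrust/device_vector.h> 
 #include <thrust/extrema.h> 

 // copy the host input (in, num_els) into a device vector 
 thrust::device_vector<float> d_vec(in, in + num_els); 

 // min_element returns an iterator to the smallest element on the device 
 thrust::device_vector<float>::iterator it = 
     thrust::min_element(d_vec.begin(), d_vec.end()); 

 int   min_idx = it - d_vec.begin();  // index of the minimum 
 float min_val = *it;                 // minimum value (read back from the device) 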

Answer


This is not a valid way to use dynamically allocated shared memory:

T *sdata = SharedMemory<T>(); 

int *sdataIdx = SharedMemory<int>(); 

Both of those pointers (sdata and sdataIdx) end up pointing to the same location.
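
A minimal sketch of the aliasing (a hypothetical kernel just for illustration, not part of the code above):

 // Both SharedMemory calls below return the start of the same dynamic 
 // extern __shared__ buffer, so the two pointers overlap completely. 
 template <class T> 
 __global__ void aliasDemo() 
 { 
     T   *vals = SharedMemory<T>();    // start of the dynamic shared buffer 
     int *idxs = SharedMemory<int>();  // the SAME address as vals 

     vals[threadIdx.x] = (T) 1;        // write a value... 
     idxs[threadIdx.x] = threadIdx.x;  // ...and immediately overwrite its bytes 
 } 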

Furthermore, even though you apparently want to double the shared memory usage (to store the min values plus the indices), you have not increased the dynamic allocation size. It is still only what your previous sum-reduction code allocated:

int smemSize = 
     (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 

In your case threads is 256, so you have allocated enough (threads*sizeof(T)) for the min storage, but nothing for the index storage.
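
As a rough illustration of the size mismatch (host-side arithmetic only, assuming T is float and threads is 256):

 int threads  = 256; 
 // what the launch currently passes as dynamic shared memory: 
 int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) 
                                : threads * sizeof(float);     // 1024 bytes 
 // what the kernel actually needs once indices are stored as well: 
 int needed   = threads * sizeof(float)   // min values 
              + threads * sizeof(int);    // indices -> 2048 bytes in total 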

We can "fix" these problems by changing the shared memory pointer setup to:

T *sdata = SharedMemory<T>(); 

int *sdataIdx = ((int *)sdata) + blockSize; 

and the dynamic allocation size to:

int smemSize = 
     (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 
smemSize += threads*sizeof(int); 

With those changes, your code produces the expected output (I commented out the loop that was printing all 1024 initial values):

$ cuda-memcheck ./t1182 
========= CUDA-MEMCHECK 
GTS250 @ 70.6 GB/s - Finding min and max 
 N   [GB/s]  [perc]  [usec]  test 

 Non-base 2 tests! 

 N   [GB/s]  [perc]  [usec]  test 
1024 elements 
256 threads (max) 

 Reduce MIN GPU idx: 484 value: 0.001125 
 Reduce MIN GPU idx: 972 value: 0.002828 

 Reduce MIN CPU idx: 484 value: 0.001125 
========= ERROR SUMMARY: 0 errors 
$ 
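
As a side note, the shuffle stage that carries the index along with the value can also be factored into a small helper, roughly like this (just a sketch; on CUDA 9 and later, __shfl_down is deprecated in favour of __shfl_down_sync with a full mask):

 __inline__ __device__ void warpReduceMinIdx(float &val, int &idx) 
 { 
     for (int offset = warpSize / 2; offset > 0; offset /= 2) { 
         float otherVal = __shfl_down(val, offset); 
         int   otherIdx = __shfl_down(idx, offset); 
         if (otherVal < val) {   // keep the smaller value and the index it came from 
             val = otherVal; 
             idx = otherIdx; 
         } 
     } 
 } 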


Thank you! In my own debugging I had got as far as realizing that something goes wrong somewhere after reading from global memory and assigning the values to shared memory, but it would have taken me much longer to track down the actual mistake. – Vinay
