SSEバージョンで結果が異なる

SSEを使用するようにコードの一部を書き直そうとしています。しかし、何らかの理由で、SSEバージョンは元のコードと異なる結果を返します。たとえば、1.47になるはずのところが209.1になります。

なぜでしょうか？関数全体はこちら（元の投稿のリンク先）にあります。

// Thin value wrapper around one SSE register holding four packed floats.
// Every arithmetic operator works lane-wise on all four floats at once.
struct vec_ps
{
    __m128 value;

    // Zero all four lanes so a default-constructed vec_ps never carries an
    // indeterminate register value.
    vec_ps() : value(_mm_setzero_ps()) {}

    // Deliberately implicit: broadcasts one scalar into all four lanes, which
    // is what lets mixed expressions such as `2.0f * v` or `v + c` compile.
    vec_ps(float scalar) : value(_mm_set1_ps(scalar)) {}

    // Deliberately implicit: wraps a raw register produced by an intrinsic.
    vec_ps(__m128 reg) : value(reg) {}

    // Copy construction and copy assignment are left to the compiler; the
    // only member is a plain register value, so member-wise copy is correct.

    // Lane-wise addition.
    vec_ps& operator+=(const vec_ps& other)
    {
        value = _mm_add_ps(value, other.value);
        return *this;
    }

    // Lane-wise subtraction.
    vec_ps& operator-=(const vec_ps& other)
    {
        value = _mm_sub_ps(value, other.value);
        return *this;
    }

    // Lane-wise multiplication.
    vec_ps& operator*=(const vec_ps& other)
    {
        value = _mm_mul_ps(value, other.value);
        return *this;
    }

    // Lane-wise division.
    vec_ps& operator/=(const vec_ps& other)
    {
        value = _mm_div_ps(value, other.value);
        return *this;
    }

    // Loads four consecutive floats starting at ptr.
    // Uses the aligned load instruction, so ptr must be 16-byte aligned.
    // Takes a pointer-to-const because the source is only read.
    static vec_ps load(const float* ptr)
    {
        return vec_ps(_mm_load_ps(ptr));
    }

    // Writes the four lanes of `other` to ptr with a non-temporal store.
    // ptr must be 16-byte aligned.
    static void stream(float* ptr, const vec_ps& other)
    {
        _mm_stream_ps(ptr, other.value);
    }

    // Writes the four lanes of this object to ptr with a non-temporal store.
    // ptr must be 16-byte aligned. Does not modify the object, hence const.
    void stream(float* ptr) const
    {
        _mm_stream_ps(ptr, value);
    }
};

// Lane-wise sum of two vectors; neither operand is modified.
vec_ps operator+(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps sum(lhs);
    sum += rhs;
    return sum;
}

// Lane-wise difference (lhs - rhs); neither operand is modified.
vec_ps operator-(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps difference(lhs);
    difference -= rhs;
    return difference;
}

// Lane-wise product of two vectors; neither operand is modified.
vec_ps operator*(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps product(lhs);
    product *= rhs;
    return product;
}

// Lane-wise quotient (lhs / rhs); neither operand is modified.
vec_ps operator/(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps quotient(lhs);
    quotient /= rhs;
    return quotient;
}

void foo(/*...*/) 
{ 
     std::vector<float, tbb::cache_aligned_allocator<float>> ref_mu(w*h); 
     std::vector<float, tbb::cache_aligned_allocator<float>> cmp_mu(w*h); 
     std::vector<float, tbb::cache_aligned_allocator<float>> ref_sigma_sqd(w*h); 
     std::vector<float, tbb::cache_aligned_allocator<float>> cmp_sigma_sqd(w*h); 
     std::vector<float, tbb::cache_aligned_allocator<float>> sigma_both(w*h); 
     int size = w*h*sizeof(float); 

     /*...*/ 

     float ssim_sum = 0.0; 
     float ssim_sum2 = 0.0; 

     vec_ps ssim_sum_ps(0.0f);  

     for(int n = 0; n < size/16; ++n) 
     { 
      auto ref_mu_ps   = vec_ps::load(ref_mu.data()  + n*4); 
      auto cmp_mu_ps   = vec_ps::load(cmp_mu.data()  + n*4); 
      auto sigma_both_ps  = vec_ps::load(sigma_both.data() + n*4); 
      auto ref_sigma_sqd_ps = vec_ps::load(ref_sigma_sqd.data() + n*4); 
      auto cmp_sigma_sqd_ps = vec_ps::load(cmp_sigma_sqd.data() + n*4); 

      auto numerator = (2.0f * ref_mu_ps * cmp_mu_ps + C1) * (2.0f * sigma_both_ps + C2); 
      auto denominator = (ref_mu_ps*ref_mu_ps + cmp_mu_ps*cmp_mu_ps + C1) * (ref_sigma_sqd_ps + cmp_sigma_sqd_ps + C2); 
      ssim_sum_ps += numerator/denominator; 
     } 

     for(int n = 0; n < 4; ++n) 
      ssim_sum2 += ssim_sum_ps.value.m128_f32[n]; 

     for (int y = 0; y < h; ++y) 
     { 
      int offset = y*w; 
      for (int x = 0; x < w; ++x, ++offset) 
      {   
       float numerator = (2.0f * ref_mu[offset] * cmp_mu[offset] + C1) * (2.0f * sigma_both[offset] + C2); 
       float denominator = (ref_mu[offset]*ref_mu[offset] + cmp_mu[offset]*cmp_mu[offset] + C1) * (ref_sigma_sqd[offset] + cmp_sigma_sqd[offset] + C2); 
       ssim_sum += numerator/denominator;     
      } 
     } 
     assert(ssim_sum2 == ssim_sum); // FAILS! 
}

出典

2012-01-15 ronag

自分でデバッグすることができます。デバッガで実行するか、printf呼び出しを追加して中間結果を出力します。期待どおりに動作していないステップを分離するときは、最小のテストケースを書き込んで、ここでそれについて尋ねてください。しかし、「ここにはコードの壁があり、何が間違っているかを把握する」というのは良い質問ではありません。 –

@BenVoigt: もちろん、おっしゃるとおりです。ただ、ご提案いただいたことは投稿前にすでに試しており、それでも原因を突き止められませんでした。 – ronag

どのようなコード行が「間違った」結果を生むのですか？ TBBアロケータなどを削除して単純化することはできますか？ –

上のコメントも質問への回答と考えられますが、そのうえで確認です：w*hが4で割り切れるという保証はありますか？そうでない場合、SSEバージョンの最後の反復はランダムな値に基づいて計算されます。また、ある場所ではsizeof(float)を使い、別の場所では4*sizeof(float)の代わりに16を使っているのは、少し紛らわしいです。floatのサイズを計算から外してはどうでしょうか？さらに、非SSEバージョンが領域全体を単純に走査するのではなく、行列の幅と高さに沿って走査しているのはなぜですか？

出典

2012-01-19 23:16:12

SSEバージョンとの別の結果

答えて

関連する問題