I'm trying to sum all the elements of an array of unsigned char,
but the sum computed with cv::Mat differs from the result of my SSE code (below).
The SSE version gives a larger sum than the non-SSE version. Why?
For example, I get 2042115 from the SSE sum, but cv::Mat's sum gives 2041104.
__m128i srcVal;
__m128i src16bitlo;
__m128i src16bithi;
__m128i src32bitlolo;
__m128i src32bitlohi;
__m128i src32bithilo;
__m128i src32bithihi;
__m128i vsum = _mm_setzero_si128();
for (int i = 0; i < nSrcSize; i += 16)
{
srcVal = _mm_loadu_si128((__m128i*) (pSrc + i));
src16bitlo = _mm_unpacklo_epi8(srcVal, _mm_setzero_si128());
src16bithi = _mm_unpackhi_epi8(srcVal, _mm_setzero_si128());
src32bitlolo = _mm_unpacklo_epi16(src16bitlo, _mm_setzero_si128());
src32bitlohi = _mm_unpackhi_epi16(src16bitlo, _mm_setzero_si128());
src32bithilo = _mm_unpacklo_epi16(src16bithi, _mm_setzero_si128());
src32bithihi = _mm_unpackhi_epi16(src16bithi, _mm_setzero_si128());
vsum = _mm_add_epi32(src32bitlolo, vsum);
vsum = _mm_add_epi32(src32bitlohi, vsum);
vsum = _mm_add_epi32(src32bithilo, vsum);
vsum = _mm_add_epi32(src32bithihi, vsum);
// cout << "sumSrc : " << sumSrc << endl;
}
int sumSrc = vsum.m128i_i32[0] + vsum.m128i_i32[1] + vsum.m128i_i32[2] + vsum.m128i_i32[3];
//int check = sumSrc;
int remainSize = nSrcSize % 16;
if (remainSize > 0)
{
unsigned char* arrTemp = new unsigned char[16](); // 0으로 초기화
memcpy(arrTemp, pSrc + nSrcSize - remainSize -1, remainSize);
__m128i srcVal = _mm_loadu_si128((__m128i*)arrTemp);
vsum = _mm_sad_epu8(srcVal, _mm_setzero_si128());
sumSrc += vsum.m128i_i16[0] + vsum.m128i_i16[1] + vsum.m128i_i16[2] + vsum.m128i_i16[3] + vsum.m128i_i16[4] + vsum.m128i_i16[5] + vsum.m128i_i16[6] + vsum.m128i_i16[7];
}