I am trying to make vectorization for this formula
res[i] = 255 - ((255 - img1[i]) * (255 - img2[i])) / 255;
I have already written the vectorization, but it seems not working correctly
about code:
- load from memory using this _mm256_loadu_epi8
- t = (255-img) subtract _mm256_subs_epu8(one_value_vector8, partImg1)
- t * b for multiplication I used function suggested here
here is the code
v0 and v1 are 8 bit input vectors. r0 and r1 are 18 bit results of multiplications
void _mm256_mul_epi8(__m256i v0, __m256i v1, __m256i* r0, __m256i* r1){
__m256i tmp0, tmp1;
__m128i m128_v0, m128_v1;
m128_v0 = _mm256_extractf128_si256 (v0, 0);
m128_v1 = _mm256_extractf128_si256 (v1, 0);
tmp0= _mm256_cvtepi8_epi16 (m128_v0); //printf("\ntmp0 = ");_mm256_print_epi16(tmp0);
tmp1= _mm256_cvtepi8_epi16 (m128_v1); //printf("\ntmp1 = ");_mm256_print_epi16(tmp1);
*r0 =_mm256_mullo_epi16(tmp0, tmp1);
m128_v0 = _mm256_extractf128_si256 (v0, 1);
m128_v1 = _mm256_extractf128_si256 (v1, 1);
tmp0= _mm256_cvtepi8_epi16 (m128_v0); //printf("\ntmp0 = ");_mm256_print_epi16(tmp0);
tmp1= _mm256_cvtepi8_epi16 (m128_v1); //printf("\ntmp1 = ");_mm256_print_epi16(tmp1);
*r1 =_mm256_mullo_epi16(tmp0, tmp1);
}
I am working with 8-bit unsigned integer but after the multiplication, I started to work with 16 bit since the result of multiplication is bigger than 255. 4. divByOne1 = (r0/255) then dividing using _mm256_div_epu16(r0, one_value_vector16); 5. partRes = 255 - divByOne get the last result using same like 1 _mm256_subs_epu16 6. store the result back res[i + j] = partRes1.m256i_u16[j];
the result of vectorization differs from the needed one at some indexes here is an example
2
5
8
11
14
0
3
6
9
12
15
1
4
7
10
13
2
5
8
11
14
strangely enough, these indexes get repeated so maybe I have a problem with loading the pixels.
my full code
std::vector<unsigned char> vectorizedImplementation(const std::vector<unsigned char>& img1, const std::vector<unsigned char>& img2)
{
std::vector<unsigned char> res(img1.size());
__m256i partImg1, partImg2, partRes1, partRes2;
int vari = 32;
int partSize = res.size() / vari * vari;
//255 - ((255-Target) * (255-Blend))/255
__m256i one_value_vector16 = _mm256_set_epi16(255, 255, 255, 255, 255, 255, 255, 255, 255, `255,`
255, 255, 255, 255, 255, 255);
__m256i one_value_vector8 = _mm256_set_epi8(255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255);
//https://www.codeproject.com/Articles/874396/Crunching-Numbers-with-AVX-and-AVX
for (int i = 0; i < partSize; i += vari) {
partImg1 = _mm256_loadu_epi8(&img1[i]);
partImg2 = _mm256_loadu_epi8(&img2[i]);
// t = (255-img) , b = (255-img)
__m256i t = _mm256_subs_epu8(one_value_vector8, partImg1);//122*201>255
__m256i b = _mm256_subs_epu8(one_value_vector8, partImg2);
//r0 = t * b , r1 = t * b
__m256i r0, r1;//for 16 bit results
_mm256_mul_epi8(t, b, &r0, &r1);//function from the link provided above
//divByOne1 = (r0/255) , divByOne2 = (r1/255)
__m256i divByOne1 = _mm256_div_epu16(r0, one_value_vector16);
__m256i divByOne2 = _mm256_div_epu16(r1, one_value_vector16);
//partRes = 255 - divByOne
partRes1 = _mm256_subs_epu16(one_value_vector16, divByOne1);
partRes2 = _mm256_subs_epu16(one_value_vector16, divByOne2);
int j = 0;
for (; j < vari/2; j++) {
res[i + j] = partRes1.m256i_u16[j];
}
int k = 0;
for (; j < vari; j++,k++) {
res[i + j] = partRes2.m256i_u16[k];
}
}
return res;
}