I am trying to find the fastest way to add 2 equally sized vectors together. I've wrote a test and ran it on Android, iOS and macOS devices with clang compiler here is the code for macOS which is copy-pasted in codes for other platforms
#include <iostream>
#include <chrono>
#include <algorithm>
#include <array>
typedef float float4 __attribute__((ext_vector_type(4)));
typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
template<typename T>
struct SArray {
std::array<T, 4> arr;
inline SArray& operator+=(const SArray& another){
arr[0]+=another.arr[0];
arr[1]+=another.arr[1];
arr[2]+=another.arr[2];
arr[3]+=another.arr[3];
return *this;
}
inline SArray& operator=(T elem){
arr[0] = arr[1] = arr[2] = arr[3] = elem;
return *this;
}
inline SArray& operator=(const SArray& another){
std::copy(another.arr.begin(),another.arr.end(),arr.begin());
}
inline SArray operator+(const SArray& another){
SArray result;
result.arr[0] = arr[0] + another.arr[0];
result.arr[1] = arr[1] + another.arr[1];
result.arr[2] = arr[2] + another.arr[2];
result.arr[3] = arr[3] + another.arr[3];
return result;
}
};
template<typename T>
struct S {
T x;
T y;
T z;
T w;
S() : x(0),y(0),z(0),w(0){
}
inline bool operator==(const S& another){
return x == another.x && y == another.y && z == another.z && w == another.w;
}
inline S& operator=(const S& another){
// if(*this == another){
// return *this;
// }
x = another.x;
y = another.y;
z = another.z;
w = another.w;
return *this;
}
inline S& operator=(T elem) {
x = y = z = w = elem;
return *this;
}
inline S& operator+=(const S& another){
x += another.x;
y += another.y;
z += another.z;
w += another.w;
return *this;
}
inline S operator+(const S& other) {
S s;
s.x = x + other.x;
s.y = y + other.y;
s.z = z + other.z;
s.w = w + other.w;
return s;
}
};
int main(int argc, const char * argv[]) {
const int width = 4096;
const int height = 4096;
const int n = 100;
{
uchar4 **arr1 = new uchar4*[height];
uchar4 **arr2 = new uchar4*[height];
uchar4 **result = new uchar4*[height];
for(int i = 0; i < height; ++i){
arr1[i] = new uchar4[width];
arr2[i] = new uchar4[width];
result[i] = new uchar4[width];
}
for(int i = 0; i < height; ++i){
for(int j = 0; j < width; ++j){
arr1[i][j] = i;
arr2[i][j] = j;
result[i][j] = 0;
}
}
auto start = std::chrono::high_resolution_clock::now();
for(int times = 0; times < n; ++ times){
for(int i = 0; i < height; ++i){
for(int j = 0; j < width; ++j){
result[i][j] += arr1[i][j] + arr2[i][j];
}
}
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
std::cout<<"For ext vector time is:" << elapsed.count()/1000.0f <<std::endl<< result[0][0].x<< std::endl;
}
{
S<unsigned char> **arr1 = new S<unsigned char>*[height];
S<unsigned char> **arr2 = new S<unsigned char>*[height];
S<unsigned char> **result = new S<unsigned char>*[height];
for(int i = 0; i < height; ++i){
arr1[i] = new S<unsigned char>[width];
arr2[i] = new S<unsigned char>[width];
result[i] = new S<unsigned char>[width];
}
for(int i = 0; i < height; ++i){
for(int j = 0; j < width; ++j){
arr1[i][j] = i;
arr2[i][j] = j;
result[i][j] = 0;
}
}
auto start = std::chrono::high_resolution_clock::now();
for(int times = 0; times < n; ++ times){
for(int i = 0; i < height; ++i){
for(int j = 0; j < width; ++j){
result[i][j] += arr1[i][j] + arr2[i][j];
}
}
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
std::cout<<"For struct time is:" << elapsed.count()/1000.0f << std::endl<< result[0][0].x<< std::endl;
}
{
SArray<unsigned char>** arr1 = new SArray<unsigned char>*[height];
SArray<unsigned char>** arr2 = new SArray<unsigned char>*[height];
SArray<unsigned char>** result = new SArray<unsigned char>*[height];
for(int i = 0; i < height; ++i){
arr1[i] = new SArray<unsigned char>[width];
arr2[i] = new SArray<unsigned char>[width];
result[i] = new SArray<unsigned char>[width];
}
for(int i = 0; i < height; ++i){
for(int j = 0; j < width; ++j){
arr1[i][j] = i;
arr2[i][j] = j;
result[i][j] = 0;
}
}
auto start = std::chrono::high_resolution_clock::now();
for(int times = 0; times < n; ++ times){
for(int i = 0; i < height; ++i){
for(int j = 0; j < width; ++j){
result[i][j]+=arr1[i][j] + arr2[i][j];
}
}
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
std::cout<<"For c-type array time is:" << elapsed.count()/1000.0f << std::endl << result[0][0].arr[0] << std::endl;
}
return 0;
}
Now, step-by-step. I've read about clang language extensions and noted that it has extended vectors, which have to be faster as clang optimizes them on CPU. typedef float float4 __attribute__((ext_vector_type(4)));
and typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
Are actually the way of creating extended vectors as mentioned in the documentation. In my first test-case I use these extended vectors. The second test case tests plain struct addition, as you see in my code
template<typename T>
struct S {
T x;
T y;
T z;
T w;
S() : x(0),y(0),z(0),w(0){
}
inline bool operator==(const S& another){
return x == another.x && y == another.y && z == another.z && w == another.w;
}
inline S& operator=(const S& another){
// if(*this == another){
// return *this;
// }
x = another.x;
y = another.y;
z = another.z;
w = another.w;
return *this;
}
inline S& operator=(T elem) {
x = y = z = w = elem;
return *this;
}
inline S& operator+=(const S& another){
x += another.x;
y += another.y;
z += another.z;
w += another.w;
return *this;
}
inline S operator+(const S& other) {
S s;
s.x = x + other.x;
s.y = y + other.y;
s.z = z + other.z;
s.w = w + other.w;
return s;
}
};
And the third test case is similar to second case's struct but it uses array instead, here it is
template<typename T>
struct SArray {
std::array<T, 4> arr;
inline SArray& operator+=(const SArray& another){
arr[0]+=another.arr[0];
arr[1]+=another.arr[1];
arr[2]+=another.arr[2];
arr[3]+=another.arr[3];
return *this;
}
inline SArray& operator=(T elem){
arr[0] = arr[1] = arr[2] = arr[3] = elem;
return *this;
}
inline SArray& operator=(const SArray& another){
std::copy(another.arr.begin(),another.arr.end(),arr.begin());
}
inline SArray operator+(const SArray& another){
SArray result;
result.arr[0] = arr[0] + another.arr[0];
result.arr[1] = arr[1] + another.arr[1];
result.arr[2] = arr[2] + another.arr[2];
result.arr[3] = arr[3] + another.arr[3];
return result;
}
};
As I mentioned I tested this code on different platforms where C++ is compiled with clang and here are the results with different clang optimization flags.
-iOS
-Os release build
For ext vector time is: 8.603s
For struct time is: 4.665s
For c-type array time is: 4.677s
-Ofast release build
For ext vector time is: 8.604s
For struct time is: 1.186s
For c-type array time is: 0.942s
-O3 release build
For ext vector time is:8.603
For struct time is:1.186
For c-type array time is:0.933
-Android
-Os release build
For ext vector time is: 21.417999s
For struct time is: 11.012000s
For c-type array time is: 11.126000s
-Ofast release build
For ext vector time is:21.436s
For struct time is:4.717s
For c-type array time is: 4.813s
-O3 release build
For ext vector time is: 21.629999s
For struct time is: 4.721s
For c-type array time is: 4.806s
-Mac OS
-Os release build
For ext vector time is: 1.721s
For struct time is: 3.26s
For c-type array time is: 3.156s
-Ofast release build
For ext vector time is: 1.75s
For struct time is: 3.168s
For c-type array time is: 3.09s
-O3 release build
For ext vector time is: 1.699s
For struct time is: 3.204s
For c-type array time is: 3.179s
As you can see extended-vectors are "losing" both on iOS and Android devices, while they are fastest on Mac. By the way my target platforms are Android iOS and Linux. I need the fastest way to do these kind of operations on all these three platforms, or find the fastest for each individually and use them accordingly depending on platform. I've also heard about SIMD commands, how can I achieve SIMD in this case? I've read that clang automatically vectorizes for loops. So is this the fastest I can get on the platforms I mentioned? Or is there a better way to do that? Does std::transform do the operation SIMD?