0

I am trying to find the fastest way to add 2 equally sized vectors together. I've wrote a test and ran it on Android, iOS and macOS devices with clang compiler here is the code for macOS which is copy-pasted in codes for other platforms

#include <iostream>
#include <chrono>
#include <algorithm>
#include <array>

typedef float float4 __attribute__((ext_vector_type(4)));
typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));

template<typename T>
struct SArray {
    std::array<T, 4> arr;
    
    inline SArray& operator+=(const SArray& another){
        arr[0]+=another.arr[0];
        arr[1]+=another.arr[1];
        arr[2]+=another.arr[2];
        arr[3]+=another.arr[3];
        return *this;
    }
    
    inline SArray& operator=(T elem){
        arr[0] = arr[1] = arr[2] = arr[3] = elem;
        return *this;
    }
    
    inline SArray& operator=(const SArray& another){
        std::copy(another.arr.begin(),another.arr.end(),arr.begin());
    }
    
    inline SArray operator+(const SArray& another){
        SArray result;
        result.arr[0] = arr[0] + another.arr[0];
        result.arr[1] = arr[1] + another.arr[1];
        result.arr[2] = arr[2] + another.arr[2];
        result.arr[3] = arr[3] + another.arr[3];
        return result;
    }
    
    
};

template<typename T>
struct S {
    T x;
    T y;
    T z;
    T w;
    
    S() : x(0),y(0),z(0),w(0){
        
    }
    
    inline bool operator==(const S& another){
        return x == another.x && y == another.y && z == another.z && w == another.w;
    }
    
    inline S& operator=(const S& another){
        //        if(*this == another){
        //            return *this;
        //        }
        x = another.x;
        y = another.y;
        z = another.z;
        w = another.w;
        return *this;
    }
    
    inline S& operator=(T elem) {
        x = y = z = w = elem;
        return *this;
    }
    
    inline S& operator+=(const S& another){
        x += another.x;
        y += another.y;
        z += another.z;
        w += another.w;
        return *this;
    }
    
    inline S operator+(const S& other) {
        S s;
        s.x = x + other.x;
        s.y = y + other.y;
        s.z = z + other.z;
        s.w = w + other.w;
        return s;
    }
};


int main(int argc, const char * argv[]) {
    const int width = 4096;
    const int height = 4096;
    const int n = 100;
    {
        
        uchar4 **arr1 = new uchar4*[height];
        uchar4 **arr2 = new uchar4*[height];
        uchar4 **result = new uchar4*[height];
        
        for(int i = 0; i < height; ++i){
            arr1[i] = new uchar4[width];
            arr2[i] = new uchar4[width];
            result[i] = new uchar4[width];
        }
        
        for(int i = 0; i < height; ++i){
            for(int j = 0; j < width; ++j){
                arr1[i][j] = i;
                arr2[i][j] = j;
                result[i][j] = 0;
            }
        }
        
        
        
        auto start = std::chrono::high_resolution_clock::now();
        
        for(int times = 0;  times < n; ++ times){
            for(int i = 0; i < height; ++i){
                for(int j = 0; j < width; ++j){
                    result[i][j] += arr1[i][j] + arr2[i][j];
                }
            }
        }
        
        
        
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
        
        std::cout<<"For ext vector time is:" << elapsed.count()/1000.0f <<std::endl<< result[0][0].x<< std::endl;
    }
    {
        S<unsigned char> **arr1 = new S<unsigned char>*[height];
        S<unsigned char> **arr2 = new S<unsigned char>*[height];
        S<unsigned char> **result = new S<unsigned char>*[height];
        
        for(int i = 0; i < height; ++i){
            arr1[i] = new S<unsigned char>[width];
            arr2[i] = new S<unsigned char>[width];
            result[i] = new S<unsigned char>[width];
        }
        
        for(int i = 0; i < height; ++i){
            for(int j = 0; j < width; ++j){
                arr1[i][j] = i;
                arr2[i][j] = j;
                result[i][j] = 0;
            }
        }
        
        auto start = std::chrono::high_resolution_clock::now();
        
        for(int times = 0;  times < n; ++ times){
            for(int i = 0; i < height; ++i){
                for(int j = 0; j < width; ++j){
                    result[i][j] += arr1[i][j] + arr2[i][j];
                }
            }
        }
        
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
        
        std::cout<<"For struct time is:" << elapsed.count()/1000.0f << std::endl<< result[0][0].x<< std::endl;
    }
    {
        SArray<unsigned char>** arr1 = new SArray<unsigned char>*[height];
        SArray<unsigned char>** arr2 = new SArray<unsigned char>*[height];
        SArray<unsigned char>** result = new SArray<unsigned char>*[height];
        
        for(int i = 0; i < height; ++i){
            arr1[i] = new SArray<unsigned char>[width];
            arr2[i] = new SArray<unsigned char>[width];
            result[i] = new SArray<unsigned char>[width];
        }
        
        for(int i = 0;  i < height; ++i){
            for(int j = 0; j < width; ++j){
                arr1[i][j] = i;
                arr2[i][j] = j;
                result[i][j] = 0;
            }
        }
        
        auto start = std::chrono::high_resolution_clock::now();
        for(int times = 0;  times < n; ++ times){
            for(int i = 0; i < height; ++i){
                for(int j = 0; j < width; ++j){
                    result[i][j]+=arr1[i][j] + arr2[i][j];
                }
            }
        }
        
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);
        
        std::cout<<"For c-type array time is:" << elapsed.count()/1000.0f << std::endl << result[0][0].arr[0] << std::endl;
    }
    
    
    return 0;
}

Now, step-by-step. I've read about clang language extensions and noted that it has extended vectors, which have to be faster as clang optimizes them on CPU. typedef float float4 __attribute__((ext_vector_type(4))); and typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); Are actually the way of creating extended vectors as mentioned in the documentation. In my first test-case I use these extended vectors. The second test case tests plain struct addition, as you see in my code

template<typename T>
struct S {
    T x;
    T y;
    T z;
    T w;
    
    S() : x(0),y(0),z(0),w(0){
        
    }
    
    inline bool operator==(const S& another){
        return x == another.x && y == another.y && z == another.z && w == another.w;
    }
    
    inline S& operator=(const S& another){
        //        if(*this == another){
        //            return *this;
        //        }
        x = another.x;
        y = another.y;
        z = another.z;
        w = another.w;
        return *this;
    }
    
    inline S& operator=(T elem) {
        x = y = z = w = elem;
        return *this;
    }
    
    inline S& operator+=(const S& another){
        x += another.x;
        y += another.y;
        z += another.z;
        w += another.w;
        return *this;
    }
    
    inline S operator+(const S& other) {
        S s;
        s.x = x + other.x;
        s.y = y + other.y;
        s.z = z + other.z;
        s.w = w + other.w;
        return s;
    }
};

And the third test case is similar to second case's struct but it uses array instead, here it is

template<typename T>
struct SArray {
    std::array<T, 4> arr;
    
    inline SArray& operator+=(const SArray& another){
        arr[0]+=another.arr[0];
        arr[1]+=another.arr[1];
        arr[2]+=another.arr[2];
        arr[3]+=another.arr[3];
        return *this;
    }
    
    inline SArray& operator=(T elem){
        arr[0] = arr[1] = arr[2] = arr[3] = elem;
        return *this;
    }
    
    inline SArray& operator=(const SArray& another){
        std::copy(another.arr.begin(),another.arr.end(),arr.begin());
    }
    
    inline SArray operator+(const SArray& another){
        SArray result;
        result.arr[0] = arr[0] + another.arr[0];
        result.arr[1] = arr[1] + another.arr[1];
        result.arr[2] = arr[2] + another.arr[2];
        result.arr[3] = arr[3] + another.arr[3];
        return result;
    }
    
    
};

As I mentioned I tested this code on different platforms where C++ is compiled with clang and here are the results with different clang optimization flags.

-iOS
    -Os release build
        For ext vector time is: 8.603s
        For struct time is: 4.665s
        For c-type array time is: 4.677s
    -Ofast release build
        For ext vector time is: 8.604s
        For struct time is: 1.186s
        For c-type array time is: 0.942s
    -O3 release build
        For ext vector time is:8.603
        For struct time is:1.186
        For c-type array time is:0.933
-Android
    -Os release build
        For ext vector time is: 21.417999s
        For struct time is: 11.012000s
        For c-type array time is: 11.126000s
    
    -Ofast release build
        For ext vector time is:21.436s
        For struct time is:4.717s
        For c-type array time is: 4.813s
    
    -O3 release build
        For ext vector time is: 21.629999s
        For struct time is: 4.721s
        For c-type array time is: 4.806s
-Mac OS
    -Os release build   
        For ext vector time is: 1.721s
        For struct time is: 3.26s
        For c-type array time is: 3.156s
    
    -Ofast release build
        For ext vector time is: 1.75s
        For struct time is: 3.168s
        For c-type array time is: 3.09s
    
    -O3 release build
        For ext vector time is: 1.699s
        For struct time is: 3.204s
        For c-type array time is: 3.179s

As you can see extended-vectors are "losing" both on iOS and Android devices, while they are fastest on Mac. By the way my target platforms are Android iOS and Linux. I need the fastest way to do these kind of operations on all these three platforms, or find the fastest for each individually and use them accordingly depending on platform. I've also heard about SIMD commands, how can I achieve SIMD in this case? I've read that clang automatically vectorizes for loops. So is this the fastest I can get on the platforms I mentioned? Or is there a better way to do that? Does std::transform do the operation SIMD?

Hrant Nurijanyan
  • 789
  • 2
  • 9
  • 26
  • Why would you tell the compiler to use 4-byte vectors of 4 `char` elements (`uchar4`)? Of course that's going to be slow unless it "vectorizes" that into 4x uchar4 in a 16-byte XMM register so it can take full advantage of `movdqu` + `paddb`, instead of only doing 4-byte loads with `movd`. (Or the ARM equivalent) – Peter Cordes Dec 01 '20 at 01:54
  • Also, generally avoid arrays of pointers. Just use a big contiguous 2D array. [Malloc a 2D array in C](https://stackoverflow.com/q/36890624). Or in C++, see [How do I declare a 2d array in C++ using new?](https://stackoverflow.com/a/28841507), or in C++11 and later, [another answer](//stackoverflow.com/a/16239446/224132) shows `new` directly supports 2D arrays. (C++ compilers that support variable-length arrays may handly runtime variable len). Only use array-of-pointers if you need to be able to rearrange rows by swapping pointers, or replacing rows, or having one row shorter than another. – Peter Cordes Dec 01 '20 at 02:01

0 Answers0