1

I am implementing pattern matching algorithm, by moving template gradient info over entire target's gradient image , that too at each rotation (-60 to 60). I have already saved the template info for each rotation ,i.e. 121 templates are already preprocessed and saved.

But the issue is, this is consuming lot of time (approx 110ms), so decided to split the matching at set of rotations (-60 to -30 , -30 to 0, 0 to 30 and 30 to 60) into 4 threads, but threading is taking more time that single process (approx 115ms to 120ms).

Snippet of code is...


#define MAXTARGETNUM    64
MatchResultA totalResultsTemp[MAXTARGETNUM];



void CShapeMatch::match(ShapeInfo *ShapeInfoVec, search_region SearchRegion, float MinScore, float Greediness, int width,int height, int16_t  *pBufGradX ,int16_t  *pBufGradY,float  *pBufMag,  bool corr)
{
  MatchResultA resultsPerDeg[MAXTARGETNUM];
....
....
    int startX =  SearchRegion.StartX;
    int startY =  SearchRegion.StartY;
    int endX   =  SearchRegion.EndX;
    int endY   =  SearchRegion.EndY;
    
    float AngleStep  = SearchRegion.AngleStep;
    float AngleStart = SearchRegion.AngleStart;
    float AngleStop = SearchRegion.AngleStop;

    int startIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStart/AngleStep;
    int stopIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStop/AngleStep;

for (int k = startIndex; k < stopIndex ; k++){
         .... 
         for(int j = startY; j < endY; j++){
            for(int i = startX; i < endX; i++){
                
                    for(int m = 0; m < ShapeInfoVec[k].NoOfCordinates; m++)
                    {
                        curX = i + (ShapeInfoVec[k].Coordinates + m)->x;        // template X coordinate
                        curY = j + (ShapeInfoVec[k].Coordinates + m)->y ;       // template Y coordinate
                        
                        iTx = *(ShapeInfoVec[k].EdgeDerivativeX + m);           // template X derivative
                        iTy = *(ShapeInfoVec[k].EdgeDerivativeY + m);           // template Y derivative
                        iTm   = *(ShapeInfoVec[k].EdgeMagnitude + m);           // template gradients magnitude
                        
                        if(curX < 0 ||curY < 0||curX > width-1 ||curY > height-1)
                            continue;
                        offSet = curY*width + curX;
                        iSx = *(pBufGradX + offSet);            // get corresponding  X derivative from source image
                        iSy = *(pBufGradY + offSet);            // get corresponding  Y derivative from source image
                        iSm = *(pBufMag   + offSet);

                        if (PartialScore > MinScore)
                    {   
                    
                        float Angle = ShapeInfoVec[k].Angel;
                        bool hasFlag = false;
                        for(int n = 0; n < resultsNumPerDegree; n++)
                        {       
                            if(abs(resultsPerDeg[n].CenterLocX - i) < 5 && abs(resultsPerDeg[n].CenterLocY - j) < 5)
                            {   
                                hasFlag = true;
                                if(resultsPerDeg[n].ResultScore < PartialScore)
                                {   
                                    resultsPerDeg[n].Angel = Angle;
                                    resultsPerDeg[n].CenterLocX = i;
                                    resultsPerDeg[n].CenterLocY = j;
                                    resultsPerDeg[n].ResultScore = PartialScore;
                                    
                                    break;
                                }
                            }
                        }
                        if(!hasFlag)
                        {   
                            resultsPerDeg[resultsNumPerDegree].Angel = Angle;
                            resultsPerDeg[resultsNumPerDegree].CenterLocX = i;
                            resultsPerDeg[resultsNumPerDegree].CenterLocY = j;
                            resultsPerDeg[resultsNumPerDegree].ResultScore = PartialScore;
    
                            resultsNumPerDegree ++;
                        }
                        minScoreTemp = minScoreTemp < PartialScore ? PartialScore : minScoreTemp;   
                    }
                }
            }
            
            
            
            for(int i = 0; i < resultsNumPerDegree; i++)
                {
                    mtx.lock();
                    totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
                    totalResultsNum++;
                    mtx.unlock();
                }
        
            n++;
}

void CallerFunction(){
            int16_t  *pBufGradX   = (int16_t *) malloc(bufferSize * sizeof(int16_t));
            int16_t  *pBufGradY   = (int16_t *) malloc(bufferSize * sizeof(int16_t));
            float    *pBufMag     = (float *) malloc(bufferSize * sizeof(float));
          
          clock_t start = clock();

          float temp_stop = SearchRegion->AngleStop;

            SearchRegion->AngleStop = -30;
            thread t1(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,  width, height, pBufGradX ,pBufGradY,pBufMag, corr);

            SearchRegion->AngleStart = -30;
            SearchRegion->AngleStop=0;
            thread t2(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,  width, height, pBufGradX ,pBufGradY,pBufMag, corr);            

            SearchRegion->AngleStart = 0;
            SearchRegion->AngleStop=30;
            thread t3(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,width, height, pBufGradX ,pBufGradY,pBufMag, corr);          

            SearchRegion->AngleStart = 30;
            SearchRegion->AngleStop=temp_stop;
            thread t4(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,width, height, pBufGradX ,pBufGradY,pBufMag, corr);

            t1.join();
            t2.join();
            t3.join();
            t4.join();
            
            clock_t end = clock();
            cout  << 1000*(double)(end-start)/CLOCKS_PER_SEC << endl;
}

As we can see there are plenty of heap access but they just are read-only. Only totalResultTemp and totalResultNum are shared global resource on which write are performed.

My PC configuration is, i5-7200U CPU @ 2.50GHz 4 cores 4 Gig RAM Ubuntu 18

wasi akbar
  • 31
  • 6
  • How are you measuring the time? – Paul Floyd Aug 21 '20 at 11:52
  • 1
    details do matter and for benchmarking even more. Did you turn on optimizations? Please include your compiler setup. How did you measure the time? Please include the benchmark in the question – 463035818_is_not_an_ai Aug 21 '20 at 11:57
  • updated the code , fetch clock() from caller before and after thread execution – wasi akbar Aug 21 '20 at 12:01
  • 2
    Threading isn't a magic bullet that always reduces execution time. There are overheads with threading (creating threads, switching context between them, etc) that take time. Unless the calculations can genuinely be done in parallel (e.g. multiple physical cores) it is quite normal that total elapsed time exceeds time to do everything in one thread. The fact you are joining all four threads means that all wait for the last one to complete - if there is only one CPU core being used, the BEST POSSIBLE CASE is that the elapsed time with threading is equal to to the time to do it all sequentially. – Peter Aug 21 '20 at 12:01
  • I have also checked the CPU allocation and I can confirm that all 4 threads are allocated to different CPU. – wasi akbar Aug 21 '20 at 12:12
  • 1
    Does this answer your question? [C: using clock() to measure time in multi-threaded programs](https://stackoverflow.com/questions/2962785/c-using-clock-to-measure-time-in-multi-threaded-programs) — `clock` measures the time allocated to all threads, it’s not wall clock time. – Cris Luengo Aug 23 '20 at 14:42
  • 1
    @CrisLuengo, yeah man !! thanx . I was using clock() which is CPU ticks not actual wall clock. Now I can see my threads are working as expected. – wasi akbar Aug 24 '20 at 07:37

2 Answers2

0
for(int i = 0; i < resultsNumPerDegree; i++)
                {
                    mtx.lock();
                    totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
                    totalResultsNum++;
                    mtx.unlock();
                }

You writing into static array, and mutexes are really time consuming. Instead of creating locks try to use std::atomic_int, or in my opinion even better, just pass to function exact place where to store result, so problem with sync is not your problem anymore

  • 1
    even if I comment out this loop , means do not consider that results are available or not, still no improvement, so I think this part don't matter much. – wasi akbar Aug 21 '20 at 12:07
  • have you considered using a profiler? I recommend to increase fragmentation of your code to functions, so profiler, will give you more precise results – Krzysztof Mochocki Aug 21 '20 at 12:13
-1

POSIX Threads in c/c++ are not concurrent since the time assigned by the operative system to each parent process must be split into the number of threads it has. Thus, your algorithm is executing only core. To leverage multicore technology, you must use OpenMP. This interface library let you split your algorithm in different physic cores. This is a good OpenMP tutorial