28

I have an array A with zeros and ones. I want to find the sum of all numbers in A. I want to test two functions:

First function

void test1(int curIndex){
    if(curIndex == size) return;
    test1(curIndex+1);
    s+=A[curIndex];
}

Second function

void test2(int curIndex){
    if(curIndex == size) return;
    s+=A[curIndex];
    test2(curIndex+1);
}

I used the PAPI library to count the number of instructions, here is the entire experiment:

#include <iostream>
#include <fstream>
#include "Statistics.h"

using namespace std;


int size;
int *A;
int s;

void test3(int curIndex){
    if(curIndex == size) return;
    test3(curIndex+1);
    s+=A[curIndex];
}

int main(int argc, char* argv[]){

    size = atoi(argv[1]);
    if(argc!=2){
        cout<<"type ./executable size{odd integer}"<<endl;
        return 1;
    }
    if(size%2!=1){
        cout<<"size must be an odd number"<<endl;
        return 1;
    }
    A = new int[size];
    int i;
    for(i=0;i<size;i++){
        if(i%2==0){
            A[i] = false;
        }
        else{
            A[i] = true;
        }
    }

    Statistics stat(1);
    stat.start();
    test3(0);
    stat.stop();
    stat.printWithHelp();
    cout<<s<<endl;

    return 0;
}

Here is the Statistics.h file:

#ifndef TRIPLETPARSER_STATISTICS_H
#define TRIPLETPARSER_STATISTICS_H

#include <time.h>
#include <unistd.h>
#include <fstream>
#include <papi.h>
#include <iostream>
#include <iostream>

#define BILLION  1000000000LL

using namespace std;

class Statistics {

private:
    timespec s, e;
    /*
    PAPI_BR_CN  Conditional branch instructions
    PAPI_BR_INS     Branch instructions
    PAPI_BR_MSP     Conditional branch instructions mispredicted
    PAPI_BR_NTK     Conditional branch instructions not taken
    PAPI_BR_PRC     Conditional branch instructions correctly predicted
    PAPI_BR_TKN     Conditional branch instructions taken
    PAPI_BR_UCN     Unconditional branch instructions
    PAPI_BRU_IDL    Cycles branch units are idle
    PAPI_BTAC_M     Branch target address cache misses

    PAPI_TLB_DM     Data translation lookaside buffer misses
    */
    int events[10];  // , PAPI_L2_TCA,PAPI_L3_TCM,PAPI_L3_TCA PAPI_BR_CN, PAPI_BR_PRC}; //type of events we are interested in
    int num_hwcntrs;  //total amount of events stored in 'events' array
    long long values[10];
    long long counters[10];

    void handle_error(int err){
        std::cerr << "PAPI error: " << err << std::endl;
    }

public:
    Statistics(int papi){
        for(size_t i = 0; i< 10; i++)
            counters[i]=0.0;

        switch(papi){
            case 0:
                num_hwcntrs = 0;
                break;
            case 1:
                num_hwcntrs = 6;
                events[0] = PAPI_L2_TCA;
                events[1] = PAPI_L3_TCA;
                events[2] = PAPI_L3_TCM;
                events[3] = PAPI_TOT_INS;
                events[4] = PAPI_BR_INS;
                events[5] = PAPI_BR_MSP;
                break;
        }

    }

    void start(){

        for(size_t i = 0; i< 10; i++)
            counters[i]=0.0;

        if (num_hwcntrs != 0 && PAPI_start_counters(events, num_hwcntrs) != PAPI_OK)
            handle_error(1);
    }


    void start(float ratio){

        if (num_hwcntrs != 0 && PAPI_start_counters(events, num_hwcntrs) != PAPI_OK)
            handle_error(1);
    }


    void stop(){
        if (num_hwcntrs != 0 && PAPI_stop_counters(values, num_hwcntrs) != PAPI_OK)
            handle_error(1);
        update();
    }

    void stop(float ratio){
        if (num_hwcntrs != 0 && PAPI_stop_counters(values, num_hwcntrs) != PAPI_OK)
            handle_error(1);
        update();
    }


    void update(){
        for(size_t i = 0; i < num_hwcntrs; i++)
            counters[i] += values[i];
    }

    void print(){
        for(int i=0;i<num_hwcntrs;i++)
            std::cout << counters[i] << "\t";
        //cout<<"L2 cache miss ratio: "<<counters[1]/(double)counters[0]<<endl;
        //cout<<"L3 cache miss ratio: "<<counters[3]/(double)counters[2]<<endl;
    }

    void printWithHelp(){
        cout<<"L2 accesses: "<<counters[0]<<endl;
        cout<<"L2 miss/access ratio: "<<(double)counters[1]/counters[0]<<endl;
        cout<<"L3 accesses: "<<counters[1]<<endl;
        cout<<"L3 misses: "<<counters[2]<<endl;
        cout<<"L3 miss/access ratio: "<<(double)counters[2]/counters[1]<<endl;
        cout<<"Instructions: "<<counters[3]<<endl;
        cout<<"Branches: "<<counters[4]<<endl;
        cout<<"Branch mispredictions: "<<counters[5]<<endl;
        cout<<"Branch miss/predict ratio: "<<(double)counters[5]/counters[4]<<endl;
    }

    void print(float avg_ratio){
        for (int i = 0; i<num_hwcntrs; i++)
            std::cout << (double)(avg_ratio*counters[i]) << "\t";
    }

};

#endif //TRIPLETPARSER_STATISTICS_H

This is the output that I get from the first function when the size of A is 111,111

L2 accesses: 24126
L2 miss/access ratio: 0.131559
L3 accesses: 3174
L3 misses: 587
L3 miss/access ratio: 0.18494
Instructions: 1022776
Branches: 178113
Branch mispredictions: 6976
Branch miss/predict ratio: 0.0391661

This is the output that I get from the second function when the size of A is 111,111

L2 accesses: 7090
L2 miss/access ratio: 0.163752
L3 accesses: 1161
L3 misses: 507
L3 miss/access ratio: 0.436693
Instructions: 555860
Branches: 111189
Branch mispredictions: 25
Branch miss/predict ratio: 0.000224842

Why the difference in the results? The instructions reduce by half, the branch mispredictions are almost eliminated. What is happening here?

jsguy
  • 2,069
  • 1
  • 25
  • 36
  • Emm.. Because your first function is recursive, and second is not? – Galimov Albert Sep 19 '16 at 13:00
  • 11
    @PSIAlt, the second is also recursive, albeit eligible for tail-call optimization (I think). – Frédéric Hamidi Sep 19 '16 at 13:01
  • @FrédéricHamidi oh, didnt noticed that. But how it effects to reduce number of L2 accesses? – Galimov Albert Sep 19 '16 at 13:05
  • 1
    @PSIAlt, that's the question. If I had the answer, I would be writing it :) – Frédéric Hamidi Sep 19 '16 at 13:06
  • 5
    google "tail recursion". Because the recursive call is at the end, the compiler can skip the stack and not bother keeping track of each instance of calling the next step. (Recognizing that some code can be reduced to a special pattern is a lot harder than recognizing that it is already in the special pattern.) – Kenny Ostrom Sep 19 '16 at 13:06
  • 19
    _"Why isn't the G++ compiler smart enough to realize that these two functions are exactly the same?"_ Because they're not... – Lightness Races in Orbit Sep 19 '16 at 13:06
  • 8
    The functions don't perform the additions in the same order. Deducing that they still produce the same result is pretty difficult. ([Here](http://ideone.com/9VQnZH) is a string version that illustrates the difference.) – molbdnilo Sep 19 '16 at 13:40
  • a) Functions in a mathematical sens are the same iff they have the same domain and return the same output for all inputs of that domain. b) Every fold-right (recursive functions) can be transform into a fold-left (tail recursion) and sometimes the needed stack just disappears. Unfortunatly you have a procedure rather than a function (side effects). And do not expect miracles from your compiler! – knivil Sep 19 '16 at 17:59
  • 4
    @molbdnilo: for that matter, if you're not using `-fwrapv` than integer addition isn't necessarily associative. Adding together `INT_MAX`, `1` and `-1` in one order results in an overflow with undefined behaviour, in the other order has a defined result. So in some cases (maybe not this one) gcc needs to be cautious in order to avoid violating the assumptions made in its own optimisations elsewhere, turning good code into bad. – Steve Jessop Sep 19 '16 at 18:22
  • @SteveJessop: It's easy for UB-based optimizations can easily turn what would naturally be equivalent operations into non-equivalent operations, which can easily break other optimizations that would presume them equivalent. Sometimes gcc will decide it can omit a statement need not generate any code, but then fail to recognize that the existence of the statement would make otherwise-undefined behavior defined and thus invalidate another optimization. – supercat Sep 19 '16 at 18:45

1 Answers1

55

Your second function is tail-recursive. That means the compiler can optimize it to something like:

void test2(int curIndex){
  while(true)
  {
    if(curIndex == size) return;
    s+=A[curIndex];
    curIndex = curIndex + 1;
  }
}

That reduces the number of instructions significantly. It also reduces the number of stack frames needed to (at most) one. As a result, it uses a lot less memory, which results in the reduction of cache misses.

The compiler is not able to make this optimization for the first function.

UPDATE: Some people have asked why the compiler cannot make that optimization on the first function.

Let's start by observing the function is not tail-recursive. A function is tail recursive if the very last thing that happens is a recursive call to the same function, followed by returning the result of that recursive call (if any).

Clearly, that is not the case for the first function, s+=A[curIndex]; is executed after the recursive call.

So then people have asked why the compiler cannot turn the first function into the second one.

The question is "why does G++ not have this feature?" The answer to that question is always the same. Features are unimplemented by default; G++ does not have that feature because no one designed, implemented and shipped the feature to customers.

That should be the end of it, but of course people will want to know why nobody designed, implemented and tested this feature. Well, maybe nobody thought of doing it. But more importantly, the feature would be far from trivial.

First of all, the compiler would have to understand that

test1(curIndex+1);
s+=A[curIndex];

and

s+=A[curIndex];
test1(curIndex+1);

are equivalent. That is a non trivial observation, given that from a mechanical point of view they are not equivalent! Indeed, the first one effectively loops from the end of the array to the start, whereas the second one loops from the start to the end. Is that the same? It yields the same result when A is an int* (and s in an int), but it doesn't in other cases (e.g. when A is a double* and s is a double). Do we expect a compiler to be that smart?

So here we have a potential feature with a high cost to implement. But the cost may be worth it, if the benefit is high. Is the benefit high? I would guess that this happens very little in real code, i.e. developers are likely to write the second form anyway. So there you have it: an expensive feature with little benefit. IMHO, compiler developers are wise to spend their valuable time on more useful features.

Community
  • 1
  • 1
Kris Vandermotten
  • 10,111
  • 38
  • 49
  • 2
    But the big question is: Why is the compiler not able to optimize the first function? Or: Is there a compiler that do this optimization? Is the optimisation consistent with the C++ standard? – knivil Sep 19 '16 at 18:03
  • 2
    @knivil - the question for optimization is always "can a conforming program tell the difference?" Cache hits or misses are not detectable in a conforming program. – Pete Becker Sep 19 '16 at 18:21
  • 1
    So this is a partial answer. It provides a feature one function has that the other does not, and optimizing tail-call is a well known optimization. But so is turning non-tail-call recursive functions into tail-call recursive functions, before doing a tail-call optimization. What makes it unable to do *that* two-step optimization? – Yakk - Adam Nevraumont Sep 19 '16 at 18:37
  • 12
    @Yakk In languages like C++, proving that you can interchange _any_ operation with a non-inlinable function call is very difficult, to the point where many compilers don't even bother. I know GCC _can_ do this in at least some cases for functions that it has been explicitly informed (with `__attribute__`) have no side effects, but this is not such a function. – zwol Sep 19 '16 at 19:03
  • 16
    @zwol Exactly! This question is less an example about how bad gcc is at reasoning on code, and more an example of how simple a problem can be and still look very much like the halting problem to a compiler – Steve Cox Sep 19 '16 at 19:55
  • 1
    @knivil TCO is a pretty simple optimization.  The compiler has to check for little more than _“When this function call happens, do I no longer need any of the enclosing function's stack state (nothing else uses any vars that I've been keeping on the stack)?  And if there's a return value, is it just the result as that function call's return?”_ – Slipp D. Thompson Sep 20 '16 at 01:43
  • TCO is a pretty simple optimisation if only ONE function is involved. If two function call each other recusivly then it gets more tricky. Please add third function and your general statement that it is very simple becomes more questionable. – knivil Sep 20 '16 at 20:26
  • 2
    @knivil please note that I am talking only about the optimisation of a tail *recursive* function, which is a special case of tail call optimisation. In the recursive case, only one function is involved, and it is easy to demonstrate the optimization in source. General TCO replaces a call with a jump, possibly into the body of a different function, and cannot always be demonstrated in source. Depending on the calling convention, it is still relatively easy at the assembly level though. – Kris Vandermotten Sep 22 '16 at 12:00
  • Yes this allows the compiler to bake in full optimization – Chuck Norrris Oct 03 '16 at 22:38