7

See BlendingTable::create and BlendingTable::print. Both have the same form of tail recursion, but while create will be optimized as a loop, print will not and cause a stack overflow.

Go down to see a fix, which I got from a hint from one of the gcc devs on my bug report of this problem.

#include <cstdlib>
#include <iostream>
#include <memory>
#include <array>
#include <limits>

class System {
public:
    template<typename T, typename... Ts>
    static void print(const T& t, const Ts&... ts) {
        std::cout << t << std::flush;
        print(ts...);
    }

    static void print() {}

    template<typename... Ts>
    static void printLine(const Ts&... ts) {
        print(ts..., '\n');
    }
};

template<typename T, int dimension = 1>
class Array {
private:
    std::unique_ptr<T[]> pointer;
    std::array<int, dimension> sizes;
    int realSize;

public:
    Array() {}

    template<typename... Ns>
    Array(Ns... ns):
    realSize(1) {
        checkArguments(ns...);
        create(1, ns...);
    }

private:
    template<typename... Ns>
    static void checkArguments(Ns...) {
        static_assert(sizeof...(Ns) == dimension, "dimension mismatch");
    }

    template<typename... Ns>
    void create(int d, int n, Ns... ns) {
        realSize *= n;
        sizes[d - 1] = n;
        create(d + 1, ns...);
    }

    void create(int) {
        pointer = std::unique_ptr<T[]>(new T[realSize]);
    }

    int computeSubSize(int d) const {
        if (d == dimension) {
            return 1;
        }
        return sizes[d] * computeSubSize(d + 1);
    }

    template<typename... Ns>
    int getIndex(int d, int n, Ns... ns) const {
        return n * computeSubSize(d) + getIndex(d + 1, ns...);
    }

    int getIndex(int) const {
        return 0;
    }

public:
    template<typename... Ns>
    T& operator()(Ns... ns) const {
        checkArguments(ns...);
        return pointer[getIndex(1, ns...)];
    }

    int getSize(int d = 1) const {
        return sizes[d - 1];
    }
};

class BlendingTable : public Array<unsigned char, 3> {
private:
    enum {
        SIZE = 0x100,
        FF = SIZE - 1,
    };

public:
    BlendingTable():
    Array<unsigned char, 3>(SIZE, SIZE, SIZE) {
        static_assert(std::numeric_limits<unsigned char>::max() == FF, "unsupported byte format");
        create(FF, FF, FF);
    }

private:
    void create(int dst, int src, int a) {
        (*this)(dst, src, a) = (src * a + dst * (FF - a)) / FF;
        if (a > 0) {
            create(dst, src, a - 1);
        } else if (src > 0) {
            create(dst, src - 1, FF);
        } else if (dst > 0) {
            create(dst - 1, FF, FF);
        } else {
            return;
        }
    }

    void print(int dst, int src, int a) const {
        System::print(static_cast<int>((*this)(FF - dst, FF - src, FF - a)), ' ');
        if (a > 0) {
            print(dst, src, a - 1);
        } else if (src > 0) {
            print(dst, src - 1, FF);
        } else if (dst > 0) {
            print(dst - 1, FF, FF);
        } else {
            System::printLine();
            return;
        }
    }

public:
    void print() const {
        print(FF, FF, FF);
    }
};

int main() {
    BlendingTable().print();
    return EXIT_SUCCESS;
}

Changing the class definition of System from

class System {
public:
    template<typename T, typename... Ts>
    static void print(const T& t, const Ts&... ts) {
        std::cout << t << std::flush;
        print(ts...);
    }

    static void print() {}

    template<typename... Ts>
    static void printLine(const Ts&... ts) {
        print(ts..., '\n');
    }
};

to

class System {
public:
    template<typename T, typename... Ts>
    static void print(T t, Ts... ts) {
        std::cout << t << std::flush;
        print(ts...);
    }

    static void print() {}

    template<typename... Ts>
    static void printLine(Ts... ts) {
        print(ts..., '\n');
    }
};

magically allows gcc to eliminate the tail calls.

Why does 'whether or not passing function arguments by reference' make such a big difference in gcc's behaviour? Semantically they both look the same to me in this case.

  • 1
    I'm genuinely curious if the same problem exhibits with [perfect forwarding](http://pastebin.com/u2xK3pmL). – WhozCraig Jul 07 '15 at 21:44
  • In general, pass by reference makes Program analysis more difficult for the compiler and consequently it is more difficult to determine what transformations are legal. Whether that is the actual problem here or if there is another reason (like a quirk in the standard, that makes this optimization illegal) I can't say – MikeMB Jul 07 '15 at 22:20
  • 2
    One hypothesis is that the cast creates a temporary whose reference is passed to the `print` function, and now the compiler may feel it needs to hang on to the temporary until the `print` function returns. – jxh Jul 08 '15 at 18:10

1 Answers1

0

As it is noted by @jxh the cast static_cast<int>() creates a temporary whose reference is passed to the print function. Without such cast the tail recursion is optimized correctly.

The issue is very similar to the old case Why isn't g++ tail call optimizing while gcc is? and the workaround may be similar to https://stackoverflow.com/a/31793391/4023446.

It is still possible to use System with the arguments passed by reference if call to System::print will be moved to a separate private helper function SystemPrint:

class BlendingTable : public Array<unsigned char, 3> {

//...

private:
    void SystemPrint(int dst, int src, int a) const
    {
        System::print(static_cast<int>((*this)(FF - dst, FF - src, FF - a)), ' ');
    }

    void print(int dst, int src, int a) const {
        SystemPrint(dst, src, a);
        if (a > 0) {
            print(dst, src, a - 1);
        } else if (src > 0) {
            print(dst, src - 1, FF);
        } else if (dst > 0) {
            print(dst - 1, FF, FF);
        } else {
            System::printLine();
            return;
        }
    }

// ...

}

Now the tail call optimization works (g++ (Ubuntu/Linaro 4.7.2-2ubuntu1) 4.7.2 with the optimization option -O2) and the print does not cause a stack overflow.

Update

I verified it with other compilers:

  • the original code without any change is perfectly optimized by clang++ Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn) with -O1 optimization
  • g++ (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4 fails to perform TCO even without the cast or with the wrapping function SystemPrint workaround; here only the workaround with System::print arguments by values works.

So, the issue is very specific to compiler versions.

Community
  • 1
  • 1
Orest Hera
  • 6,706
  • 2
  • 21
  • 35
  • I still find this a little strange. `print` calls `print` recursively twice. First, using a set of parameters which includes a `static_cast` cast, and the second call from among the three candidates (selected via `if...else if...`). It's only the second one that is relevant to tall call optimization. So I don't see how the first one can affect this at all, even if it does have a temporary? – Aaron McDaid Aug 03 '15 at 18:12
  • @Aaron McDaid The recursion is related to `BlendingTable::print`, however before the recursive exit there is a call to `System::print`. During that call a temporary is created on stack of `BlendingTable::print`. The address of that temporary on stack is passed as argument to `System::print`. So, it is not obvious for current TCO implementation that the stack frame may be reused for tail recursion optimization. – Orest Hera Aug 03 '15 at 18:24