20

I noticed that sometimes even if I don't use iostream and related I/O libraries, my binaries produced by Mingw were still unreasonably large.

For example, I wrote a program that uses only vector and cstdio and compiled it with -O2 -flto, yet the resulting binary can be as large as 2MB! I ran nm main.exe > e.txt and was shocked to see all the iostream-related functions in it.

After some googling, I learnt to use -ffunction-sections -Wl,-gc-sections, which reduces the program size from 2MB to ~300KB (or 100+KB with -s). Excellent!

To further test the effect of -ffunction-sections -Wl,-gc-sections, here is another program:

#include <cstdio>
#include <vector>
#include <tuple>
#include <algorithm>
#include <chrono>
#include <windows.h>

#undef min

// Simple aggregate with two int members; it can be brace-initialized
// in member order, e.g. `Point p {1, 2};`.
struct Point {
    int x;
    int y;
};

// Compile-time constant read by half_length().
constexpr int length = 5;

// Returns `length` itself when it is odd and `length - 1` when it is even,
// i.e. the largest odd value that does not exceed `length`.
// NOTE(review): despite the name, nothing is halved here.
// Written as a single return statement so it remains a valid C++11
// constexpr function.
constexpr int half_length() {
    return (length % 2 != 0) ? length : length - 1;
}

// Prints the compiler-provided signature of this particular instantiation
// (followed by an empty line), then calls `f` with no arguments and returns
// whatever it yields, converted to int.
template<class F>
int func_template(F&& f) {
    // Pick the compiler-specific spelling of "current function signature".
#ifdef _MSC_VER
    const char* const signature = __FUNCSIG__;
#else
    const char* const signature = __PRETTY_FUNCTION__;
#endif
    puts(signature);
    printf("\n");
    const int result = f();
    return result;
}

// Stateless function object: calling it always yields 59.
struct fake_func {
    int operator()() const {
        return 59;
    }
};

// Prints the compiler-provided signature of this particular instantiation
// (followed by an empty line), then calls `f` with `args...` perfectly
// forwarded and returns the call's result, converted to int.
template<class F, class... Args>
int pass_args(F&& f, Args&&... args) {
    // Pick the compiler-specific spelling of "current function signature".
#ifdef _MSC_VER
    const char* const signature = __FUNCSIG__;
#else
    const char* const signature = __PRETTY_FUNCTION__;
#endif
    puts(signature);
    printf("\n");
    const int result = f(std::forward<Args>(args)...);
    return result;
}

// Recursion anchor: the minimum of a single value is that value.
template<class T>
T min(T x) {
    return x;
}

// Returns the smallest of `x` and all of `args...`, compared with operator<.
// The minimum of the trailing arguments is converted to T (the type of the
// first argument) before the comparison. On ties the later value is returned.
template<class T, class... Args>
T min(T x, Args... args) {
    T smallest_of_rest = min(args...);
    if (x < smallest_of_rest) {
        return x;
    }
    return smallest_of_rest;
}

// type_verifier overload set: prints each argument to stdout, tagged by which
// overload was selected for it.

// int: the value followed by a literal 'd' and a space.
void type_verifier(int x) {
    printf("%dd ", x);
}

// char: the character wrapped in single quotes, then a space.
void type_verifier(char x) {
    printf("'%c' ", x);
}

// double: the value followed by a literal 'f' and a space.
void type_verifier(double x) {
    printf("%lff ", x);
}

// Catch-all for every other type; the value itself is not inspected, so the
// parameter is left unnamed.
template<class T>
void type_verifier(T) {
    printf("unknown ");
}

// Two or more arguments: report the first one, then recurse on the remainder.
template<class Head, class... Tail>
void type_verifier(Head head, Tail... tail) {
    type_verifier(head);
    type_verifier(tail...);
}

// Number of characters stored in `buf` by the most recent send() call
// (negative if the formatting call reported an error).
int bufLen;
// Shared output buffer filled by send().
char buf[100];

// Formats its arguments printf-style into the global `buf` and records the
// stored length in `bufLen`.
//
// The first argument is used as the format string; the remaining arguments
// are its values. Output that does not fit is truncated to sizeof(buf) - 1
// characters plus the terminating NUL instead of writing past the end of
// `buf`, and `bufLen` then holds the truncated length.
//
// Returns the new value of `bufLen`.
template<class... Args>
inline int send(Args... args) {
    const int capacity = static_cast<int>(sizeof(buf));
    // The parameters are taken by value, so they are passed on as-is;
    // std::forward is only meaningful for forwarding references.
    // snprintf returns the length the complete output would have had, which
    // can exceed what was actually stored, hence the clamp below.
    const int wanted = snprintf(buf, sizeof(buf), args...);
    bufLen = (wanted >= capacity) ? capacity - 1 : wanted;
    return bufLen;
}

// Demonstrates inline vs. regular nested namespaces: `v1` is inline, so its
// func() is reachable both as std::v1::func() and as plain std::func(), while
// `v2`'s func() must always be qualified as std::v2::func().
//
// NOTE(review): adding declarations to namespace std is undefined behaviour
// per the C++ standard. Moving this into a project-owned namespace would
// require updating the std::func() / std::v1::func() / std::v2::func() callers.
namespace std {

inline namespace v1 {

void func() {
    printf("I am v1\n");
}

}  // inline namespace v1

namespace v2 {

void func() {
    printf("I am v2\n");
}

}  // namespace v2

}  // namespace std

// Exercises a grab-bag of C++11 language and library features (vector, tuple,
// lambdas, variadic templates, inline namespaces, <algorithm>, <chrono>) while
// producing all output through C stdio only. Always returns 0.
int main() {
    // Brace-initialized vector and aggregate.
    std::vector<int> v {1, 2, 3, 4, 5};
    for (int i : v) printf("%d ", i);
    printf("\n");

    Point p {1, 2};
    printf("%d %d\n", p.x, p.y);

    // Tuples: element access with std::get and unpacking with std::tie.
    auto t = std::make_tuple("Hello World", 12);
    printf("%s %d\n", std::get<0>(t), std::get<1>(t));
    int a, b;
    auto f = []() { return std::make_tuple(1, 2); };
    std::tie(a, b) = f();
    printf("%d %d\n", a, b);

    //int test_constexpr[half_length() + 4];

    // Callables passed to function templates. The lambdas stay in main() on
    // purpose: the signature text printed by the templates names the
    // enclosing function of each lambda.
    int ft = func_template([]{ return 42; });
    printf("func_template: %d\n", ft);
    ft = func_template(fake_func {});
    printf("func_template: %d\n", ft);
    ft = pass_args([](int x, int y) { return x + y; }, 152, 58);
    printf("pass_args: %d\n", ft);
    ft = pass_args([](int n, const char *m) {
        for (int i = 0; i < n; i++) printf("%c ", m[i]);
        printf("\n");
        return 0;
    }, 5, "Hello");

    // Variadic templates.
    printf("min: %d\n", min(3, 4, 2, 1, 5));
    type_verifier(12, 'A', 0.5, "Hello");
    printf("\n");

/*  send("Hello World");
    send("%d", 1);
    send("%d", "1234");
    sprintf(buf, "%d", "123");*/

    // Inline namespace lookup: the first two calls reach the same function.
    std::func();
    std::v1::func();
    std::v2::func();

    // Rotate left by two positions.
    std::rotate(v.begin(), v.begin() + 2, v.end());
    for (int i : v) printf("%d ", i);
    printf("\n");

    auto start = std::chrono::steady_clock::now();

    std::vector<int> x {2, 4, 2, 0, 5, 10, 7, 3, 7, 1};
    printf("insertion sort: ");
    for (int i : x) printf("%d ", i);
    printf("\n");
    // insertion sort: rotate each element into its upper_bound position
    // within the already-sorted prefix, printing the vector after every step.
    for (auto i = x.begin(); i != x.end(); ++i) {
        std::rotate(std::upper_bound(x.begin(), i, *i), i, i+1);
        for (int j : x) printf("%d ", j);
        printf("\n");
    }

    // Heap algorithms: pop the largest element, push a new one, then sort.
    std::vector<int> heap {7, 5, 3, 4, 2};
    std::make_heap(heap.begin(), heap.end());
    std::pop_heap(heap.begin(), heap.end());
    printf("Pop heap (%d)\n", heap.back());
    heap.pop_back();
    heap.push_back(1);
    std::push_heap(heap.begin(), heap.end());
    std::sort_heap(heap.begin(), heap.end());
    for (int i : heap) printf("%d ", i);
    printf("\n");

    auto end = std::chrono::steady_clock::now();
    auto diff = end - start;
    // milliseconds::rep is an implementation-chosen signed integer type, so
    // convert explicitly to long long and print with the standard %lld
    // instead of the MSVC-only %I64d.
    printf("time: %lld ms\n", static_cast<long long>(
        std::chrono::duration_cast<std::chrono::milliseconds>(diff).count()));

    {
        // Shift every element one slot to the right; the first slot keeps
        // its previous value.
        auto u = v;
        std::move_backward(u.begin(), u.begin() + u.size() - 1, u.begin() + u.size());
        for (int i : u) printf("%d ", i);
        printf("\n");
    }

    {
        // Shift every element one slot to the left; the last slot keeps its
        // previous value.
        auto u = v;
        std::move(u.begin() + 1, u.begin() + u.size(), u.begin());
        for (int i : u) printf("%d ", i);
        printf("\n");
    }

    // Time a 2000 ms Sleep, reported in whole milliseconds.
    start = std::chrono::steady_clock::now();
    Sleep(2000);
    end = std::chrono::steady_clock::now();
    diff = end - start;
    printf("time: %lld ms\n", static_cast<long long>(
        std::chrono::duration_cast<std::chrono::milliseconds>(diff).count()));

    // Same measurement, reported as fractional seconds.
    std::chrono::steady_clock::time_point before;
    before = std::chrono::steady_clock::now();
    Sleep(2000);
    auto after = std::chrono::steady_clock::now();
    printf("%f seconds\n", std::chrono::duration<double>(after - before).count());

    return 0;
}

To my disappointment, the final program is once again > 2MB.

Interestingly, cl.exe thoughtfully removes all iostream-related functions consistently, even when I don't use /O2 or any other flags, just cl.exe main.cpp. (For the code above, cl.exe produces a 100+KB binary.)

Did I miss any other useful gcc flags for this?

Specification:

  • Mingw-w64 gcc 6.1.0
  • Mingw-w64 gcc 6.2.0
  • Visual Studio 2017 RC
  • All binaries are linked statically

Compare with Linux

I compared the binaries produced by gcc 4.9.2 (Linux) and gcc 4.9.3 (mingw-w64) for the above code (except windows.h and Sleep were removed).

Compile flag

g++ -o c++11 c++11.cpp -std=c++11 -static-libgcc -static-libstdc++ -ffunction-sections -Wl,-gc-sections -O2

Linux gcc successfully stripped away the iostream-related functions without needing -flto, while Mingw-w64 gcc just can't do it properly.

Windows only supports the PE format, while Linux supports the ELF format, which allows Linux to use the Gold linker. Maybe this is the explanation?

Update

I eventually filed a bug at https://sourceforge.net/p/mingw-w64/bugs/578/ . Let's hope it gets some attention!

John London
  • 1,250
  • 2
  • 14
  • 32
  • 1
    This might help: [How to remove unused C/C++ symbols with GCC and ld?](https://stackoverflow.com/questions/6687630/how-to-remove-unused-c-c-symbols-with-gcc-and-ld) – benbuck Dec 27 '16 at 11:25
  • Tried everything: -Os (reduce 2KB), -fwhole-program (no change), -fomit-frame-pointer (no change). -why_live is not available. – John London Dec 27 '16 at 14:22
  • `length & 1 ? length : length - 1` can be changed to `length + (length & 1) - 1` – phuclv Dec 30 '16 at 05:16
  • For me `-fwhole-program` did it, with gcc 8.2.0 from MSYS2. – Michel Rouzic Aug 06 '18 at 16:01
  • See the bug report https://sourceware.org/bugzilla/show_bug.cgi?id=11539 with a work-in-progress patch (that has not been updated for a couple of years now...) – benjarobin Oct 18 '19 at 10:29
  • [GCC's warning on `-ffunction-sections`](https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html#index-ffunction-sections): _Only use these options when there are significant benefits from doing so. When you specify these options, the assembler and linker create larger object and executable files and are also slower. These options affect code generation. They prevent optimizations by the compiler and assembler using relative locations inside a translation unit since the locations are unknown until link time._ – legends2k Dec 07 '21 at 10:12

1 Answer

4

Try stripping debug and symbol info from static libstdc++ via -Wl,--strip-all. This reduced my executable from 9M to 670K on Cygwin (13x) and from 6M to 80K on Ubuntu (80x).

yugr
  • 19,769
  • 3
  • 51
  • 96
  • `-Wl,--strip-all` is the same as `-s` I mentioned above. Stripping debug info and symbol table does not strip the unused `iostream` related functions, which is something MSVC does without telling (no special flags required)! – John London Nov 30 '16 at 00:28
  • Ok, I see. FYI a 4.8.2 x86_64-w64-mingw32-g++ (Ubuntu 14.04) with `-std=c++11 -O2 -ffunction-sections -Wl,-gc-sections -static-libstdc++ -s` generates a 100K executable so the size issue may be specific to a particular toolchain version (and thus harder to investigate remotely). Have you tried analyzing linker map (`-Wl,--print-map`)? – yugr Nov 30 '16 at 06:05
  • Since I use the latest MSVC, I changed to gcc 6.2.0 to be fair. `-Wl,-print-gc-sections` shows that gcc does strip some `iostream` and `locale` functions, but `-Wl,--print-map` shows that there are still leftovers. MSVC: 150KB, gcc: 738KB. See http://pastebin.com/raw/4uGCm7Yy – John London Nov 30 '16 at 07:08
  • I think you attached gc.txt rather than map.txt (the dump contains output from -print-gc-sections). "gcc: 738KB" - I'm confused, originally you said it's over 2M... – yugr Nov 30 '16 at 17:49
  • FWIW experimental support for `--gc-sections` on PE/COFF targets (i.e. MinGW/Cygwin) has only [landed in Binutils 2.25](https://sourceware.org/bugzilla/show_bug.cgi?id=11539) in mid-2015 so it's likely to be sub-optimal. – yugr Dec 01 '16 at 05:21
  • Sorry, 2MB was without `-s`. I am using binutils 2.27. – John London Dec 01 '16 at 09:11
  • You can also use -fdata-sections in addition to what @yugr suggested, but I don't know about MinGW; it's general in GCC. What I'm wondering is why nobody has already implemented something like -frtti-without-strings, because this is what costs a lot in binaries most of the time. – daminetreg Dec 12 '16 at 06:54
  • @daminetreg I've tried `-fdata-sections` for John's code but it didn't change anything so I didn't bother to mention it. I've also tried linking with `-fno-exceptions` and `-fno-rtti` but it didn't help either. – yugr Dec 12 '16 at 07:19
  • @daminetreg I added my finding on Linux's gcc in my question, please take a look. – John London Dec 19 '16 at 13:16