9

I just wrote a test program to find the fastest way to allocate & free many objects which managed by shared_ptr.

I tried shared_ptr with new, shared_ptr with pool, make_shared, allocate_shared. What make me surprised is allocate_shared is slower than shared_ptr with pool.

I test the code in vs2017+win10 with release build. The release build setting is default(/O2). I also test it in gcc4.8.5+centos6.2 with g++ -std=c++11 -O3.

The code is:

#include <memory>
#include <iostream>
#include <vector>
#include <assert.h>
#include <chrono>
#include <mutex>
using namespace std;

struct noncopyable {
protected:
    noncopyable() = default;
    ~noncopyable() = default;
private:
    noncopyable(const noncopyable&) = delete;
    noncopyable& operator=(const noncopyable&) = delete;
    noncopyable(noncopyable&&) = delete;
    noncopyable& operator=(noncopyable&&) = delete;
};

class BlockPool : noncopyable {
public:
    BlockPool(size_t block_size) :block_size_(block_size) {}
    ~BlockPool() {
        assert(total_count_ == datas_.size());
        for (size_t i = 0; i < datas_.size(); ++i) {
            free(datas_[i]);
        }
    }
    size_t size() const { return block_size_; }
    void* pop() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (datas_.empty()) {
            const size_t kNextSize = 1024;
            for (size_t i = 0; i < kNextSize; ++i) {
                void* p = malloc(block_size_);
                datas_.push_back(p);
            }
            total_count_ += kNextSize;
        }
        void* p = datas_.back();
        datas_.pop_back();
        return p;
    }
    void push(void* data) {
        std::lock_guard<std::mutex> lock(mutex_);
        datas_.push_back(data);
    }
    void reserve(size_t count) {
        std::lock_guard<std::mutex> lock(mutex_);
        if (count <= datas_.size()) return;
        datas_.reserve(count);
        count -= datas_.size();
        for (size_t i = 0; i < count; ++i) {
            void* p = malloc(block_size_);
            datas_.push_back(p);
        }
        total_count_ += count;
    }
private:
    size_t const block_size_;
    size_t total_count_{ 0 };
    std::vector<void*> datas_;
    std::mutex mutex_;
};

struct Packet : noncopyable {
    Packet() = default;
    ~Packet() = default;
    char data_[1000];
};

const uint32_t kLoopCount = 1000 * 1000;

BlockPool pool(sizeof(Packet) + 64);

std::vector<shared_ptr<Packet>> packets;

void test_make_shared() {
    auto begin = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < kLoopCount; ++i) {
        auto packet = make_shared<Packet>();
        packets.emplace_back(std::move(packet));
    }
    packets.clear();
    auto end = std::chrono::steady_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
    std::cout << "make_shared: " << ms << " ms\n";
}

void test_shared_ptr_with_pool() {
    auto begin = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < kLoopCount; ++i) {
        Packet* p = (Packet*)pool.pop();
        new(p)Packet();
        shared_ptr<Packet> packet(p, [](Packet* packet) {
            packet->~Packet();
            pool.push(packet);
        });
        packets.emplace_back(std::move(packet));
    }
    packets.clear();
    auto end = std::chrono::steady_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
    std::cout << "shared_ptr with pool: " << ms << " ms\n";
}

void test_shared_ptr_with_new() {
    auto begin = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < kLoopCount; ++i) {
        shared_ptr<Packet> packet(new Packet);
        packets.emplace_back(std::move(packet));
    }
    packets.clear();
    auto end = std::chrono::steady_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
    std::cout << "shared_ptr with new: " << ms << " ms\n";
}

template <class T>
struct Mallocator {
    typedef T value_type;
    Mallocator(BlockPool* pool) : pool_(pool) { }

    template <class U> Mallocator(const Mallocator<U>& u) {
        pool_ = u.pool_;
    }
    inline T* allocate(std::size_t n) {
#ifdef _DEBUG
        assert(n == 1);
        auto len = n * sizeof(T);
        assert(len <= pool_->size());
#endif
        return static_cast<T*>(pool_->pop());
    }
    inline void deallocate(T* p, std::size_t n) {
#ifdef _DEBUG
        assert(n == 1);
        auto len = n * sizeof(T);
        assert(len <= pool_->size());
#endif
        pool_->push(p);
    }
    BlockPool* pool_;
};

template <class T, class U>
bool operator==(const Mallocator<T>&, const Mallocator<U>&) { return true; }
template <class T, class U>
bool operator!=(const Mallocator<T>&, const Mallocator<U>&) { return false; }

void test_allocate_shared() {    
    Mallocator<Packet> alloc(&pool);
    auto begin = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < kLoopCount; ++i) {
        shared_ptr<Packet> packet = allocate_shared<Packet, Mallocator<Packet>>(alloc);
        packets.emplace_back(std::move(packet));
    }
    packets.clear();
    auto end = std::chrono::steady_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
    std::cout << "allocate_shared: " << ms << " ms\n";    
}

void test_new_delete() {
    std::vector<Packet*> raw_packets;
    raw_packets.reserve(kLoopCount);
    auto begin = std::chrono::steady_clock::now();
    for (uint32_t i = 0; i < kLoopCount; ++i) {
        raw_packets.push_back(new Packet);
    }
    for (uint32_t i = 0; i < kLoopCount; ++i) {
        delete raw_packets[i];
    }
    raw_packets.clear();
    auto end = std::chrono::steady_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
    std::cout << "new_delete: " << ms << " ms\n";
}

int main() {
    std::cout << "loop for " << kLoopCount << " times to ceate and free shared_ptr\n\n";

    packets.reserve(kLoopCount);
    for (int i = 0; i < 3; ++i) {
        test_make_shared();
    }
    std::cout << "======\n";

    pool.reserve(kLoopCount);    

    for (int i = 0; i < 3; ++i) {
        test_shared_ptr_with_new();
    }
    std::cout << "======\n";

    for (int i = 0; i < 3; ++i) {
        test_shared_ptr_with_pool();
    }
    std::cout << "======\n";

    for (int i = 0; i < 3; ++i) {
        test_allocate_shared();
    }
    std::cout << "======\n";

    for (int i = 0; i < 3; ++i) {
        test_new_delete();
    }

    return 0;
}

In my computer(vs2017, windows 10), the result is:

loop for 1000000 times to ceate and free shared_ptr

make_shared: 616 ms
make_shared: 586 ms
make_shared: 581 ms
======
shared_ptr with new: 532 ms
shared_ptr with new: 541 ms
shared_ptr with new: 525 ms
======
shared_ptr with pool: 292 ms
shared_ptr with pool: 293 ms
shared_ptr with pool: 290 ms
======
allocate_shared: 346 ms
allocate_shared: 340 ms
allocate_shared: 345 ms
======
new_delete: 424 ms
new_delete: 408 ms
new_delete: 403 ms

I also tested it in gcc 4.8, centos6.2, the result is same, that is for speed, shared_ptr_with_pool > allocate_shared > shared_ptr_with_new > make_shared.

As I know, the shared_ptr::shared_ptr(T* p) need to allocate a small memory to hold the refcount and the deleter, so need to allocate twice, and the make_shared just need to allocate one times, and the allocate_shared does not need to allocate even once.

So as my understand, the speed relation should be allocate_shared > shared_ptr_with_pool > make_shared > shared_ptr_with_new, but not shared_ptr_with_pool > allocate_shared > shared_ptr_with_new > make_shared.

Could someone tell me the reason, very thanks!

Update:

After some dig by vs2017+windows10, I found std::allocate_shared or boost::allocate_shared call memset(p, 0, sizeof(Packet)) which slow down the while operation.

It's because some codes looks like this in vs2017 library header:

class Pair {
public:
    template<class ... T>
    Pair(T&...t) : v_(std::forward<T>(t)...){
    }
    std::_Align_type<char, 1500> v_;
};

void test_align() {
    Pair p;
}

The Pair constructor call memset(addr, 0, sizeof(Pair)).

I do not know why the Pair constructor call memset, and I wrote some testing code:

struct A {
    char data_[1500];
};

class B {
public:
    template<class ... T> B(T&...t) 
        : a_(std::forward<T>(t)...) {
    }
    A a_;
};

int main() {
    B b;
    return 0;
}

I compiled the code with vs2017 and I found the memset(addr, 0, 1500) is called. The asm code (Debug build, the Release build is same) is:

class B {
public:
    template<class ... T> B(T&...t) 
        : a_(std::forward<T>(t)...) {
00C516A0  push        ebp  
00C516A1  mov         ebp,esp  
00C516A3  sub         esp,0CCh  
00C516A9  push        ebx  
00C516AA  push        esi  
00C516AB  push        edi  
00C516AC  push        ecx  
00C516AD  lea         edi,[ebp-0CCh]  
00C516B3  mov         ecx,33h  
00C516B8  mov         eax,0CCCCCCCCh  
00C516BD  rep stos    dword ptr es:[edi]  
00C516BF  pop         ecx  
00C516C0  mov         dword ptr [this],ecx  
00C516C3  push        5DCh  
00C516C8  push        0  
00C516CA  mov         eax,dword ptr [this]  
00C516CD  push        eax  
00C516CE  call        _memset (0C510BEh)  
00C516D3  add         esp,0Ch  
    }
00C516D6  mov         eax,dword ptr [this]  
00C516D9  pop         edi  
00C516DA  pop         esi  
00C516DB  pop         ebx  
00C516DC  add         esp,0CCh  
00C516E2  cmp         ebp,esp  
00C516E4  call        __RTC_CheckEsp (0C51118h)  
00C516E9  mov         esp,ebp  
00C516EB  pop         ebp  
00C516EC  ret  

If I add a empty constructor looks like:

struct A {
    A() {}
    char data_[1500];
};

class B {
public:
    template<class ... T> B(T&...t) 
        : a_(std::forward<T>(t)...) {
    }
    A a_;
};

int main() {
    B b;
    return 0;
}

The asm code(Debug build, the Release build is same) is:

class B {
public:
    template<class ... T> B(T&...t) 
        : a_(std::forward<T>(t)...) {
010A1D40  push        ebp  
010A1D41  mov         ebp,esp  
010A1D43  sub         esp,0CCh  
010A1D49  push        ebx  
010A1D4A  push        esi  
010A1D4B  push        edi  
010A1D4C  push        ecx  
010A1D4D  lea         edi,[ebp-0CCh]  
010A1D53  mov         ecx,33h  
010A1D58  mov         eax,0CCCCCCCCh  
010A1D5D  rep stos    dword ptr es:[edi]  
010A1D5F  pop         ecx  
010A1D60  mov         dword ptr [this],ecx  
010A1D63  mov         ecx,dword ptr [this]  
010A1D66  call        A::A (010A1456h)  
    }
010A1D6B  mov         eax,dword ptr [this]  
010A1D6E  pop         edi  
010A1D6F  pop         esi  
010A1D70  pop         ebx  
010A1D71  add         esp,0CCh  
010A1D77  cmp         ebp,esp  
010A1D79  call        __RTC_CheckEsp (010A126Ch)  
010A1D7E  mov         esp,ebp  
010A1D80  pop         ebp  
010A1D81  ret  

The call _memset (0C510BEh) changed to call A::A (010A1456h) .

So it looks like if the type A has constructor, the a_(std::forward<T>(t)...) call the constructor, if type A does not have constructor, the a_(std::forward<T>(t)...) call memset(addr,0,sizeof(A)). (Why?)

The reason of the std::allocate_shared call memset is because the following code (vs2017, xutility, in my computer, at C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.10.25017\include):

template<class _Ty1,
    class _Ty2>
    class _Compressed_pair<_Ty1, _Ty2, false> final

    {   // store a pair of values, not deriving from first
private:
    _Ty1 _Myval1;
    _Ty2 _Myval2;

public:
    template<class... _Other2>
        constexpr explicit _Compressed_pair(_Zero_then_variadic_args_t,
            _Other2&&... _Val2)
        : _Myval1(), _Myval2(_STD forward<_Other2>(_Val2)...)
        {   // construct from forwarded values
        }

    template<class _Other1,
        class... _Other2>
        _Compressed_pair(_One_then_variadic_args_t,
            _Other1&& _Val1, _Other2&&... _Val2)
        : _Myval1(_STD forward<_Other1>(_Val1)),
            _Myval2(_STD forward<_Other2>(_Val2)...)
        {   // construct from forwarded values
        }

the type of _Myval2 is std::_Align_type, which define is

template<class _Ty,
    size_t _Len>
    union _Align_type
    {   // union with size _Len bytes and alignment of _Ty
    _Ty _Val;
    char _Pad[_Len];
    };

The _Align_type does not have constructor, so the _Myval2(_STD forward<_Other2>(_Val2)...) call memset(addr,0, sizeof(T)).

So I changed the _Align_type define (add a dummy constructor) and test again, I found the std::allocate_shared does not call memset, and much faster than before.

template<class _Ty,
    size_t _Len>
    union _Align_type
    {   // union with size _Len bytes and alignment of _Ty
    _Ty _Val;
    char _Pad[_Len];
    _Align_type() { }
    };

After I change the define of the _Align_type, now the speed of the test_allocate_shared is equal or slightly faster than test_shared_ptr_with_pool.

Till now, I know why std::allocate_shared is that slow, but I still not know why the code call memset when type T does not have constructor but does not call memset when T has constructor.

template<class ... T> B(T&...t) 
        : a_(std::forward<T>(t)...) {}

Is it a c++ standard?

And, since the allocate_shared should not call memset(sizeof(T)), is it a bug of the compiler?

Update:

struct A {
    //A() {}
    char data_[1500];
    void dummy() {
        for (int i = 0; i < sizeof(data_); ++i) {
            data_[i] = rand();
        }
    }
    int dummy2() { // avoid optimize erase by compiler
        int ret = 0;
        for (int i = 0; i < sizeof(data_); ++i) {
            ret += data_[i];
        }
        return ret;
    }
};

class B {
public:
    template<class ... T> B(T&...t) 
        : a_(std::forward<T>(t)...) {
    }
    A a_;
};

class C {
public:
    C() : a_() {
    }
    A a_;
};

int main() {
    //B b;
    C c;
    c.a_.dummy();
    return c.a_.dummy2();
}

I compile above code by vs2017, x86 release build, and the asm code is:

int main() {
009E1000  push        ebp  
009E1001  mov         ebp,esp  
009E1003  sub         esp,5E0h  
009E1009  mov         eax,dword ptr [__security_cookie (09E3004h)]  
009E100E  xor         eax,ebp  
009E1010  mov         dword ptr [ebp-4],eax  
009E1013  push        ebx  
009E1014  push        esi  
009E1015  push        edi  
    //B b;
    C c;
009E1016  push        5DCh  
009E101B  lea         eax,[c]  
009E1021  push        0  
009E1023  push        eax  
009E1024  call        _memset (09E1BCAh)  
    c.a_.dummy();
009E1029  mov         edi,dword ptr [__imp__rand (09E20B4h)]  
    //B b;
    C c;
009E102F  add         esp,0Ch  
    c.a_.dummy();
009E1032  xor         esi,esi  
009E1034  call        edi  
009E1036  mov         byte ptr c[esi],al  
009E103D  inc         esi  
009E103E  cmp         esi,5DCh  
009E1044  jb          main+34h (09E1034h)  
    return c.a_.dummy2();
009E1046  xor         esi,esi  
009E1048  xor         edx,edx  
009E104A  xor         edi,edi  
009E104C  xor         ebx,ebx  
    return c.a_.dummy2();
009E104E  xchg        ax,ax  
009E1050  movsx       eax,byte ptr c[edx]  
009E1058  movsx       ecx,byte ptr [ebp+edx-5DEh]  
009E1060  add         esi,eax  
009E1062  movsx       eax,byte ptr [ebp+edx-5DFh]  
009E106A  add         edi,ecx  
009E106C  add         edx,3  
009E106F  add         ebx,eax  
009E1071  cmp         edx,5DCh  
009E1077  jb          main+50h (09E1050h)  
}
009E1079  mov         ecx,dword ptr [ebp-4]  
009E107C  lea         eax,[edi+ebx]  
009E107F  pop         edi  
009E1080  add         eax,esi  
009E1082  xor         ecx,ebp  
009E1084  pop         esi  
009E1085  pop         ebx  
009E1086  call        __security_check_cookie (09E108Fh)  
009E108B  mov         esp,ebp  
009E108D  pop         ebp  
009E108E  ret  

There is still a memset(addr, 0, 1500)!

Update: It seems there is a bug in visual studio 2017 std::allocate_shared. The code try to perfect-forwarding construct a std::_Align_type which does not have constructor, and so do value-initialize the std::_Align_type, that is memset.

alpha
  • 1,228
  • 1
  • 11
  • 26
  • 2
    I see a flaw with your testing: You reserve space in the vector, but you do it *after* `make_shared` test meaning that test will include the vectors work too. *And* you add quite a lot more elements than you reserve space for, meaning the vector have have to reallocate its contents multiple times. Clear and reserve/resize first thing you do in the test functions (before you get the start time) – Some programmer dude Jul 13 '17 at 06:06
  • 1
    I assume you are testing a optimized/release build and not a debug build - right? – Jesper Juhl Jul 13 '17 at 06:06
  • In order for your results to be relevant you should mix and randomize the sequence of you tests. In such fragile profiling cases it might be a crucial thing that will impact the results. Also, timing is not the best way to determine the performance. You should rather rely on some profiler, like valgrind for instance. Time of execution can be easily influenced by random events. – K. Kirsz Jul 13 '17 at 06:07
  • Very thanks for packets.reserve issue. I just fix the bug and test, the result is same. – alpha Jul 13 '17 at 06:09
  • Your Pool tried to lock the mutex recursively when calling reserve from within pop. – MikeMB Jul 13 '17 at 06:13
  • Shared_ptr_with_new doesn't two initialize the data in the packet. The others do. – MikeMB Jul 13 '17 at 06:16
  • @MikeMB Very Thanks. That's a bug. The pop will never enter the if (datas_.empty()) in test. I will fix it. – alpha Jul 13 '17 at 06:18
  • @Someprogrammerdude I call packets.clear() in every test. – alpha Jul 13 '17 at 06:21
  • @alpha Ah sorry didn't see that. However, you should probably do it *after* you get the end-time. – Some programmer dude Jul 13 '17 at 06:25
  • @Someprogrammerdude I call the packets.clear() before the "auto end = ... " to include the free time. So the last result is the time of create 1000000 shared objects and destroy them. – alpha Jul 13 '17 at 06:29
  • Okay. Then a last thing: With all but the pool test, the allocation of the `Packet` data is included in the test. With the pool you allocate that data in the call to `BlockPool::reserve`. – Some programmer dude Jul 13 '17 at 06:33
  • @Someprogrammerdude test_allocate_shared() also use the BlockPool. As my understand, the test_allocate_shared should faster than the test_shared_ptr_with_pool. – alpha Jul 13 '17 at 06:37
  • 1
    With `test_allocate_shared` there is some overhead due to the `allocate_shared` call and the use of an allocator, you use the pool *indirectly*. In `test_shared_ptr_with_pool` you use the pool *directly*. This should account for the difference between those two tests. – Some programmer dude Jul 13 '17 at 06:42
  • Just as a data point: In my tests, make_shared becomes better for smaller packets – MikeMB Jul 13 '17 at 07:27
  • @MikeMB Yes,new(small_object) is much faster than new(normal_object). In the test code, the sizeof(Packet) is 1000, and new Packet is much slower than new small object which hold the shared_ptr refcount and deleter. I think that is why the test_shared_ptr_with_new is not that slow. But I still not understand why make_shared is that slow. – alpha Jul 13 '17 at 07:36
  • @alpha: I was talking about objects with the same size for both, the new and the make_shared case. – MikeMB Jul 13 '17 at 08:23
  • @MikeMB I think I understand what you said: You change the Packet::data_[1000] to Packet::data_[10]. – alpha Jul 13 '17 at 08:27
  • @alpha: exactly. – MikeMB Jul 13 '17 at 08:41
  • 2
    This [tag:performance] question fails to list compilation flags. Almost every single performance question in C++ gets asked to list compilation flags. Half of them are actually debug builds. I have no idea if this is a debug build, and hence a waste of time to consider, or not. Edit this fact into the question itself. Feel free to ping me here as well. – Yakk - Adam Nevraumont Jul 13 '17 at 13:50
  • @Yakk I test it in vs2017+win10 with release build. The release build setting is default(/O2). I also test it in gcc4.8.5+centos6.2 with g++ -std=c++11 -O3. – alpha Jul 14 '17 at 00:57
  • @MikeMB I just did some dig and add some content. Now the old question has answer, but produce an new question. – alpha Jul 14 '17 at 04:28
  • Possible duplicate of [Does std::make\_shared perform value initialization (GCC and clang disagree)?](https://stackoverflow.com/questions/32783394/does-stdmake-shared-perform-value-initialization-gcc-and-clang-disagree) – rustyx Jul 14 '17 at 10:44

1 Answers1

3

After reading why c++ use memset(addr,0,sizeof(T)) to construct a object? Standard or compiler bug? and Default Initialization Versus Zero Initialization, now I understand why there has a memset.

It is because that the allocate_shared implement in vs2017 use a type _Align_type, and this type does not have constructor. When allocate_shared try to value-initialize the _Align_type, it call memset.

It seems a bug in vs2017.

Before the bug fix, I think maybe no good idea to workaround it.

Update:

I posted a bug report to MS, and they have confirm it.

Update: This bug still exist in vs2017 update 3.

alpha
  • 1,228
  • 1
  • 11
  • 26