When I copy blocks (16 MiB, page-aligned) with memcpy(), I need more than one thread to push a machine to its limits. Why is that so? I would expect the interface to the external memory to be the slowest part of the equation, so that should be the limit even with one thread.
Test program:
#include <string.h>
#include <chrono>
#include <thread>
#include <string>
#include <vector>
#include <stdexcept>
#include <iostream>
#include <sys/mman.h>
/**
 * Owner of one anonymous, private, read/write memory mapping created by mmap().
 *
 * A default-constructed block is empty (size 0, null buffer) until init() is
 * called. The mapping is released with munmap() in the destructor. Blocks are
 * movable but not copyable, because two objects must never unmap the same
 * region.
 */
class block {
    size_t blockSize = 0;     // length of the mapping in bytes; 0 while empty
    void *buffer = nullptr;   // start of the mapping; nullptr while empty

    // Unmaps the current mapping (if any) and returns to the empty state.
    void release() noexcept {
        if (buffer != nullptr) {
            munmap(buffer, blockSize);
        }
        buffer = nullptr;
        blockSize = 0;
    }
public:
    /// Creates an empty block that owns no mapping.
    block() = default;

    /// Creates a block of aSize bytes; throws std::runtime_error on failure.
    block(size_t aSize) {init(aSize);}

    block(const block&) = delete;
    block& operator=(const block&) = delete;

    /// Takes over the mapping of `other`, leaving `other` empty.
    block(block&& other) noexcept
        : blockSize(other.blockSize), buffer(other.buffer) {
        other.blockSize = 0;
        other.buffer = nullptr;
    }

    /// Releases the current mapping, then takes over the mapping of `other`.
    block& operator=(block&& other) noexcept {
        if (this != &other) {
            release();
            blockSize = other.blockSize;
            buffer = other.buffer;
            other.blockSize = 0;
            other.buffer = nullptr;
        }
        return *this;
    }

    /**
     * Maps aSize bytes of anonymous memory and makes this block own them.
     * Any mapping previously owned by this block is released first.
     *
     * @param aSize number of bytes to map
     * @throws std::runtime_error if mmap() fails; the block then keeps its
     *         previous state
     */
    void init(size_t aSize) {
        void *mapping = mmap(nullptr, aSize,
                             PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        // mmap() signals failure with MAP_FAILED, not with a null pointer.
        if (mapping == MAP_FAILED) {
            throw std::runtime_error("can't allocate block");
        }
        release();
        buffer = mapping;
        blockSize = aSize;
    }

    ~block() {
        release();
    }

    /// Pointer to the first byte of the mapping (nullptr while empty).
    char *begin() const {
        return static_cast<char*>(buffer);
    }

    /// Pointer one past the last byte of the mapping.
    char *end() const {
        return static_cast<char*>(buffer) + blockSize;
    }

    /// Size of the mapping in bytes.
    size_t size() const {
        return blockSize;
    }
};
void copyThread(const block& src,
const std::vector<block>& dst) {
for (auto& b: dst) {
memcpy(b.begin(), src.begin(), src.size());
}
}
typedef std::chrono::system_clock clock_type;
int main(int argc, const char *argv[]) {
auto nThreads = argc > 1 ? std::stoul(argv[1]) : 1ul;
auto nBlocks = argc > 2 ? std::stoul(argv[2]) : 10ul;
auto blocksize = argc > 3 ? std::stoul(argv[3]) : 16*1024*1014ul;
block src(blocksize);
std::vector<std::vector<block>> dstBlocks;
for (unsigned long i=0; i<nThreads; i++) {
dstBlocks.emplace_back(nBlocks);
}
for (auto& v: dstBlocks) {
for (auto& b: v) {
b.init(blocksize);
}
}
std::cerr << "blocks allocated\n";
std::vector<std::thread> workers(nThreads);
int i = 0;
auto before = clock_type::now();
for (auto& worker: workers) {
worker = std::thread(copyThread, std::ref(src), std::ref(dstBlocks.at(i++)));
}
for (auto& worker: workers) {
worker.join();
}
auto deltaT = std::chrono::duration_cast<std::chrono::duration<double>>(clock_type::now() - before).count();
auto total = blocksize * nBlocks * nThreads;
std::cout << nThreads
<< " " << blocksize
<< " " << nBlocks
<< " " << total
<< " " << deltaT
<< " " << total/deltaT
<< " " << total/deltaT/(1024*1014*1014)
<< "\n";
return 0;
}
gives in a loop the following output:
for n in $(seq 32); do ./memcpy $n 2>/dev/null; done
1 16613376 10 166133760 0.0204955 8.10587e+09 7.69881
2 16613376 10 332267520 0.021766 1.52654e+10 14.4988
3 16613376 10 498401280 0.0227502 2.19075e+10 20.8074
4 16613376 10 664535040 0.0228769 2.90483e+10 27.5896
5 16613376 10 830668800 0.0238712 3.47979e+10 33.0504
6 16613376 10 996802560 0.025281 3.94289e+10 37.4489
7 16613376 10 1162936320 0.0266224 4.36827e+10 41.489
8 16613376 10 1329070080 0.0263878 5.03668e+10 47.8375
9 16613376 10 1495203840 0.0298019 5.01715e+10 47.652
10 16613376 10 1661337600 0.0312424 5.31757e+10 50.5053
11 16613376 10 1827471360 0.0335261 5.45089e+10 51.7716
12 16613376 10 1993605120 0.035536 5.6101e+10 53.2838
13 16613376 10 2159738880 0.0414056 5.21605e+10 49.5411
14 16613376 10 2325872640 0.0500519 4.64692e+10 44.1357
15 16613376 10 2492006400 0.0507584 4.90954e+10 46.63
16 16613376 10 2658140160 0.0529706 5.01814e+10 47.6614
17 16613376 10 2824273920 0.0538962 5.24021e+10 49.7706
18 16613376 10 2990407680 0.059596 5.0178e+10 47.6582
19 16613376 10 3156541440 0.0571108 5.52705e+10 52.4949
20 16613376 10 3322675200 0.0616152 5.39262e+10 51.2182
21 16613376 10 3488808960 0.0643704 5.4199e+10 51.4772
22 16613376 10 3654942720 0.0645592 5.66138e+10 53.7708
23 16613376 10 3821076480 0.0678021 5.63563e+10 53.5263
24 16613376 10 3987210240 0.0707682 5.63418e+10 53.5125
25 16613376 10 4153344000 0.0775049 5.35882e+10 50.8971
26 16613376 10 4319477760 0.0866202 4.98669e+10 47.3627
27 16613376 10 4485611520 0.0882388 5.08349e+10 48.2821
28 16613376 10 4651745280 0.0900769 5.1642e+10 49.0486
29 16613376 10 4817879040 0.0928807 5.18717e+10 49.2668
30 16613376 10 4984012800 0.0931539 5.3503e+10 50.8162
31 16613376 10 5150146560 0.0958964 5.37053e+10 51.0084
32 16613376 10 5316280320 0.100783 5.27498e+10 50.1008
So full speed (last column) is reached only with ~10 threads. This is on an AMD EPYC 7F72 24-Core Processor.