I observe cudaFree operation takes unacceptably long time to operate it. To verify it, I wrote a simple micro benchmark code to test cudaFree overhead. It shows the similar trend.
first step: cudaMallocManaged
It doesn't take that long time. Actually it is less than 1ms just only for cudaMallocManaged.
second step: init
To initialize malloced memory region, init function is called. And it also doesn't take less than 1ms.
third step: cudaFree
Now, the overhead appears. It takes a lot of time. One more things is bigger gets the memory size, Longer it takes. (beautifully linearly)
Question is "Why does cudaFree have such a huge overhead? Does it fill whole memory region with zero for security issue? Or any other critical path it walks through?"
Here is the code and the measured result. Thank you in advance :) !!!
10 int getMilliCount(){
11 timeb tb;
12 ftime(&tb);
13 int nCount = tb.millitm + (tb.time & 0xfffff) * 1000;
14 return nCount;
15 }
16
17 int getTimeDiff(int baseTime){
18 int diff = getMilliCount() - baseTime;
19 return diff;
20 }
21
22 __global__ void init(int* x, size_t bytes_){
23 int num_ = bytes_/sizeof(int);
24 for (int i=0; i<num_; i++){
25 x[i] = i;
26 }
27 }
28
29 int main(){
30 printf("sizeof(size_t): %zu\n", sizeof(size_t));
31 printf("sizeof(unsigned int): %zu\n", sizeof(unsigned int));
32 printf("sizeof(int): %zu\n", sizeof(int));
33 printf("sizeof(long): %zu\n", sizeof(long));
34
35 std::ofstream myfile;
36 myfile.open("output3.csv");
37 myfile<<"operation, num_bytes, start, end, duration\n";
38 int baseTime = getMilliCount();
39 int* dptr;
40 int ts1 = 0;
41 int ts2 = 0;
42 size_t KB = 1024; // start from 1KB
43 int num_trial_ = 1;
44 for (int j=10; j<25; j++){
45 size_t num_bytes_ = KB<<j;
46 for (int i=0; i<num_trial_; i++){
47 // measuring cudaMallocManaged
48 ts1 = getTimeDiff(baseTime);
>> 49 cudaMallocManaged((void**)&dptr, num_bytes_);
50 ts2 = getTimeDiff(baseTime);
51 myfile<<"cudaMallocManaged, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
52 //printf("cudaMallocManaged, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
53 printf("cudaMallocManaged, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
54
55 // measuring initialization
56 ts1 = getTimeDiff(baseTime);
>> 57 init<<<1,1>>>(dptr, num_bytes_);
58 ts2 = getTimeDiff(baseTime);
59 myfile<<"initialization, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
60 //printf("init, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
61 printf("init, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
62
63 // measuring cudaFree
64 ts1 = getTimeDiff(baseTime);
>> 65 cudaFree(dptr);
66 ts2 = getTimeDiff(baseTime);
67 myfile<<"cudaFree, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
68 //printf("cudaFree, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
69 printf("cudaFree, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
70 sleep(1);
71 printf("\n");
72 }
73 }
74 myfile.close();
75 return 1;
76 }
Results
cudaMallocManaged, memory_size:1MB, duration:360
init, memory_size:1MB, duration:0
cudaFree, memory_size:1MB, **duration:2**
cudaMallocManaged, memory_size:2MB, duration:1
init, memory_size:2MB, duration:0
cudaFree, memory_size:2MB, **duration:4**
cudaMallocManaged, memory_size:4MB, duration:0
init, memory_size:4MB, duration:0
cudaFree, memory_size:4MB, **duration:9**
cudaMallocManaged, memory_size:8MB, duration:0
init, memory_size:8MB, duration:0
cudaFree, memory_size:8MB, **duration:18**
cudaMallocManaged, memory_size:16MB, duration:0
init, memory_size:16MB, duration:0
cudaFree, memory_size:16MB, **duration:34**
cudaMallocManaged, memory_size:32MB, duration:0
init, memory_size:32MB, duration:0
cudaFree, memory_size:32MB, **duration:69**
cudaMallocManaged, memory_size:64MB, duration:0
init, memory_size:64MB, duration:0
cudaFree, memory_size:64MB, **duration:132**
cudaMallocManaged, memory_size:128MB, duration:0
init, memory_size:128MB, duration:0
cudaFree, memory_size:128MB, **duration:241**
cudaMallocManaged, memory_size:256MB, duration:0
init, memory_size:256MB, duration:0
cudaFree, memory_size:256MB, **duration:476**
cudaMallocManaged, memory_size:512MB, duration:0
init, memory_size:512MB, duration:0
cudaFree, memory_size:512MB, **duration:984**
cudaMallocManaged, memory_size:1024MB, duration:0
init, memory_size:1024MB, duration:0
cudaFree, memory_size:1024MB, **duration:1910**
cudaMallocManaged, memory_size:2048MB, duration:0
init, memory_size:2048MB, duration:1
cudaFree, memory_size:2048MB, **duration:3830**
cudaMallocManaged, memory_size:4096MB, duration:0
init, memory_size:4096MB, duration:0
cudaFree, memory_size:4096MB, **duration:7715**
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, **duration:0**
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, **duration:0**
- weird thing is that 8192MB and 16384MB shows less than 0ms not only for cudaManagedMalloc and init but also for cudaFree....
Please enlighten me