CUDA multithreading: __threadfence not preventing multiple threads from accessing resource

Question

So, I started CUDA programming recently.

and I tried to make a program that start more than one thread, go into an array of global memory and the order in which it started.

However, part of the exclusive control does not seem to be working well.
I want to prevent multiple threads to simultaneously access the array Log.

now, the array Log is like this.

Log[0]=160
Log[1]=128
Log[2]=256
Log[3]=96
Log[4]=0
Log[5]=0
Log[6]=0
...etc

I want to prevent multiple threads simultaneously access to exclusive control of the memory array Log.

Is it doing wrong how to use "__threadfence ()" of?
I use CUDA5.5 and compute capability is 2.1.
Please advice someone.

Following is the source code.

#include <cuda_runtime.h>
#include <stdio.h>  
#include <cuda.h> 
#include <cstdio>
#include <thrust/device_ptr.h>
#define N 256

//Prototype declaration
__global__ void CudaThreadfenceTest(int *Log_d);

int main(){
    int i,j;
    int Log[N];
    int *Log_d;
    //
    for(j=0;j<N;j++){
        Log[j]=0;
    }
    // GPU memory hold
    cudaMalloc((void**)&Log_d, N*sizeof(int));
    // host→device
    cudaMemcpy(Log_d,Log,N*sizeof(int),cudaMemcpyHostToDevice);
    /*****************
    *block & thread
    ******************/
    dim3 blocks(1,1,1);
    dim3 threads(256,1,1);

    //run kernel
    CudaThreadfenceTest<<<blocks,threads>>>(Log_d);
    cudaDeviceSynchronize();

    cudaMemcpy(Log,Log_d,N*sizeof(int),cudaMemcpyDeviceToHost); 
    for(j=0;j<N;j++){
        printf("Log[ %d ]=%d \n",j,Log[j]);
    }
    getchar();
    cudaFree(Log_d);
    return 0;
}


/*************************
/* kernel
/*************************/
__global__ void CudaThreadfenceTest(int *Log_d){

    printf("threadIdx.x = %d , \n",threadIdx.x);
    __threadfence();
    //for Log
    for(int j=0;j<N;j++){
        if(Log_d[j]==0){
            Log_d[j]=threadIdx.x + 1;
            break;
        }
    }
}

score 1 · Answer 1 · edited May 23 '17 at 12:28

threadfence() by itself cannot be used to protect access to a memory region. It does not "fence threads" it actually has to do with updating memory. The documentation is here

What you want is either atomics (your example can be made to work with atomicCAS, for example) or a critical section.

Here is your example re-worked using atomics:

Program:

$ cat t258.cu
#include <stdio.h>
#include <cstdio>
#define N 256

//Prototype declaration
__global__ void atomicsTest(int *);

int main(){
    int j;
    int Log[N];
    int *Log_d;
    //
    for(j=0;j<N;j++){
        Log[j]=0;
    }
    // GPU memory hold
    cudaMalloc((void**)&Log_d, N*sizeof(int));
    // host.device
    cudaMemcpy(Log_d,Log,N*sizeof(int),cudaMemcpyHostToDevice);
    /*****************
    *block & thread
    ******************/
    dim3 blocks(1,1,1);
    dim3 threads(256,1,1);

    //run kernel
    atomicsTest<<<blocks,threads>>>(Log_d);
    cudaMemcpy(Log,Log_d,N*sizeof(int),cudaMemcpyDeviceToHost);
    for(j=0;j<N;j++){
        printf("Log[ %3d ]=%3d ",j,Log[j]);
        if (!((j+1)%4)) printf("\n");
    }
    getchar();
    cudaFree(Log_d);
    return 0;
}


__global__ void atomicsTest(int *Log_d){

     // printf("threadIdx.x = %d , \n",threadIdx.x);
    for (int j = 0; j < N; j++)
      if(atomicCAS(Log_d+j, 0, threadIdx.x + 1)==0)    break;
}

Compile with:

$ nvcc -arch=sm_20 -o t258 t258.cu

Output:

$ ./t258
Log[   0 ]=  1 Log[   1 ]=161 Log[   2 ]=162 Log[   3 ]=163
Log[   4 ]=164 Log[   5 ]=165 Log[   6 ]=166 Log[   7 ]=167
Log[   8 ]=168 Log[   9 ]=169 Log[  10 ]=  2 Log[  11 ]=  3
Log[  12 ]=  4 Log[  13 ]=  5 Log[  14 ]=170 Log[  15 ]=171
Log[  16 ]=172 Log[  17 ]=  6 Log[  18 ]=173 Log[  19 ]=174
Log[  20 ]=175 Log[  21 ]=176 Log[  22 ]=177 Log[  23 ]=178
Log[  24 ]=179 Log[  25 ]=180 Log[  26 ]=181 Log[  27 ]=182
Log[  28 ]=183 Log[  29 ]=184 Log[  30 ]=185 Log[  31 ]=186
Log[  32 ]=187 Log[  33 ]=188 Log[  34 ]=189 Log[  35 ]=190
Log[  36 ]=191 Log[  37 ]=192 Log[  38 ]=  7 Log[  39 ]=  8
Log[  40 ]=  9 Log[  41 ]= 10 Log[  42 ]= 11 Log[  43 ]= 12
Log[  44 ]= 13 Log[  45 ]= 14 Log[  46 ]= 15 Log[  47 ]= 16
Log[  48 ]= 17 Log[  49 ]= 18 Log[  50 ]= 19 Log[  51 ]= 20
Log[  52 ]= 21 Log[  53 ]= 22 Log[  54 ]= 23 Log[  55 ]= 24
Log[  56 ]= 25 Log[  57 ]= 26 Log[  58 ]= 27 Log[  59 ]= 28
Log[  60 ]= 29 Log[  61 ]= 30 Log[  62 ]= 31 Log[  63 ]= 32
Log[  64 ]= 33 Log[  65 ]= 34 Log[  66 ]= 35 Log[  67 ]= 36
Log[  68 ]= 37 Log[  69 ]= 38 Log[  70 ]= 39 Log[  71 ]= 40
Log[  72 ]= 41 Log[  73 ]= 42 Log[  74 ]= 43 Log[  75 ]= 44
Log[  76 ]= 45 Log[  77 ]= 46 Log[  78 ]= 47 Log[  79 ]= 48
Log[  80 ]= 49 Log[  81 ]= 50 Log[  82 ]= 51 Log[  83 ]= 52
Log[  84 ]= 53 Log[  85 ]= 54 Log[  86 ]= 55 Log[  87 ]= 56
Log[  88 ]= 57 Log[  89 ]= 58 Log[  90 ]= 59 Log[  91 ]= 60
Log[  92 ]= 61 Log[  93 ]= 62 Log[  94 ]= 63 Log[  95 ]= 64
Log[  96 ]=225 Log[  97 ]=226 Log[  98 ]=227 Log[  99 ]=228
Log[ 100 ]=229 Log[ 101 ]=230 Log[ 102 ]=231 Log[ 103 ]=232
Log[ 104 ]=233 Log[ 105 ]=234 Log[ 106 ]=235 Log[ 107 ]=236
Log[ 108 ]=237 Log[ 109 ]=238 Log[ 110 ]=239 Log[ 111 ]=240
Log[ 112 ]=241 Log[ 113 ]=242 Log[ 114 ]=243 Log[ 115 ]=244
Log[ 116 ]=245 Log[ 117 ]=246 Log[ 118 ]=247 Log[ 119 ]=248
Log[ 120 ]=249 Log[ 121 ]=250 Log[ 122 ]=251 Log[ 123 ]=252
Log[ 124 ]=253 Log[ 125 ]=254 Log[ 126 ]=255 Log[ 127 ]=256
Log[ 128 ]= 97 Log[ 129 ]= 98 Log[ 130 ]= 99 Log[ 131 ]=100
Log[ 132 ]=101 Log[ 133 ]=102 Log[ 134 ]=103 Log[ 135 ]=104
Log[ 136 ]=105 Log[ 137 ]=106 Log[ 138 ]=107 Log[ 139 ]=108
Log[ 140 ]=109 Log[ 141 ]=110 Log[ 142 ]=111 Log[ 143 ]=112
Log[ 144 ]=113 Log[ 145 ]=114 Log[ 146 ]=115 Log[ 147 ]=116
Log[ 148 ]=117 Log[ 149 ]=118 Log[ 150 ]=119 Log[ 151 ]=120
Log[ 152 ]=121 Log[ 153 ]=122 Log[ 154 ]=123 Log[ 155 ]=124
Log[ 156 ]=125 Log[ 157 ]=126 Log[ 158 ]=127 Log[ 159 ]=128
Log[ 160 ]=129 Log[ 161 ]=130 Log[ 162 ]=131 Log[ 163 ]=132
Log[ 164 ]=133 Log[ 165 ]=134 Log[ 166 ]=135 Log[ 167 ]=136
Log[ 168 ]=137 Log[ 169 ]=138 Log[ 170 ]=139 Log[ 171 ]=140
Log[ 172 ]=141 Log[ 173 ]=142 Log[ 174 ]=143 Log[ 175 ]=144
Log[ 176 ]=145 Log[ 177 ]=146 Log[ 178 ]=147 Log[ 179 ]=148
Log[ 180 ]=149 Log[ 181 ]=150 Log[ 182 ]=151 Log[ 183 ]=152
Log[ 184 ]=153 Log[ 185 ]=154 Log[ 186 ]=155 Log[ 187 ]=156
Log[ 188 ]=157 Log[ 189 ]=158 Log[ 190 ]=159 Log[ 191 ]=160
Log[ 192 ]= 65 Log[ 193 ]=193 Log[ 194 ]=194 Log[ 195 ]=195
Log[ 196 ]=196 Log[ 197 ]=197 Log[ 198 ]=198 Log[ 199 ]=199
Log[ 200 ]=200 Log[ 201 ]=201 Log[ 202 ]=202 Log[ 203 ]=203
Log[ 204 ]=204 Log[ 205 ]=205 Log[ 206 ]=206 Log[ 207 ]=207
Log[ 208 ]=208 Log[ 209 ]=209 Log[ 210 ]=210 Log[ 211 ]=211
Log[ 212 ]=212 Log[ 213 ]=213 Log[ 214 ]=214 Log[ 215 ]=215
Log[ 216 ]=216 Log[ 217 ]=217 Log[ 218 ]=218 Log[ 219 ]=219
Log[ 220 ]=220 Log[ 221 ]=221 Log[ 222 ]=222 Log[ 223 ]=223
Log[ 224 ]=224 Log[ 225 ]= 66 Log[ 226 ]= 67 Log[ 227 ]= 68
Log[ 228 ]= 69 Log[ 229 ]= 70 Log[ 230 ]= 71 Log[ 231 ]= 72
Log[ 232 ]= 73 Log[ 233 ]= 74 Log[ 234 ]= 75 Log[ 235 ]= 76
Log[ 236 ]= 77 Log[ 237 ]= 78 Log[ 238 ]= 79 Log[ 239 ]= 80
Log[ 240 ]= 81 Log[ 241 ]= 82 Log[ 242 ]= 83 Log[ 243 ]= 84
Log[ 244 ]= 85 Log[ 245 ]= 86 Log[ 246 ]= 87 Log[ 247 ]= 88
Log[ 248 ]= 89 Log[ 249 ]= 90 Log[ 250 ]= 91 Log[ 251 ]= 92
Log[ 252 ]= 93 Log[ 253 ]= 94 Log[ 254 ]= 95 Log[ 255 ]= 96

$

Thank you Robert Crovella! , but I have delayed , Got similar results where you run as say Robert. I appreciate you! — alu21, Oct 31 '13 at 14:45

CUDA multithreading: __threadfence not preventing multiple threads from accessing resource

1 Answers1